v1.0.3 (#14)

Browse files

- v1.0.3 (d09e9ea0a1c4e9113327bf844a847fbf9fb22a0c)

Files changed (11) hide show

config.json +3 -10
configuration_internlm.py +26 -8
generation_config.json +2 -1
modeling_internlm.py +90 -166
pytorch_model-00001-of-00005.bin → pytorch_model-00001-of-00006.bin +2 -2
pytorch_model-00002-of-00005.bin → pytorch_model-00002-of-00006.bin +2 -2
pytorch_model-00003-of-00005.bin → pytorch_model-00003-of-00006.bin +2 -2
pytorch_model-00004-of-00005.bin → pytorch_model-00004-of-00006.bin +2 -2
pytorch_model-00005-of-00006.bin +3 -0
pytorch_model-00005-of-00005.bin → pytorch_model-00006-of-00006.bin +1 -1
pytorch_model.bin.index.json +543 -543

config.json CHANGED Viewed

@@ -20,17 +20,10 @@
   "num_hidden_layers": 60,
   "num_key_value_heads": 40,
   "pad_token_id": 2,
-  "pretraining_tp": 1,
   "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
-  "rope_theta": 10000.0,
   "tie_word_embeddings": false,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.33.1",
   "use_cache": true,
-  "vocab_size": 103168,
-  "rotary": {
-    "base": 10000,
-    "type": "dynamic"
-  }
 }

   "num_hidden_layers": 60,
   "num_key_value_heads": 40,
   "pad_token_id": 2,
   "rms_norm_eps": 1e-06,
   "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.33.2",
   "use_cache": true,
+  "vocab_size": 103168
 }

configuration_internlm.py CHANGED Viewed

@@ -19,8 +19,9 @@
 # limitations under the License.
 """ InternLM model configuration"""
-from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
@@ -29,9 +30,9 @@ INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class InternLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
-    an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -49,6 +50,19 @@ class InternLMConfig(PretrainedConfig):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the Transformer encoder.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
@@ -80,13 +94,14 @@ class InternLMConfig(PretrainedConfig):
     model_type = "internlm"
     _auto_class = "AutoConfig"
-    def __init__(  # pylint: disable=W0102
         self,
         vocab_size=103168,
         hidden_size=4096,
         intermediate_size=11008,
         num_hidden_layers=32,
         num_attention_heads=32,
         hidden_act="silu",
         max_position_embeddings=2048,
         initializer_range=0.02,
@@ -97,7 +112,6 @@ class InternLMConfig(PretrainedConfig):
         eos_token_id=2,
         tie_word_embeddings=False,
         bias=True,
-        rotary={"base": 10000, "type": "dynamic"},  # pylint: disable=W0102
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -106,16 +120,20 @@ class InternLMConfig(PretrainedConfig):
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.bias = bias
-        self.rotary = rotary
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
-        )

 # limitations under the License.
 """ InternLM model configuration"""
 from transformers.utils import logging
+from transformers.configuration_utils import PretrainedConfig
 logger = logging.get_logger(__name__)
 class InternLMConfig(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate an InternLM
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the InternLM-7B.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 32):
             Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by mean-pooling all the original heads within that group. For more details, check out [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        pretraining_tp (`int`, *optional*, defaults to `1`):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
         max_position_embeddings (`int`, *optional*, defaults to 2048):
     model_type = "internlm"
     _auto_class = "AutoConfig"
+    def __init__(
         self,
         vocab_size=103168,
         hidden_size=4096,
         intermediate_size=11008,
         num_hidden_layers=32,
         num_attention_heads=32,
+        num_key_value_heads=None,
         hidden_act="silu",
         max_position_embeddings=2048,
         initializer_range=0.02,
         eos_token_id=2,
         tie_word_embeddings=False,
         bias=True,
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.bias = bias
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
+        )

generation_config.json CHANGED Viewed

@@ -2,5 +2,6 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 2,
-  "transformers_version": "4.33.1"
 }

   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 2,
+  "pad_token_id": 2,
+  "transformers_version": "4.33.2"
 }

modeling_internlm.py CHANGED Viewed

@@ -19,36 +19,26 @@
 # limitations under the License.
 """ PyTorch InternLM model."""
 import math
-import queue
-import threading
 from typing import List, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
-from transformers.generation.streamers import BaseStreamer
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-    replace_return_docstrings,
-)
 from .configuration_internlm import InternLMConfig
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLMConfig"
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
@@ -81,10 +71,17 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 class InternLMRMSNorm(nn.Module):
-    """RMSNorm implemention."""
     def __init__(self, hidden_size, eps=1e-6):
         """
         InternLMRMSNorm is equivalent to T5LayerNorm
@@ -105,14 +102,6 @@ class InternLMRMSNorm(nn.Module):
 class InternLMRotaryEmbedding(torch.nn.Module):
-    """Implement InternLM's rotary embedding.
-    Args:
-        dim (int): Characteristic dimension of each self-attentional head.
-        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
-        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
-        device (Any, optional): Running device. Defaults to None.
-    """
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
@@ -144,66 +133,6 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         )
-class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
-    """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
-    Args:
-        dim (int): Characteristic dimension of each self-attentional head.
-        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
-        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
-        device (Any, optional): Running device. Defaults to None.
-        scaling_factor (float, optional): NTK method extrapolation coefficient. Defaults to 1.0.
-    """
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
-        super().__init__()
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.register_buffer("inv_freq", inv_freq)
-        self.dim = dim
-        self.base = base
-        self.scaling_factor = scaling_factor
-        # Build here to make `torch.jit.trace` work.
-        self.max_position_embeddings = max_position_embeddings
-        self.max_seq_len_cached = max_position_embeddings
-        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
-    def _update_cached(self, x, seq_len=None):
-        self.max_seq_len_cached = max(seq_len, self.max_position_embeddings)
-        if seq_len > self.max_position_embeddings:
-            base = self.base * (
-                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
-            ) ** (self.dim / (self.dim - 2))
-            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
-        else:
-            inv_freq = self.inv_freq
-        t = torch.arange(self.max_seq_len_cached, device=inv_freq.device, dtype=inv_freq.dtype)
-        freqs = torch.einsum("i,j->ij", t, inv_freq)
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
-        if seq_len <= self.max_position_embeddings:
-            # Reset the tables if the sequence length has changed,
-            if self.max_seq_len_cached > self.max_position_embeddings:
-                self._update_cached(x, seq_len)
-        else:
-            self._update_cached(x, seq_len)
-        return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-        )
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -215,18 +144,10 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-    cos = cos.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
-    sin = sin.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
-    if q.size(2) == 1:
-        q_embed = (q * cos[:, :, -1, :]) + (rotate_half(q) * sin[:, :, -1, :])
-    else:
-        q_embed = (q * cos) + (rotate_half(q) * sin)
-    if k.size(2) == 1:
-        k_embed = (k * cos[:, :, -1, :]) + (rotate_half(k) * sin[:, :, -1, :])
-    else:
-        k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
@@ -256,6 +177,8 @@ class InternLMAttention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.max_position_embeddings = config.max_position_embeddings
         if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -264,28 +187,10 @@ class InternLMAttention(nn.Module):
                 f" and `num_heads`: {self.num_heads})."
             )
         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
-        self.rotary_emb = self._init_rope()
-    def _init_rope(self):
-        if self.config.rotary["type"] == "origin":
-            self.rotary_emb = InternLMRotaryEmbedding(
-                self.head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.config.rotary["base"],
-            )
-        elif self.config.rotary["type"] == "dynamic":
-            self.rotary_emb = InternLMDynamicNTKScalingRotaryEmbedding(
-                self.head_dim,
-                max_position_embeddings=self.max_position_embeddings,
-                base=self.config.rotary["base"],
-                scaling_factor=self.config.rotary.get("scaling_factor", 1.0),
-            )
-        else:
-            raise ValueError("Currently we only support rotary embedding's type being one of ('origin', 'dynamic').")
-        return self.rotary_emb
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -302,20 +207,25 @@ class InternLMAttention(nn.Module):
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         if past_key_value is not None:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        # print(use_cache)
         past_key_value = (key_states, value_states) if use_cache else None
-        kv_seq_len = key_states.shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
@@ -426,9 +336,11 @@ INTERNLM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
     Parameters:
         config ([`InternLMConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -469,34 +381,44 @@ INTERNLM_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
             [What are attention masks?](../glossary#attention-mask)
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
             If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
             `past_key_values`).
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
-            when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
             `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
@@ -525,10 +447,10 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 class InternLMModel(InternLMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
     Args:
         config: InternLMConfig
     """
     _auto_class = "AutoModel"
     def __init__(self, config: InternLMConfig):
@@ -754,14 +676,20 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
         Returns:
         Example:
         ```python
         >>> from transformers import AutoTokenizer, InternLMForCausalLM
         >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
         >>> prompt = "Hey, are you consciours? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -851,56 +779,50 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []):
         prompt = ""
         for record in history:
             prompt += f"""<|User|>:{record[0]}<eoh>\n<|Bot|>:{record[1]}<eoa>\n"""
         prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
         return tokenizer([prompt], return_tensors="pt")
     @torch.no_grad()
-    def chat(
-        self,
-        tokenizer,
-        query: str,
-        history: List[Tuple[str, str]] = [],
-        streamer: Optional[BaseStreamer] = None,
-        max_new_tokens: int = 1024,
-        do_sample: bool = True,
-        temperature: float = 0.8,
-        top_p: float = 0.8,
-        **kwargs,
-    ):
         inputs = self.build_inputs(tokenizer, query, history)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
-        outputs = self.generate(
-            **inputs,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=do_sample,
-            temperature=temperature,
-            top_p=top_p,
-            **kwargs,
-        )
-        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
         response = tokenizer.decode(outputs, skip_special_tokens=True)
         response = response.split("<eoa>")[0]
         history = history + [(query, response)]
         return response, history
     @torch.no_grad()
-    def stream_chat(
-        self,
-        tokenizer,
-        query: str,
-        history: List[Tuple[str, str]] = [],
-        max_new_tokens: int = 1024,
-        do_sample: bool = True,
-        temperature: float = 0.8,
-        top_p: float = 0.8,
-        **kwargs,
-    ):
         """
         Return a generator in format: (response, history)
         Eg.
@@ -946,12 +868,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 tokenizer=tokenizer,
                 query=query,
                 streamer=ChatStreamer(tokenizer=tokenizer),
-                history=history,
                 max_new_tokens=max_new_tokens,
                 do_sample=do_sample,
                 temperature=temperature,
                 top_p=top_p,
-                **kwargs,
             )
         def consumer():
@@ -969,8 +891,10 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
 @add_start_docstrings(
     """
     The InternLM Model transformer with a sequence classification head on top (linear layer).
     [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the

 # limitations under the License.
 """ PyTorch InternLM model."""
 import math
 from typing import List, Optional, Tuple, Union
+import threading, queue
 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
+from transformers.generation.streamers import BaseStreamer
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_internlm import InternLMConfig
 logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLMConfig"
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape  # input must be 4-D for this unpacking to succeed
+    if n_rep == 1:  # nothing to repeat: the input tensor itself is returned unchanged
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # add a size-1 axis after the kv-head axis and expand it to n_rep
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold (kv_heads, n_rep) into one head axis; the n_rep copies of each kv head end up adjacent
 class InternLMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
         InternLMRMSNorm is equivalent to T5LayerNorm
 class InternLMRotaryEmbedding(torch.nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
         )
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = config.max_position_embeddings
         if (self.head_dim * self.num_heads) != self.hidden_size:
                 f" and `num_heads`: {self.num_heads})."
             )
         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
+        self.rotary_emb = InternLMRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
         if past_key_value is not None:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
         past_key_value = (key_states, value_states) if use_cache else None
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
     Parameters:
         config ([`InternLMConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
             [What are attention masks?](../glossary#attention-mask)
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
             If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
             `past_key_values`).
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
             [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
             `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
 class InternLMModel(InternLMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
     Args:
         config: InternLMConfig
     """
     _auto_class = "AutoModel"
     def __init__(self, config: InternLMConfig):
                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
         Returns:
         Example:
         ```python
         >>> from transformers import AutoTokenizer, InternLMForCausalLM
         >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []):
         """Assemble the chat prompt from prior turns plus the new query and tokenize it.

         Args:
             tokenizer: Callable invoked as ``tokenizer([prompt], return_tensors="pt")``.
             query: The new user message, rendered as the final ``<|User|>`` turn.
             history: Earlier turns; element 0 of each record is rendered as the user
                 text and element 1 as the bot reply.

         Returns:
             Whatever ``tokenizer`` returns for the single assembled prompt.
         """
         # `history` is only read here, so the shared default list is never mutated.
         segments = [f"""<|User|>:{turn[0]}<eoh>\n<|Bot|>:{turn[1]}<eoa>\n""" for turn in history]
         # The prompt ends with an open "<|Bot|>:" tag after the new query.
         segments.append(f"""<|User|>:{query}<eoh>\n<|Bot|>:""")
         return tokenizer(["".join(segments)], return_tensors="pt")
     @torch.no_grad()
     def chat(self,
              tokenizer,
              query: str,
              history: List[Tuple[str, str]] = [],
              streamer: Optional[BaseStreamer] = None,
              max_new_tokens: int = 1024,
              do_sample: bool = True,
              temperature: float = 0.8,
              top_p: float = 0.8,
              **kwargs):
         """Generate one reply to ``query`` and return it with the extended history.

         Args:
             tokenizer: Used via ``self.build_inputs`` to encode the prompt and via
                 ``tokenizer.decode`` to turn generated ids back into text.
             query: The new user message.
             history: Earlier ``(query, response)`` turns; it is not modified.
             streamer: Forwarded unchanged to ``self.generate``.
             max_new_tokens: Forwarded unchanged to ``self.generate``.
             do_sample: Forwarded unchanged to ``self.generate``.
             temperature: Forwarded unchanged to ``self.generate``.
             top_p: Forwarded unchanged to ``self.generate``.
             **kwargs: Extra keyword arguments forwarded to ``self.generate``.

         Returns:
             A ``(response, history)`` tuple where ``response`` is the decoded text
             before the first ``<eoa>`` marker and ``history`` is a new list equal
             to the input history plus ``(query, response)``.
         """
         encoded = self.build_inputs(tokenizer, query, history)
         # Keep only the tensor entries and move each one onto the model's device.
         model_inputs = {}
         for name, value in encoded.items():
             if torch.is_tensor(value):
                 model_inputs[name] = value.to(self.device)
         generated = self.generate(**model_inputs,
                                   streamer=streamer,
                                   max_new_tokens=max_new_tokens,
                                   do_sample=do_sample,
                                   temperature=temperature,
                                   top_p=top_p,
                                   **kwargs)
         first_sequence = generated[0].cpu().tolist()
         prompt_length = len(model_inputs["input_ids"][0])
         # Skip the first `prompt_length` ids; presumably `generate` returns the
         # prompt followed by the continuation -- confirm against the generate docs.
         reply_ids = first_sequence[prompt_length:]
         response = tokenizer.decode(reply_ids, skip_special_tokens=True)
         # Only the text before the first end-of-answer marker is kept.
         response = response.split("<eoa>")[0]
         # Concatenation builds a new list, leaving the caller's `history` untouched.
         return response, history + [(query, response)]
     @torch.no_grad()
+    def stream_chat(self,
+                    tokenizer,
+                    query: str,
+                    history: List[Tuple[str, str]] = [],
+                    max_new_tokens: int = 1024,
+                    do_sample: bool = True,
+                    temperature: float = 0.8,
+                    top_p: float = 0.8,
+                    **kwargs):
         """
         Return a generator in format: (response, history)
         Eg.
                 tokenizer=tokenizer,
                 query=query,
                 streamer=ChatStreamer(tokenizer=tokenizer),
+                history=history,
                 max_new_tokens=max_new_tokens,
                 do_sample=do_sample,
                 temperature=temperature,
                 top_p=top_p,
+                **kwargs
             )
         def consumer():
 @add_start_docstrings(
     """
     The InternLM Model transformer with a sequence classification head on top (linear layer).
     [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the

pytorch_model-00001-of-00005.bin → pytorch_model-00001-of-00006.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeba743507872c45e7cf951d7996bce448d8deada841d055d2ac03948af0c2b7
-size 9990647029

 version https://git-lfs.github.com/spec/v1
+oid sha256:9c989b1624a481672a7018455d7ff95398ded2a07698ccf2687877db91baf254
+size 7893395149

pytorch_model-00002-of-00005.bin → pytorch_model-00002-of-00006.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a11c8737fce8d6be9a8f6eb0faa44016c94813aed1d50a757ca32abece4ed461
-size 9956594199

 version https://git-lfs.github.com/spec/v1
+oid sha256:11c9b3fc955587d5ea525c787d7677602e0f3d70131259b3c12079e034e68132
+size 7964241876

pytorch_model-00003-of-00005.bin → pytorch_model-00003-of-00006.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5c64167ce104e9a576da50f89a398ac2124734621c45e12ea0addbac99ad87ac
-size 9867486361

 version https://git-lfs.github.com/spec/v1
+oid sha256:366595a002cc2ce217aec0c4885e7c5f840df751155f9e2510e6472e171c02d2
+size 7896062197

pytorch_model-00004-of-00005.bin → pytorch_model-00004-of-00006.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:40e22421695e3206bc85f0a4839641370bc8277ab689ff0e5d75e708d51f8691
-size 9306483281

 version https://git-lfs.github.com/spec/v1
+oid sha256:508cfed19500ecf7678f1680a47a1073b73f8ad5597612c094c8b2e7df8d3931
+size 7964241876

pytorch_model-00005-of-00006.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4182de7c0df21447a5b7ed8cbb68162e85be061f23ddab73b8be157049fb9e31
+size 7403239886

pytorch_model-00005-of-00005.bin → pytorch_model-00006-of-00006.bin RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:263f29c6331d8951fd454d4bbd2991d422bbcfb5b07d4acbb0e75aaf53b1a76c
 size 1056441258

 version https://git-lfs.github.com/spec/v1
+oid sha256:89b0631c7069213a49dd8a3cb9012e52d82ac9328bd96e2bba8383d825720039
 size 1056441258

pytorch_model.bin.index.json CHANGED Viewed

@@ -3,548 +3,548 @@
     "total_size": 40177428480
   },
   "weight_map": {
-    "lm_head.weight": "pytorch_model-00005-of-00005.bin",
-    "model.embed_tokens.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
-    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.40.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.45.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.45.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.45.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
-    "model.layers.46.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.50.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.52.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.53.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.54.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.55.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.56.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.57.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.58.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.59.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
-    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
-    "model.norm.weight": "pytorch_model-00004-of-00005.bin"
   }
 }

     "total_size": 40177428480
   },
   "weight_map": {
+    "lm_head.weight": "pytorch_model-00006-of-00006.bin",
+    "model.embed_tokens.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.23.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00006.bin",
+    "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.input_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.mlp.up_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.35.mlp.down_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00003-of-00006.bin",
+    "model.layers.36.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.40.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.input_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.mlp.down_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.mlp.up_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.48.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.48.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.48.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00004-of-00006.bin",
+    "model.layers.49.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.50.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.52.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.53.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.54.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.55.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.56.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.57.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.58.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.input_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.mlp.down_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.mlp.gate_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.mlp.up_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.post_attention_layernorm.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.self_attn.k_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.self_attn.o_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.self_attn.q_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.59.self_attn.v_proj.weight": "pytorch_model-00005-of-00006.bin",
+    "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00006.bin",
+    "model.norm.weight": "pytorch_model-00005-of-00006.bin"
   }
 }