v1.0.3 #13
by vansin - opened
- config.json +10 -3
- configuration_internlm.py +12 -7
- generation_config.json +1 -2
- modeling_internlm.py +53 -271
- pytorch_model-00001-of-00006.bin → pytorch_model-00001-of-00005.bin +2 -2
- pytorch_model-00002-of-00006.bin → pytorch_model-00002-of-00005.bin +2 -2
- pytorch_model-00003-of-00006.bin → pytorch_model-00003-of-00005.bin +2 -2
- pytorch_model-00004-of-00006.bin → pytorch_model-00004-of-00005.bin +2 -2
- pytorch_model-00006-of-00006.bin → pytorch_model-00005-of-00005.bin +1 -1
- pytorch_model-00005-of-00006.bin +0 -3
- pytorch_model.bin.index.json +543 -543
- tokenization_internlm.py +9 -4
config.json
CHANGED
@@ -20,10 +20,17 @@
   "num_hidden_layers": 60,
   "num_key_value_heads": 40,
   "pad_token_id": 2,
+  "pretraining_tp": 1,
   "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
   "tie_word_embeddings": false,
-  "torch_dtype": "
-  "transformers_version": "4.33.
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.33.1",
   "use_cache": true,
-  "vocab_size": 103168
+  "vocab_size": 103168,
+  "rotary": {
+    "base": 10000,
+    "type": "dynamic"
+  }
 }
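Reviewer note: `pretraining_tp`, `rope_scaling`, and `rope_theta` are LLaMA-style config fields, while the trailing `rotary` block is InternLM's own switch between the original and dynamic-NTK rotary embeddings. A hypothetical round-trip check (not part of this PR), assuming the updated repo is the working directory:

```python
# Hypothetical sanity check, not part of this PR: confirm the new keys survive
# an AutoConfig round trip. Assumes the updated repo is checked out locally.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(".", trust_remote_code=True)
assert config.rotary == {"base": 10000, "type": "dynamic"}
assert config.rope_theta == 10000.0 and config.rope_scaling is None
```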
configuration_internlm.py
CHANGED
@@ -1,7 +1,10 @@
 # coding=utf-8
-# Copyright
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,14 +27,16 @@ logger = logging.get_logger(__name__)
 INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
-# Modified from transformers.model.llama.configuration_llama.LlamaConfig
 class InternLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
     an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
+
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
+
+
     Args:
         vocab_size (`int`, *optional*, defaults to 32000):
             Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
@@ -59,12 +64,16 @@ class InternLMConfig(PretrainedConfig):
         tie_word_embeddings(`bool`, *optional*, defaults to `False`):
             Whether to tie weight embeddings
     Example:
+
     ```python
     >>> from transformers import InternLMModel, InternLMConfig
+
     >>> # Initializing a InternLM internlm-7b style configuration
     >>> configuration = InternLMConfig()
+
     >>> # Initializing a model from the internlm-7b style configuration
     >>> model = InternLMModel(configuration)
+
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
@@ -89,7 +98,6 @@ class InternLMConfig(PretrainedConfig):
         tie_word_embeddings=False,
         bias=True,
         rotary={"base": 10000, "type": "dynamic"},  # pylint: disable=W0102
-        attn_implementation="eager",
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -104,9 +112,6 @@ class InternLMConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.bias = bias
         self.rotary = rotary
-        self.attn_implementation = attn_implementation
-        if self.attn_implementation is None:
-            self.attn_implementation = "eager"
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
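With `attn_implementation` gone from the constructor, the config no longer selects an attention backend; `rotary` is the remaining architectural knob. A minimal sketch, assuming this file is importable from the working directory:

```python
# Minimal sketch (assumes configuration_internlm.py is on the import path).
from configuration_internlm import InternLMConfig

config = InternLMConfig(rotary={"base": 10000, "type": "dynamic"})
print(config.rotary["type"])  # "dynamic" selects the NTK-scaled rotary embedding
```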
generation_config.json
CHANGED
@@ -2,6 +2,5 @@
   "_from_model_config": true,
   "bos_token_id": 1,
   "eos_token_id": 2,
-  "
-  "transformers_version": "4.33.2"
+  "transformers_version": "4.33.1"
 }
modeling_internlm.py
CHANGED
@@ -1,6 +1,10 @@
-#
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,6 +28,7 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
+from transformers.generation.streamers import BaseStreamer
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -37,44 +42,14 @@ from transformers.utils (
     replace_return_docstrings,
 )
 
-try:
-    from transformers.generation.streamers import BaseStreamer
-except:  # noqa # pylint: disable=bare-except
-    BaseStreamer = None
-
 from .configuration_internlm import InternLMConfig
 
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "InternLMConfig"
 
-
-
-def _import_flash_attn():
-    global flash_attn_func, flash_attn_varlen_func
-    global pad_input, index_first_axis, unpad_input
-    try:
-        from flash_attn import flash_attn_func as _flash_attn_func, flash_attn_varlen_func as _flash_attn_varlen_func
-        from flash_attn.bert_padding import pad_input as _pad_input, index_first_axis as _index_first_axis, unpad_input as _unpad_input
-        flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func
-        pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input
-    except ImportError:
-        raise ImportError("flash_attn is not installed.")
-
-
-def _get_unpad_data(attention_mask):
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
-    return (
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-
-
-# Copied from transformers.models.llama.modeling_llama._make_causal_mask
+
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
@@ -92,7 +67,7 @@ def _make_causal_mask(
     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
 
 
-# Copied from transformers.models.
+# Copied from transformers.models.bart.modeling_bart._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
@@ -107,7 +82,6 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM
 class InternLMRMSNorm(nn.Module):
     """RMSNorm implemention."""
 
@@ -130,7 +104,6 @@ class InternLMRMSNorm(nn.Module):
         return self.weight * hidden_states
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM
 class InternLMRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's rotary embedding.
 
@@ -140,7 +113,6 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
         device (Any, optional): Running device. Defaults to None.
     """
-
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
@@ -152,8 +124,8 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()
-        self.register_buffer("sin_cached", emb.sin()
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
@@ -164,15 +136,14 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-        self.register_buffer("cos_cached", emb.cos(), persistent=False)
-        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
         return (
-            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
         )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM
 class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
 
@@ -187,7 +158,7 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.register_buffer("inv_freq", inv_freq
+        self.register_buffer("inv_freq", inv_freq)
         self.dim = dim
         self.base = base
         self.scaling_factor = scaling_factor
@@ -199,8 +170,8 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos(), persistent=False)
-        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
     def _update_cached(self, x, seq_len=None):
         self.max_seq_len_cached = max(seq_len, self.max_position_embeddings)
@@ -214,8 +185,8 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
         t = torch.arange(self.max_seq_len_cached, device=inv_freq.device, dtype=inv_freq.dtype)
         freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos(), persistent=False)
-        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
 
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
@@ -228,12 +199,11 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
             self._update_cached(x, seq_len)
 
         return (
-            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
         )
 
 
-# Copied from transformers.model.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
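All four `register_buffer` call sites now cache `cos`/`sin` as 4-D `[1, 1, seq_len, dim]` tensors sliced along dimension 2, instead of 2-D `[seq_len, dim]` tensors sliced along dimension 0. A standalone sketch of the new shape contract:

```python
# Standalone sketch of the reshaped rope cache; values are dummies.
import torch

dim, seq_len = 8, 4
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
t = torch.arange(seq_len).float()
freqs = torch.einsum("i,j->ij", t, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)

cos_cached = emb.cos()[None, None, :, :]  # [1, 1, seq_len, dim]
assert cos_cached[:, :, :2, ...].shape == (1, 1, 2, dim)
```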
@@ -241,28 +211,25 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-
-
-
-
-
-
-
-        position_ids = torch.stack([torch.cat([torch.ones(max_length - w, dtype=torch.long), torch.arange(w)]) for w in position_ids])
-        k_cos = cos[position_ids].unsqueeze(1).expand(k.shape)
-        k_sin = sin[position_ids].unsqueeze(1).expand(k.shape)
-        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
+    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
+    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
+    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
+    cos = cos.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
+    sin = sin.unsqueeze(0).unsqueeze(0).expand(len(position_ids), -1, -1, -1)
+    if q.size(2) == 1:
+        q_embed = (q * cos[:, :, -1, :]) + (rotate_half(q) * sin[:, :, -1, :])
     else:
-        cos = cos[position_ids].unsqueeze(1)
-        sin = sin[position_ids].unsqueeze(1)
         q_embed = (q * cos) + (rotate_half(q) * sin)
+
+    if k.size(2) == 1:
+        k_embed = (k * cos[:, :, -1, :]) + (rotate_half(k) * sin[:, :, -1, :])
+    else:
         k_embed = (k * cos) + (rotate_half(k) * sin)
+
     return q_embed, k_embed
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->InternLM
 class InternLMMLP(nn.Module):
     def __init__(
         self,
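The rewritten helper special-cases a single-position query or key (`size(2) == 1`), which is the incremental-decoding step: only the last cached `cos`/`sin` row applies to the new token. A toy illustration of that branch with dummy tensors:

```python
# Toy illustration of the q.size(2) == 1 branch; shapes mirror (bs, heads, seq, head_dim).
import torch

bs, heads, seq, hd = 1, 2, 5, 8
cos = torch.randn(1, 1, seq, hd)        # as returned by the rotary embedding's forward
q_step = torch.randn(bs, heads, 1, hd)  # one freshly generated position

cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
cos = cos.unsqueeze(0).unsqueeze(0).expand(bs, -1, -1, -1)
rotated = q_step * cos[:, :, -1, :]  # broadcasts the last position across the query
print(rotated.shape)  # torch.Size([1, 2, 1, 8])
```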
@@ -280,7 +247,6 @@ class InternLMMLP(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->InternLM
 class InternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -302,7 +268,6 @@ class InternLMAttention(nn.Module):
         self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
         self.rotary_emb = self._init_rope()
-        self.is_causal = True
 
     def _init_rope(self):
         if self.config.rotary["type"] == "origin":
@@ -345,6 +310,7 @@ class InternLMAttention(nn.Module):
             key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
 
+        # print(use_cache)
         past_key_value = (key_states, value_states) if use_cache else None
 
         kv_seq_len = key_states.shape[-2]
@@ -387,163 +353,12 @@ class InternLMAttention(nn.Module):
 
         return attn_output, attn_weights, past_key_value
 
-# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->InternLM
-class InternLMFlashAttention2(InternLMAttention):
-    """
-    InternLM flash attention module. This module inherits from `InternLMAttention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # InternLMFlashAttention2 attention does not support output_attentions
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-
-        if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
-
-        kv_seq_len = key_states.shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        attn_output = self._flash_attention_forward(
-            query_states, key_states, value_states, attention_mask, q_len
-        )
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-    def _flash_attention_forward(
-        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-        """
-        # Contains at least one padding token in the sequence
-        causal = self.is_causal and query_length != 1
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
-            attn_output_unpad = flash_attn_varlen_func(
-                query_states,
-                key_states,
-                value_states,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=max_seqlen_in_batch_q,
-                max_seqlen_k=max_seqlen_in_batch_k,
-                dropout_p=dropout,
-                softmax_scale=softmax_scale,
-                causal=causal,
-            )
-
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-        else:
-            attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
-            )
-
-        return attn_output
-
-    def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
-        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
-
-        key_layer = index_first_axis(
-            key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
-        )
-        value_layer = index_first_axis(
-            value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
-        )
-
-        if query_length == kv_seq_len:
-            query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
-            )
-            cu_seqlens_q = cu_seqlens_k
-            max_seqlen_in_batch_q = max_seqlen_in_batch_k
-            indices_q = indices_k
-        elif query_length == 1:
-            max_seqlen_in_batch_q = 1
-            cu_seqlens_q = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
-            indices_q = cu_seqlens_q[:-1]
-            query_layer = query_layer.squeeze(1)
-        else:
-            # The -q_len: slice assumes left padding.
-            attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
-
-        return (
-            query_layer,
-            key_layer,
-            value_layer,
-            indices_q.to(torch.int64),
-            (cu_seqlens_q, cu_seqlens_k),
-            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
-        )
-
-INTERNLM_ATTENTION_CLASSES = {
-    "eager": InternLMAttention,
-    "flash_attention_2": InternLMFlashAttention2,
-}
 
-# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM
 class InternLMDecoderLayer(nn.Module):
     def __init__(self, config: InternLMConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
-
-        self.self_attn = INTERNLM_ATTENTION_CLASSES[config.attn_implementation](config=config)
-
+        self.self_attn = InternLMAttention(config=config)
         self.mlp = InternLMMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
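Dropping `InternLMFlashAttention2` and the `INTERNLM_ATTENTION_CLASSES` registry leaves the eager path as the only attention implementation, so `flash_attn` is no longer imported anywhere. A toy sketch of the computation every layer now performs (illustration only, not the class itself):

```python
# Toy eager attention with a causal mask; dummy shapes, illustration only.
import math
import torch

bs, heads, q_len, hd = 1, 2, 4, 8
q, k, v = (torch.randn(bs, heads, q_len, hd) for _ in range(3))

scores = q @ k.transpose(-1, -2) / math.sqrt(hd)
causal = torch.triu(torch.full((q_len, q_len), float("-inf")), diagonal=1)
attn_output = torch.softmax(scores + causal, dim=-1) @ v
print(attn_output.shape)  # torch.Size([1, 2, 4, 8])
```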
@@ -622,7 +437,6 @@ INTERNLM_START_DOCSTRING = r"""
 """
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaPretrainedModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -704,7 +518,6 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 """
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -722,10 +535,8 @@ class InternLMModel(InternLMPreTrainedModel):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
-        self.config = config
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-
         self.layers = nn.ModuleList([InternLMDecoderLayer(config) for _ in range(config.num_hidden_layers)])
         self.norm = InternLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -784,9 +595,6 @@ class InternLMModel(InternLMPreTrainedModel):
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        if self.config.attn_implementation == "flash_attention_2":
-            _import_flash_attn()
-
         # retrieve input_ids and inputs_embeds
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
@@ -815,16 +623,14 @@ class InternLMModel(InternLMPreTrainedModel):
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-
-
-
-
-        attention_mask = torch.ones(
-            (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
-        )
-        attention_mask = self._prepare_decoder_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
+        # embed positions
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
 
         hidden_states = inputs_embeds
 
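The new `if attention_mask is None` guard matters for padded batches: previously the forward pass rebuilt an all-ones mask unconditionally, discarding any caller-supplied padding mask. A toy illustration:

```python
# Toy illustration: a caller-supplied padding mask now survives the default-mask step.
import torch

batch_size, seq_length_with_past = 2, 4
attention_mask = torch.tensor([[0, 1, 1, 1], [1, 1, 1, 1]], dtype=torch.bool)  # left padding

if attention_mask is None:  # new guard: only synthesize a mask when none was given
    attention_mask = torch.ones((batch_size, seq_length_with_past), dtype=torch.bool)
print(attention_mask[0])  # tensor([False,  True,  True,  True]) - padding preserved
```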
@@ -897,7 +703,6 @@ class InternLMModel(InternLMPreTrainedModel):
         )
 
 
-# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->InternLM
 class InternLMForCausalLM(InternLMPreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
 
@@ -950,7 +755,6 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
         Returns:
-
         Example:
         ```python
         >>> from transformers import AutoTokenizer, InternLMForCausalLM
@@ -962,9 +766,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
-        ```
-
-        """
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1049,17 +851,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
-
-    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []
-
-            prompt = ""
-        else:
-            prompt = tokenizer.bos_token
-        if meta_instruction:
-            prompt += f"""<|System|>:{meta_instruction}\n"""
+
+    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []):
+        prompt = ""
         for record in history:
-            prompt += f"""<|User|>:{record[0]}
-        prompt += f"""<|User|>:{query}
+            prompt += f"""<|User|>:{record[0]}<eoh>\n<|Bot|>:{record[1]}<eoa>\n"""
+        prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
         return tokenizer([prompt], return_tensors="pt")
 
     @torch.no_grad()
@@ -1073,12 +870,9 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         do_sample: bool = True,
         temperature: float = 0.8,
         top_p: float = 0.8,
-        meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
-        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
-        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
         **kwargs,
     ):
-        inputs = self.build_inputs(tokenizer, query, history
+        inputs = self.build_inputs(tokenizer, query, history)
         inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
         outputs = self.generate(
             **inputs,
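The simplified `build_inputs` drops the `meta_instruction` system prompt entirely and always starts from an empty string. A worked example of the prompt it now produces (history and query are made up, not from the PR):

```python
# Worked example of the new prompt layout; <eoh>/<eoa> mark end-of-human/assistant turns.
history = [("你好", "你好,有什么可以帮助您的吗?")]
query = "Introduce InternLM"

prompt = ""
for record in history:
    prompt += f"""<|User|>:{record[0]}<eoh>\n<|Bot|>:{record[1]}<eoa>\n"""
prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
print(prompt)
# <|User|>:你好<eoh>
# <|Bot|>:你好,有什么可以帮助您的吗?<eoa>
# <|User|>:Introduce InternLM<eoh>
# <|Bot|>:
```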
@@ -1113,11 +907,6 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
         ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
         """
-        if BaseStreamer is None:
-            raise ModuleNotFoundError(
-                "The version of `transformers` is too low. Please make sure "
-                "that you have installed `transformers>=4.28.0`."
-            )
 
         response_queue = queue.Queue(maxsize=20)
 
@@ -1129,7 +918,6 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             self.query = query
             self.history = history
             self.response = ""
-            self.cache = []
             self.received_inputs = False
             self.queue.put((self.response, history + [(self.query, self.response)]))
 
@@ -1144,17 +932,11 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 self.received_inputs = True
                 return
 
-            self.
-            token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
-            if "�" in token and len(token) <= 5:
-                return
+            token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
             if token.strip() != "<eoa>":
                 self.response = self.response + token
                 history = self.history + [(self.query, self.response)]
                 self.queue.put((self.response, history))
-                self.cache = []
-            else:
-                self.end()
 
         def end(self):
             self.queue.put(None)
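The streamer callback now decodes each new token id on its own instead of buffering ids in `self.cache`; note that the removed `�` check previously guarded against emitting half of a multi-byte character, so that trade-off is worth flagging. A sketch of the per-token decode, assuming the InternLM tokenizer loads:

```python
# Sketch of per-token decoding as the streamer now does it; repo id is assumed.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-20b", trust_remote_code=True)
ids = tokenizer("Hello InternLM").input_ids
for token_id in ids:
    print(tokenizer.decode([token_id], skip_special_tokens=True), end="")
```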
@@ -1301,4 +1083,4 @@ class InternLMForSequenceClassification(InternLMPreTrainedModel):
             past_key_values=transformer_outputs.past_key_values,
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
-        )
+        )
pytorch_model-00001-of-00006.bin → pytorch_model-00001-of-00005.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:aeba743507872c45e7cf951d7996bce448d8deada841d055d2ac03948af0c2b7
+size 9990647029

pytorch_model-00002-of-00006.bin → pytorch_model-00002-of-00005.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:a11c8737fce8d6be9a8f6eb0faa44016c94813aed1d50a757ca32abece4ed461
+size 9956594199

pytorch_model-00003-of-00006.bin → pytorch_model-00003-of-00005.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5c64167ce104e9a576da50f89a398ac2124734621c45e12ea0addbac99ad87ac
+size 9867486361

pytorch_model-00004-of-00006.bin → pytorch_model-00004-of-00005.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:40e22421695e3206bc85f0a4839641370bc8277ab689ff0e5d75e708d51f8691
+size 9306483281

pytorch_model-00006-of-00006.bin → pytorch_model-00005-of-00005.bin
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:263f29c6331d8951fd454d4bbd2991d422bbcfb5b07d4acbb0e75aaf53b1a76c
 size 1056441258

pytorch_model-00005-of-00006.bin
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:4182de7c0df21447a5b7ed8cbb68162e85be061f23ddab73b8be157049fb9e31
-size 7403239886
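The index diff below renumbers every `weight_map` entry from the six-shard to the five-shard layout. A hypothetical local check (not part of the PR) that the index and the renamed shards agree, assuming they sit in the working directory:

```python
# Hypothetical consistency check, not part of the PR; paths assumed local.
import json
import os

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

shards = sorted(set(index["weight_map"].values()))
assert len(shards) == 5 and all(os.path.exists(s) for s in shards)
print(index["metadata"]["total_size"])  # 40177428480
```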
pytorch_model.bin.index.json
CHANGED
@@ -3,548 +3,548 @@
     "total_size": 40177428480
   },
   "weight_map": {
-    "lm_head.weight": "pytorch_model-
-    "model.embed_tokens.weight": "pytorch_model-00001-of-
-    "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-
-    "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-
-    "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.input_layernorm.weight": "pytorch_model-
-    "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.mlp.up_proj.weight": "pytorch_model-
-    "model.layers.10.post_attention_layernorm.weight": "pytorch_model-
-    "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-
-    "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-
-    "model.layers.11.input_layernorm.weight": "pytorch_model-
-    "model.layers.11.mlp.down_proj.weight": "pytorch_model-
-    "model.layers.11.mlp.gate_proj.weight": "pytorch_model-
-    "model.layers.11.mlp.up_proj.weight": "pytorch_model-
-    "model.layers.11.post_attention_layernorm.weight": "pytorch_model-
-    "model.layers.11.self_attn.k_proj.weight": "pytorch_model-
-    "model.layers.11.self_attn.o_proj.weight": "pytorch_model-
-    "model.layers.11.self_attn.q_proj.weight": "pytorch_model-
-    "model.layers.11.self_attn.v_proj.weight": "pytorch_model-
-    "model.layers.12.input_layernorm.weight": "pytorch_model-
-    "model.layers.12.mlp.down_proj.weight": "pytorch_model-
-    "model.layers.12.mlp.gate_proj.weight": "pytorch_model-
-    "model.layers.12.mlp.up_proj.weight": "pytorch_model-
-    "model.layers.12.post_attention_layernorm.weight": "pytorch_model-
-    "model.layers.12.self_attn.k_proj.weight": "pytorch_model-
-    "model.layers.12.self_attn.o_proj.weight": "pytorch_model-
-    "model.layers.12.self_attn.q_proj.weight": "pytorch_model-
-    "model.layers.12.self_attn.v_proj.weight": "pytorch_model-
-    "model.layers.13.input_layernorm.weight": "pytorch_model-
-    "model.layers.13.mlp.down_proj.weight": "pytorch_model-
-    "model.layers.13.mlp.gate_proj.weight": "pytorch_model-
-    "model.layers.13.mlp.up_proj.weight": "pytorch_model-
-    "model.layers.13.post_attention_layernorm.weight": "pytorch_model-
-    "model.layers.13.self_attn.k_proj.weight": "pytorch_model-
-    "model.layers.13.self_attn.o_proj.weight": "pytorch_model-
-    "model.layers.13.self_attn.q_proj.weight": "pytorch_model-
-    "model.layers.13.self_attn.v_proj.weight": "pytorch_model-
-    "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.14.self_attn.q_proj.weight": "pytorch_model-
-    "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-
-    "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-
-    "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-
-    "model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-
-    "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-
-    "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-
|
| 152 |
-
"model.layers.23.input_layernorm.weight": "pytorch_model-
|
| 153 |
-
"model.layers.23.mlp.down_proj.weight": "pytorch_model-
|
| 154 |
-
"model.layers.23.mlp.gate_proj.weight": "pytorch_model-
|
| 155 |
-
"model.layers.23.mlp.up_proj.weight": "pytorch_model-
|
| 156 |
-
"model.layers.23.post_attention_layernorm.weight": "pytorch_model-
|
| 157 |
-
"model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-
|
| 158 |
-
"model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-
|
| 159 |
-
"model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-
|
| 160 |
-
"model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-
|
| 161 |
-
"model.layers.24.input_layernorm.weight": "pytorch_model-
|
| 162 |
-
"model.layers.24.mlp.down_proj.weight": "pytorch_model-
|
| 163 |
-
"model.layers.24.mlp.gate_proj.weight": "pytorch_model-
|
| 164 |
-
"model.layers.24.mlp.up_proj.weight": "pytorch_model-
|
| 165 |
-
"model.layers.24.post_attention_layernorm.weight": "pytorch_model-
|
| 166 |
-
"model.layers.24.self_attn.k_proj.weight": "pytorch_model-
|
| 167 |
-
"model.layers.24.self_attn.o_proj.weight": "pytorch_model-
|
| 168 |
-
"model.layers.24.self_attn.q_proj.weight": "pytorch_model-
|
| 169 |
-
"model.layers.24.self_attn.v_proj.weight": "pytorch_model-
|
| 170 |
-
"model.layers.25.input_layernorm.weight": "pytorch_model-
|
| 171 |
-
"model.layers.25.mlp.down_proj.weight": "pytorch_model-
|
| 172 |
-
"model.layers.25.mlp.gate_proj.weight": "pytorch_model-
|
| 173 |
-
"model.layers.25.mlp.up_proj.weight": "pytorch_model-
|
| 174 |
-
"model.layers.25.post_attention_layernorm.weight": "pytorch_model-
|
| 175 |
-
"model.layers.25.self_attn.k_proj.weight": "pytorch_model-
|
| 176 |
-
"model.layers.25.self_attn.o_proj.weight": "pytorch_model-
|
| 177 |
-
"model.layers.25.self_attn.q_proj.weight": "pytorch_model-
|
| 178 |
-
"model.layers.25.self_attn.v_proj.weight": "pytorch_model-
|
| 179 |
-
"model.layers.26.input_layernorm.weight": "pytorch_model-
|
| 180 |
-
"model.layers.26.mlp.down_proj.weight": "pytorch_model-
|
| 181 |
-
"model.layers.26.mlp.gate_proj.weight": "pytorch_model-
|
| 182 |
-
"model.layers.26.mlp.up_proj.weight": "pytorch_model-
|
| 183 |
-
"model.layers.26.post_attention_layernorm.weight": "pytorch_model-
|
| 184 |
-
"model.layers.26.self_attn.k_proj.weight": "pytorch_model-
|
| 185 |
-
"model.layers.26.self_attn.o_proj.weight": "pytorch_model-
|
| 186 |
-
"model.layers.26.self_attn.q_proj.weight": "pytorch_model-
|
| 187 |
-
"model.layers.26.self_attn.v_proj.weight": "pytorch_model-
|
| 188 |
-
"model.layers.27.input_layernorm.weight": "pytorch_model-
|
| 189 |
-
"model.layers.27.mlp.down_proj.weight": "pytorch_model-
|
| 190 |
-
"model.layers.27.mlp.gate_proj.weight": "pytorch_model-
|
| 191 |
-
"model.layers.27.mlp.up_proj.weight": "pytorch_model-
|
| 192 |
-
"model.layers.27.post_attention_layernorm.weight": "pytorch_model-
|
| 193 |
-
"model.layers.27.self_attn.k_proj.weight": "pytorch_model-
|
| 194 |
-
"model.layers.27.self_attn.o_proj.weight": "pytorch_model-
|
| 195 |
-
"model.layers.27.self_attn.q_proj.weight": "pytorch_model-
|
| 196 |
-
"model.layers.27.self_attn.v_proj.weight": "pytorch_model-
|
| 197 |
-
"model.layers.28.input_layernorm.weight": "pytorch_model-
|
| 198 |
-
"model.layers.28.mlp.down_proj.weight": "pytorch_model-
|
| 199 |
-
"model.layers.28.mlp.gate_proj.weight": "pytorch_model-
|
| 200 |
-
"model.layers.28.mlp.up_proj.weight": "pytorch_model-
|
| 201 |
-
"model.layers.28.post_attention_layernorm.weight": "pytorch_model-
|
| 202 |
-
"model.layers.28.self_attn.k_proj.weight": "pytorch_model-
|
| 203 |
-
"model.layers.28.self_attn.o_proj.weight": "pytorch_model-
|
| 204 |
-
"model.layers.28.self_attn.q_proj.weight": "pytorch_model-
|
| 205 |
-
"model.layers.28.self_attn.v_proj.weight": "pytorch_model-
|
| 206 |
-
"model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-
|
| 207 |
-
"model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 208 |
-
"model.layers.29.mlp.gate_proj.weight": "pytorch_model-
|
| 209 |
-
"model.layers.29.mlp.up_proj.weight": "pytorch_model-
|
| 210 |
-
"model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 211 |
-
"model.layers.29.self_attn.k_proj.weight": "pytorch_model-
|
| 212 |
-
"model.layers.29.self_attn.o_proj.weight": "pytorch_model-
|
| 213 |
-
"model.layers.29.self_attn.q_proj.weight": "pytorch_model-
|
| 214 |
-
"model.layers.29.self_attn.v_proj.weight": "pytorch_model-
|
| 215 |
-
"model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-
|
| 216 |
-
"model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 217 |
-
"model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 218 |
-
"model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 219 |
-
"model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 220 |
-
"model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 221 |
-
"model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 222 |
-
"model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 223 |
-
"model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 224 |
-
"model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-
|
| 225 |
-
"model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 226 |
-
"model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 227 |
-
"model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-
|
| 228 |
-
"model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 229 |
-
"model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 230 |
-
"model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 231 |
-
"model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 232 |
-
"model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 233 |
-
"model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-
|
| 234 |
-
"model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 235 |
-
"model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 236 |
-
"model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-
|
| 237 |
-
"model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 238 |
-
"model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 239 |
-
"model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 240 |
-
"model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 241 |
-
"model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 242 |
-
"model.layers.32.input_layernorm.weight": "pytorch_model-00003-of-
|
| 243 |
-
"model.layers.32.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 244 |
-
"model.layers.32.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 245 |
-
"model.layers.32.mlp.up_proj.weight": "pytorch_model-00003-of-
|
| 246 |
-
"model.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 247 |
-
"model.layers.32.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 248 |
-
"model.layers.32.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 249 |
-
"model.layers.32.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 250 |
-
"model.layers.32.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 251 |
-
"model.layers.33.input_layernorm.weight": "pytorch_model-00003-of-
|
| 252 |
-
"model.layers.33.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 253 |
-
"model.layers.33.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 254 |
-
"model.layers.33.mlp.up_proj.weight": "pytorch_model-00003-of-
|
| 255 |
-
"model.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 256 |
-
"model.layers.33.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 257 |
-
"model.layers.33.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 258 |
-
"model.layers.33.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 259 |
-
"model.layers.33.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 260 |
-
"model.layers.34.input_layernorm.weight": "pytorch_model-00003-of-
|
| 261 |
-
"model.layers.34.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 262 |
-
"model.layers.34.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 263 |
-
"model.layers.34.mlp.up_proj.weight": "pytorch_model-00003-of-
|
| 264 |
-
"model.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-
|
| 265 |
-
"model.layers.34.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 266 |
-
"model.layers.34.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 267 |
-
"model.layers.34.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 268 |
-
"model.layers.34.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 269 |
-
"model.layers.35.input_layernorm.weight": "pytorch_model-
|
| 270 |
-
"model.layers.35.mlp.down_proj.weight": "pytorch_model-00003-of-
|
| 271 |
-
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00003-of-
|
| 272 |
-
"model.layers.35.mlp.up_proj.weight": "pytorch_model-
|
| 273 |
-
"model.layers.35.post_attention_layernorm.weight": "pytorch_model-
|
| 274 |
-
"model.layers.35.self_attn.k_proj.weight": "pytorch_model-00003-of-
|
| 275 |
-
"model.layers.35.self_attn.o_proj.weight": "pytorch_model-00003-of-
|
| 276 |
-
"model.layers.35.self_attn.q_proj.weight": "pytorch_model-00003-of-
|
| 277 |
-
"model.layers.35.self_attn.v_proj.weight": "pytorch_model-00003-of-
|
| 278 |
-
"model.layers.36.input_layernorm.weight": "pytorch_model-
|
| 279 |
-
"model.layers.36.mlp.down_proj.weight": "pytorch_model-
|
| 280 |
-
"model.layers.36.mlp.gate_proj.weight": "pytorch_model-
|
| 281 |
-
"model.layers.36.mlp.up_proj.weight": "pytorch_model-
|
| 282 |
-
"model.layers.36.post_attention_layernorm.weight": "pytorch_model-
|
| 283 |
-
"model.layers.36.self_attn.k_proj.weight": "pytorch_model-
|
| 284 |
-
"model.layers.36.self_attn.o_proj.weight": "pytorch_model-
|
| 285 |
-
"model.layers.36.self_attn.q_proj.weight": "pytorch_model-
|
| 286 |
-
"model.layers.36.self_attn.v_proj.weight": "pytorch_model-
|
| 287 |
-
"model.layers.37.input_layernorm.weight": "pytorch_model-
|
| 288 |
-
"model.layers.37.mlp.down_proj.weight": "pytorch_model-
|
| 289 |
-
"model.layers.37.mlp.gate_proj.weight": "pytorch_model-
|
| 290 |
-
"model.layers.37.mlp.up_proj.weight": "pytorch_model-
|
| 291 |
-
"model.layers.37.post_attention_layernorm.weight": "pytorch_model-
|
| 292 |
-
"model.layers.37.self_attn.k_proj.weight": "pytorch_model-
|
| 293 |
-
"model.layers.37.self_attn.o_proj.weight": "pytorch_model-
|
| 294 |
-
"model.layers.37.self_attn.q_proj.weight": "pytorch_model-
|
| 295 |
-
"model.layers.37.self_attn.v_proj.weight": "pytorch_model-
|
| 296 |
-
"model.layers.38.input_layernorm.weight": "pytorch_model-
|
| 297 |
-
"model.layers.38.mlp.down_proj.weight": "pytorch_model-
|
| 298 |
-
"model.layers.38.mlp.gate_proj.weight": "pytorch_model-
|
| 299 |
-
"model.layers.38.mlp.up_proj.weight": "pytorch_model-
|
| 300 |
-
"model.layers.38.post_attention_layernorm.weight": "pytorch_model-
|
| 301 |
-
"model.layers.38.self_attn.k_proj.weight": "pytorch_model-
|
| 302 |
-
"model.layers.38.self_attn.o_proj.weight": "pytorch_model-
|
| 303 |
-
"model.layers.38.self_attn.q_proj.weight": "pytorch_model-
|
| 304 |
-
"model.layers.38.self_attn.v_proj.weight": "pytorch_model-
|
| 305 |
-
"model.layers.39.input_layernorm.weight": "pytorch_model-
|
| 306 |
-
"model.layers.39.mlp.down_proj.weight": "pytorch_model-
|
| 307 |
-
"model.layers.39.mlp.gate_proj.weight": "pytorch_model-
|
| 308 |
-
"model.layers.39.mlp.up_proj.weight": "pytorch_model-
|
| 309 |
-
"model.layers.39.post_attention_layernorm.weight": "pytorch_model-
|
| 310 |
-
"model.layers.39.self_attn.k_proj.weight": "pytorch_model-
|
| 311 |
-
"model.layers.39.self_attn.o_proj.weight": "pytorch_model-
|
| 312 |
-
"model.layers.39.self_attn.q_proj.weight": "pytorch_model-
|
| 313 |
-
"model.layers.39.self_attn.v_proj.weight": "pytorch_model-
|
| 314 |
-
"model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-
|
| 315 |
-
"model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 316 |
-
"model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 317 |
-
"model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 318 |
-
"model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 319 |
-
"model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 320 |
-
"model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 321 |
-
"model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 322 |
-
"model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 323 |
-
"model.layers.40.input_layernorm.weight": "pytorch_model-
|
| 324 |
-
"model.layers.40.mlp.down_proj.weight": "pytorch_model-
|
| 325 |
-
"model.layers.40.mlp.gate_proj.weight": "pytorch_model-
|
| 326 |
-
"model.layers.40.mlp.up_proj.weight": "pytorch_model-
|
| 327 |
-
"model.layers.40.post_attention_layernorm.weight": "pytorch_model-
|
| 328 |
-
"model.layers.40.self_attn.k_proj.weight": "pytorch_model-
|
| 329 |
-
"model.layers.40.self_attn.o_proj.weight": "pytorch_model-
|
| 330 |
-
"model.layers.40.self_attn.q_proj.weight": "pytorch_model-
|
| 331 |
-
"model.layers.40.self_attn.v_proj.weight": "pytorch_model-
|
| 332 |
-
"model.layers.41.input_layernorm.weight": "pytorch_model-
|
| 333 |
-
"model.layers.41.mlp.down_proj.weight": "pytorch_model-
|
| 334 |
-
"model.layers.41.mlp.gate_proj.weight": "pytorch_model-
|
| 335 |
-
"model.layers.41.mlp.up_proj.weight": "pytorch_model-
|
| 336 |
-
"model.layers.41.post_attention_layernorm.weight": "pytorch_model-
|
| 337 |
-
"model.layers.41.self_attn.k_proj.weight": "pytorch_model-
|
| 338 |
-
"model.layers.41.self_attn.o_proj.weight": "pytorch_model-
|
| 339 |
-
"model.layers.41.self_attn.q_proj.weight": "pytorch_model-
|
| 340 |
-
"model.layers.41.self_attn.v_proj.weight": "pytorch_model-
|
| 341 |
-
"model.layers.42.input_layernorm.weight": "pytorch_model-
|
| 342 |
-
"model.layers.42.mlp.down_proj.weight": "pytorch_model-
|
| 343 |
-
"model.layers.42.mlp.gate_proj.weight": "pytorch_model-
|
| 344 |
-
"model.layers.42.mlp.up_proj.weight": "pytorch_model-
|
| 345 |
-
"model.layers.42.post_attention_layernorm.weight": "pytorch_model-
|
| 346 |
-
"model.layers.42.self_attn.k_proj.weight": "pytorch_model-
|
| 347 |
-
"model.layers.42.self_attn.o_proj.weight": "pytorch_model-
|
| 348 |
-
"model.layers.42.self_attn.q_proj.weight": "pytorch_model-
|
| 349 |
-
"model.layers.42.self_attn.v_proj.weight": "pytorch_model-
|
| 350 |
-
"model.layers.43.input_layernorm.weight": "pytorch_model-
|
| 351 |
-
"model.layers.43.mlp.down_proj.weight": "pytorch_model-
|
| 352 |
-
"model.layers.43.mlp.gate_proj.weight": "pytorch_model-
|
| 353 |
-
"model.layers.43.mlp.up_proj.weight": "pytorch_model-
|
| 354 |
-
"model.layers.43.post_attention_layernorm.weight": "pytorch_model-
|
| 355 |
-
"model.layers.43.self_attn.k_proj.weight": "pytorch_model-
|
| 356 |
-
"model.layers.43.self_attn.o_proj.weight": "pytorch_model-
|
| 357 |
-
"model.layers.43.self_attn.q_proj.weight": "pytorch_model-
|
| 358 |
-
"model.layers.43.self_attn.v_proj.weight": "pytorch_model-
|
| 359 |
-
"model.layers.44.input_layernorm.weight": "pytorch_model-
|
| 360 |
-
"model.layers.44.mlp.down_proj.weight": "pytorch_model-
|
| 361 |
-
"model.layers.44.mlp.gate_proj.weight": "pytorch_model-
|
| 362 |
-
"model.layers.44.mlp.up_proj.weight": "pytorch_model-
|
| 363 |
-
"model.layers.44.post_attention_layernorm.weight": "pytorch_model-
|
| 364 |
-
"model.layers.44.self_attn.k_proj.weight": "pytorch_model-
|
| 365 |
-
"model.layers.44.self_attn.o_proj.weight": "pytorch_model-
|
| 366 |
-
"model.layers.44.self_attn.q_proj.weight": "pytorch_model-
|
| 367 |
-
"model.layers.44.self_attn.v_proj.weight": "pytorch_model-
|
| 368 |
-
"model.layers.45.input_layernorm.weight": "pytorch_model-00004-of-
|
| 369 |
-
"model.layers.45.mlp.down_proj.weight": "pytorch_model-00004-of-
|
| 370 |
-
"model.layers.45.mlp.gate_proj.weight": "pytorch_model-00004-of-
|
| 371 |
-
"model.layers.45.mlp.up_proj.weight": "pytorch_model-00004-of-
|
| 372 |
-
"model.layers.45.post_attention_layernorm.weight": "pytorch_model-00004-of-
|
| 373 |
-
"model.layers.45.self_attn.k_proj.weight": "pytorch_model-
|
| 374 |
-
"model.layers.45.self_attn.o_proj.weight": "pytorch_model-
|
| 375 |
-
"model.layers.45.self_attn.q_proj.weight": "pytorch_model-
|
| 376 |
-
"model.layers.45.self_attn.v_proj.weight": "pytorch_model-
|
| 377 |
-
"model.layers.46.input_layernorm.weight": "pytorch_model-00004-of-
|
| 378 |
-
"model.layers.46.mlp.down_proj.weight": "pytorch_model-00004-of-
|
| 379 |
-
"model.layers.46.mlp.gate_proj.weight": "pytorch_model-00004-of-
|
| 380 |
-
"model.layers.46.mlp.up_proj.weight": "pytorch_model-00004-of-
|
| 381 |
-
"model.layers.46.post_attention_layernorm.weight": "pytorch_model-00004-of-
|
| 382 |
-
"model.layers.46.self_attn.k_proj.weight": "pytorch_model-00004-of-
|
| 383 |
-
"model.layers.46.self_attn.o_proj.weight": "pytorch_model-00004-of-
|
| 384 |
-
"model.layers.46.self_attn.q_proj.weight": "pytorch_model-00004-of-
|
| 385 |
-
"model.layers.46.self_attn.v_proj.weight": "pytorch_model-00004-of-
|
| 386 |
-
"model.layers.47.input_layernorm.weight": "pytorch_model-00004-of-
|
| 387 |
-
"model.layers.47.mlp.down_proj.weight": "pytorch_model-00004-of-
|
| 388 |
-
"model.layers.47.mlp.gate_proj.weight": "pytorch_model-00004-of-
|
| 389 |
-
"model.layers.47.mlp.up_proj.weight": "pytorch_model-00004-of-
|
| 390 |
-
"model.layers.47.post_attention_layernorm.weight": "pytorch_model-00004-of-
|
| 391 |
-
"model.layers.47.self_attn.k_proj.weight": "pytorch_model-00004-of-
|
| 392 |
-
"model.layers.47.self_attn.o_proj.weight": "pytorch_model-00004-of-
|
| 393 |
-
"model.layers.47.self_attn.q_proj.weight": "pytorch_model-00004-of-
|
| 394 |
-
"model.layers.47.self_attn.v_proj.weight": "pytorch_model-00004-of-
|
| 395 |
-
"model.layers.48.input_layernorm.weight": "pytorch_model-
|
| 396 |
-
"model.layers.48.mlp.down_proj.weight": "pytorch_model-
|
| 397 |
-
"model.layers.48.mlp.gate_proj.weight": "pytorch_model-
|
| 398 |
-
"model.layers.48.mlp.up_proj.weight": "pytorch_model-
|
| 399 |
-
"model.layers.48.post_attention_layernorm.weight": "pytorch_model-
|
| 400 |
-
"model.layers.48.self_attn.k_proj.weight": "pytorch_model-00004-of-
|
| 401 |
-
"model.layers.48.self_attn.o_proj.weight": "pytorch_model-00004-of-
|
| 402 |
-
"model.layers.48.self_attn.q_proj.weight": "pytorch_model-00004-of-
|
| 403 |
-
"model.layers.48.self_attn.v_proj.weight": "pytorch_model-00004-of-
|
| 404 |
-
"model.layers.49.input_layernorm.weight": "pytorch_model-
|
| 405 |
-
"model.layers.49.mlp.down_proj.weight": "pytorch_model-
|
| 406 |
-
"model.layers.49.mlp.gate_proj.weight": "pytorch_model-
|
| 407 |
-
"model.layers.49.mlp.up_proj.weight": "pytorch_model-
|
| 408 |
-
"model.layers.49.post_attention_layernorm.weight": "pytorch_model-
|
| 409 |
-
"model.layers.49.self_attn.k_proj.weight": "pytorch_model-
|
| 410 |
-
"model.layers.49.self_attn.o_proj.weight": "pytorch_model-
|
| 411 |
-
"model.layers.49.self_attn.q_proj.weight": "pytorch_model-
|
| 412 |
-
"model.layers.49.self_attn.v_proj.weight": "pytorch_model-
|
| 413 |
-
"model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-
|
| 414 |
-
"model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 415 |
-
"model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 416 |
-
"model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 417 |
-
"model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 418 |
-
"model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 419 |
-
"model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 420 |
-
"model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 421 |
-
"model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 422 |
-
"model.layers.50.input_layernorm.weight": "pytorch_model-
|
| 423 |
-
"model.layers.50.mlp.down_proj.weight": "pytorch_model-
|
| 424 |
-
"model.layers.50.mlp.gate_proj.weight": "pytorch_model-
|
| 425 |
-
"model.layers.50.mlp.up_proj.weight": "pytorch_model-
|
| 426 |
-
"model.layers.50.post_attention_layernorm.weight": "pytorch_model-
|
| 427 |
-
"model.layers.50.self_attn.k_proj.weight": "pytorch_model-
|
| 428 |
-
"model.layers.50.self_attn.o_proj.weight": "pytorch_model-
|
| 429 |
-
"model.layers.50.self_attn.q_proj.weight": "pytorch_model-
|
| 430 |
-
"model.layers.50.self_attn.v_proj.weight": "pytorch_model-
|
| 431 |
-
"model.layers.51.input_layernorm.weight": "pytorch_model-
|
| 432 |
-
"model.layers.51.mlp.down_proj.weight": "pytorch_model-
|
| 433 |
-
"model.layers.51.mlp.gate_proj.weight": "pytorch_model-
|
| 434 |
-
"model.layers.51.mlp.up_proj.weight": "pytorch_model-
|
| 435 |
-
"model.layers.51.post_attention_layernorm.weight": "pytorch_model-
|
| 436 |
-
"model.layers.51.self_attn.k_proj.weight": "pytorch_model-
|
| 437 |
-
"model.layers.51.self_attn.o_proj.weight": "pytorch_model-
|
| 438 |
-
"model.layers.51.self_attn.q_proj.weight": "pytorch_model-
|
| 439 |
-
"model.layers.51.self_attn.v_proj.weight": "pytorch_model-
|
| 440 |
-
"model.layers.52.input_layernorm.weight": "pytorch_model-
|
| 441 |
-
"model.layers.52.mlp.down_proj.weight": "pytorch_model-
|
| 442 |
-
"model.layers.52.mlp.gate_proj.weight": "pytorch_model-
|
| 443 |
-
"model.layers.52.mlp.up_proj.weight": "pytorch_model-
|
| 444 |
-
"model.layers.52.post_attention_layernorm.weight": "pytorch_model-
|
| 445 |
-
"model.layers.52.self_attn.k_proj.weight": "pytorch_model-
|
| 446 |
-
"model.layers.52.self_attn.o_proj.weight": "pytorch_model-
|
| 447 |
-
"model.layers.52.self_attn.q_proj.weight": "pytorch_model-
|
| 448 |
-
"model.layers.52.self_attn.v_proj.weight": "pytorch_model-
|
| 449 |
-
"model.layers.53.input_layernorm.weight": "pytorch_model-
|
| 450 |
-
"model.layers.53.mlp.down_proj.weight": "pytorch_model-
|
| 451 |
-
"model.layers.53.mlp.gate_proj.weight": "pytorch_model-
|
| 452 |
-
"model.layers.53.mlp.up_proj.weight": "pytorch_model-
|
| 453 |
-
"model.layers.53.post_attention_layernorm.weight": "pytorch_model-
|
| 454 |
-
"model.layers.53.self_attn.k_proj.weight": "pytorch_model-
|
| 455 |
-
"model.layers.53.self_attn.o_proj.weight": "pytorch_model-
|
| 456 |
-
"model.layers.53.self_attn.q_proj.weight": "pytorch_model-
|
| 457 |
-
"model.layers.53.self_attn.v_proj.weight": "pytorch_model-
|
| 458 |
-
"model.layers.54.input_layernorm.weight": "pytorch_model-
|
| 459 |
-
"model.layers.54.mlp.down_proj.weight": "pytorch_model-
|
| 460 |
-
"model.layers.54.mlp.gate_proj.weight": "pytorch_model-
|
| 461 |
-
"model.layers.54.mlp.up_proj.weight": "pytorch_model-
|
| 462 |
-
"model.layers.54.post_attention_layernorm.weight": "pytorch_model-
|
| 463 |
-
"model.layers.54.self_attn.k_proj.weight": "pytorch_model-
|
| 464 |
-
"model.layers.54.self_attn.o_proj.weight": "pytorch_model-
|
| 465 |
-
"model.layers.54.self_attn.q_proj.weight": "pytorch_model-
|
| 466 |
-
"model.layers.54.self_attn.v_proj.weight": "pytorch_model-
|
| 467 |
-
"model.layers.55.input_layernorm.weight": "pytorch_model-
|
| 468 |
-
"model.layers.55.mlp.down_proj.weight": "pytorch_model-
|
| 469 |
-
"model.layers.55.mlp.gate_proj.weight": "pytorch_model-
|
| 470 |
-
"model.layers.55.mlp.up_proj.weight": "pytorch_model-
|
| 471 |
-
"model.layers.55.post_attention_layernorm.weight": "pytorch_model-
|
| 472 |
-
"model.layers.55.self_attn.k_proj.weight": "pytorch_model-
|
| 473 |
-
"model.layers.55.self_attn.o_proj.weight": "pytorch_model-
|
| 474 |
-
"model.layers.55.self_attn.q_proj.weight": "pytorch_model-
|
| 475 |
-
"model.layers.55.self_attn.v_proj.weight": "pytorch_model-
|
| 476 |
-
"model.layers.56.input_layernorm.weight": "pytorch_model-
|
| 477 |
-
"model.layers.56.mlp.down_proj.weight": "pytorch_model-
|
| 478 |
-
"model.layers.56.mlp.gate_proj.weight": "pytorch_model-
|
| 479 |
-
"model.layers.56.mlp.up_proj.weight": "pytorch_model-
|
| 480 |
-
"model.layers.56.post_attention_layernorm.weight": "pytorch_model-
|
| 481 |
-
"model.layers.56.self_attn.k_proj.weight": "pytorch_model-
|
| 482 |
-
"model.layers.56.self_attn.o_proj.weight": "pytorch_model-
|
| 483 |
-
"model.layers.56.self_attn.q_proj.weight": "pytorch_model-
|
| 484 |
-
"model.layers.56.self_attn.v_proj.weight": "pytorch_model-
|
| 485 |
-
"model.layers.57.input_layernorm.weight": "pytorch_model-
|
| 486 |
-
"model.layers.57.mlp.down_proj.weight": "pytorch_model-
|
| 487 |
-
"model.layers.57.mlp.gate_proj.weight": "pytorch_model-
|
| 488 |
-
"model.layers.57.mlp.up_proj.weight": "pytorch_model-
|
| 489 |
-
"model.layers.57.post_attention_layernorm.weight": "pytorch_model-
|
| 490 |
-
"model.layers.57.self_attn.k_proj.weight": "pytorch_model-
|
| 491 |
-
"model.layers.57.self_attn.o_proj.weight": "pytorch_model-
|
| 492 |
-
"model.layers.57.self_attn.q_proj.weight": "pytorch_model-
|
| 493 |
-
"model.layers.57.self_attn.v_proj.weight": "pytorch_model-
|
| 494 |
-
"model.layers.58.input_layernorm.weight": "pytorch_model-
|
| 495 |
-
"model.layers.58.mlp.down_proj.weight": "pytorch_model-
|
| 496 |
-
"model.layers.58.mlp.gate_proj.weight": "pytorch_model-
|
| 497 |
-
"model.layers.58.mlp.up_proj.weight": "pytorch_model-
|
| 498 |
-
"model.layers.58.post_attention_layernorm.weight": "pytorch_model-
|
| 499 |
-
"model.layers.58.self_attn.k_proj.weight": "pytorch_model-
|
| 500 |
-
"model.layers.58.self_attn.o_proj.weight": "pytorch_model-
|
| 501 |
-
"model.layers.58.self_attn.q_proj.weight": "pytorch_model-
|
| 502 |
-
"model.layers.58.self_attn.v_proj.weight": "pytorch_model-
|
| 503 |
-
"model.layers.59.input_layernorm.weight": "pytorch_model-
|
| 504 |
-
"model.layers.59.mlp.down_proj.weight": "pytorch_model-
|
| 505 |
-
"model.layers.59.mlp.gate_proj.weight": "pytorch_model-
|
| 506 |
-
"model.layers.59.mlp.up_proj.weight": "pytorch_model-
|
| 507 |
-
"model.layers.59.post_attention_layernorm.weight": "pytorch_model-
|
| 508 |
-
"model.layers.59.self_attn.k_proj.weight": "pytorch_model-
|
| 509 |
-
"model.layers.59.self_attn.o_proj.weight": "pytorch_model-
|
| 510 |
-
"model.layers.59.self_attn.q_proj.weight": "pytorch_model-
|
| 511 |
-
"model.layers.59.self_attn.v_proj.weight": "pytorch_model-
|
| 512 |
-
"model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-
|
| 513 |
-
"model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 514 |
-
"model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 515 |
-
"model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 516 |
-
"model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 517 |
-
"model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 518 |
-
"model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 519 |
-
"model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 520 |
-
"model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 521 |
-
"model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-
|
| 522 |
-
"model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 523 |
-
"model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 524 |
-
"model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 525 |
-
"model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 526 |
-
"model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 527 |
-
"model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 528 |
-
"model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 529 |
-
"model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 530 |
-
"model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-
|
| 531 |
-
"model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 532 |
-
"model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 533 |
-
"model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 534 |
-
"model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 535 |
-
"model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 536 |
-
"model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 537 |
-
"model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 538 |
-
"model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 539 |
-
"model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-
|
| 540 |
-
"model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-
|
| 541 |
-
"model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-
|
| 542 |
-
"model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-
|
| 543 |
-
"model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-
|
| 544 |
-
"model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-
|
| 545 |
-
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-
|
| 546 |
-
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-
|
| 547 |
-
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-
|
| 548 |
-
"model.norm.weight": "pytorch_model-
|
| 549 |
}
|
| 550 |
}
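For readers comparing the old six-shard layout against the new five-shard layout below: a minimal sketch, assuming only the standard `{"metadata": {...}, "weight_map": {...}}` structure that transformers uses for sharded-checkpoint indexes, which groups tensor names by shard file and prints the per-shard counts. The filename `pytorch_model.bin.index.json` matches this repo; everything else is plain stdlib.

```python
import json
from collections import defaultdict

# Load the sharded-checkpoint index shipped with the model.
with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

# Group tensor names by the shard file that stores them.
per_shard = defaultdict(list)
for tensor_name, shard_file in index["weight_map"].items():
    per_shard[shard_file].append(tensor_name)

# total_size is the byte size of all tensors combined (40177428480 here).
print("total_size:", index["metadata"]["total_size"])
for shard_file in sorted(per_shard):
    print(f"{shard_file}: {len(per_shard[shard_file])} tensors")
```

Running this before and after the change would show the same tensor names redistributed from six `-of-00006.bin` files into five `-of-00005.bin` files, with the total byte size unchanged.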
|
|
|
|
| 3 |
"total_size": 40177428480
|
| 4 |
},
|
| 5 |
"weight_map": {
|
| 6 |
+
"lm_head.weight": "pytorch_model-00005-of-00005.bin",
|
| 7 |
+
"model.embed_tokens.weight": "pytorch_model-00001-of-00005.bin",
|
| 8 |
+
"model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 9 |
+
"model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 10 |
+
"model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 11 |
+
"model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 12 |
+
"model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 13 |
+
"model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 14 |
+
"model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 15 |
+
"model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 16 |
+
"model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 17 |
+
"model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 18 |
+
"model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 19 |
+
"model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 20 |
+
"model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 21 |
+
"model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 22 |
+
"model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 23 |
+
"model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 24 |
+
"model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 25 |
+
"model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 26 |
+
"model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 27 |
+
"model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 28 |
+
"model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 29 |
+
"model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 30 |
+
"model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 31 |
+
"model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 32 |
+
"model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 33 |
+
"model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 34 |
+
"model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 35 |
+
"model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 36 |
+
"model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 37 |
+
"model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 38 |
+
"model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 39 |
+
"model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 40 |
+
"model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 41 |
+
"model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 42 |
+
"model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 43 |
+
"model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 44 |
+
"model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 45 |
+
"model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 46 |
+
"model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 47 |
+
"model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 48 |
+
"model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 49 |
+
"model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 50 |
+
"model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 51 |
+
"model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 52 |
+
"model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 53 |
+
"model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 54 |
+
"model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 55 |
+
"model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 56 |
+
"model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 57 |
+
"model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 58 |
+
"model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 59 |
+
"model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 60 |
+
"model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 61 |
+
"model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 62 |
+
"model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 63 |
+
"model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 64 |
+
"model.layers.14.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 65 |
+
"model.layers.14.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 66 |
+
"model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 67 |
+
"model.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 68 |
+
"model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 69 |
+
"model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 70 |
+
"model.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 71 |
+
"model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 72 |
+
"model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 73 |
+
"model.layers.15.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 74 |
+
"model.layers.15.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 75 |
+
"model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 76 |
+
"model.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 77 |
+
"model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 78 |
+
"model.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 79 |
+
"model.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 80 |
+
"model.layers.16.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 81 |
+
"model.layers.16.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 82 |
+
"model.layers.16.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 83 |
+
"model.layers.16.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 84 |
+
"model.layers.16.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 85 |
+
"model.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 86 |
+
"model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 87 |
+
"model.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 88 |
+
"model.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 89 |
+
"model.layers.17.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 90 |
+
"model.layers.17.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 91 |
+
"model.layers.17.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 92 |
+
"model.layers.17.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 93 |
+
"model.layers.17.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 94 |
+
"model.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 95 |
+
"model.layers.17.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 96 |
+
"model.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 97 |
+
"model.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 98 |
+
"model.layers.18.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 99 |
+
"model.layers.18.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 100 |
+
"model.layers.18.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 101 |
+
"model.layers.18.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 102 |
+
"model.layers.18.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 103 |
+
"model.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 104 |
+
"model.layers.18.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 105 |
+
"model.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 106 |
+
"model.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 107 |
+
"model.layers.19.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 108 |
+
"model.layers.19.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 109 |
+
"model.layers.19.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 110 |
+
"model.layers.19.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 111 |
+
"model.layers.19.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 112 |
+
"model.layers.19.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 113 |
+
"model.layers.19.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 114 |
+
"model.layers.19.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 115 |
+
"model.layers.19.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 116 |
+
"model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 117 |
+
"model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 118 |
+
"model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 119 |
+
"model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 120 |
+
"model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 121 |
+
"model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 122 |
+
"model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 123 |
+
"model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 124 |
+
"model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 125 |
+
"model.layers.20.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 126 |
+
"model.layers.20.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 127 |
+
"model.layers.20.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 128 |
+
"model.layers.20.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 129 |
+
"model.layers.20.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 130 |
+
"model.layers.20.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 131 |
+
"model.layers.20.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 132 |
+
"model.layers.20.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 133 |
+
"model.layers.20.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 134 |
+
"model.layers.21.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 135 |
+
"model.layers.21.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 136 |
+
"model.layers.21.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 137 |
+
"model.layers.21.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 138 |
+
"model.layers.21.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 139 |
+
"model.layers.21.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 140 |
+
"model.layers.21.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 141 |
+
"model.layers.21.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 142 |
+
"model.layers.21.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 143 |
+
"model.layers.22.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 144 |
+
"model.layers.22.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 145 |
+
"model.layers.22.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 146 |
+
"model.layers.22.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 147 |
+
"model.layers.22.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 148 |
+
"model.layers.22.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 149 |
+
"model.layers.22.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 150 |
+
"model.layers.22.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 151 |
+
"model.layers.22.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 152 |
+
"model.layers.23.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 153 |
+
"model.layers.23.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 154 |
+
"model.layers.23.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 155 |
+
"model.layers.23.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 156 |
+
"model.layers.23.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 157 |
+
"model.layers.23.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 158 |
+
"model.layers.23.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 159 |
+
"model.layers.23.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 160 |
+
"model.layers.23.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 161 |
+
"model.layers.24.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 162 |
+
"model.layers.24.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 163 |
+
"model.layers.24.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 164 |
+
"model.layers.24.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 165 |
+
"model.layers.24.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 166 |
+
"model.layers.24.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 167 |
+
"model.layers.24.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 168 |
+
"model.layers.24.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 169 |
+
"model.layers.24.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 170 |
+
"model.layers.25.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 171 |
+
"model.layers.25.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 172 |
+
"model.layers.25.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 173 |
+
"model.layers.25.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 174 |
+
"model.layers.25.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 175 |
+
"model.layers.25.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 176 |
+
"model.layers.25.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 177 |
+
"model.layers.25.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 178 |
+
"model.layers.25.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 179 |
+
"model.layers.26.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 180 |
+
"model.layers.26.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 181 |
+
"model.layers.26.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 182 |
+
"model.layers.26.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 183 |
+
"model.layers.26.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 184 |
+
"model.layers.26.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 185 |
+
"model.layers.26.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 186 |
+
"model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 187 |
+
"model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 188 |
+
"model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 189 |
+
"model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 190 |
+
"model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 191 |
+
"model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 192 |
+
"model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 193 |
+
"model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 194 |
+
"model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 195 |
+
"model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 196 |
+
"model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 197 |
+
"model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 198 |
+
"model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 199 |
+
"model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 200 |
+
"model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 201 |
+
"model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00005.bin",
|
| 202 |
+
"model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 203 |
+
"model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 204 |
+
"model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 205 |
+
"model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 206 |
+
"model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
|
| 207 |
+
"model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 208 |
+
"model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 209 |
+
"model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 210 |
+
"model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
|
| 211 |
+
"model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 212 |
+
"model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 213 |
+
"model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 214 |
+
"model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00005.bin",
|
| 215 |
+
"model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 216 |
+
"model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 217 |
+
"model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 218 |
+
"model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 219 |
+
"model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
|
| 220 |
+
"model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 221 |
+
"model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 222 |
+
"model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 223 |
+
"model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
|
| 224 |
+
"model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
|
| 225 |
+
"model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 226 |
+
"model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 227 |
+
"model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 228 |
+
"model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
|
| 229 |
+
"model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 230 |
+
"model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 231 |
+
"model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 232 |
+
"model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
|
| 233 |
+
"model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.31.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.32.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.33.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.34.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.35.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.36.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.37.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.38.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.39.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.40.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.40.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.41.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.42.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.43.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.input_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.mlp.down_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.mlp.gate_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.mlp.up_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.post_attention_layernorm.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.44.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.45.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.45.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.45.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.45.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.45.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.45.self_attn.k_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.45.self_attn.o_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.45.self_attn.q_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.45.self_attn.v_proj.weight": "pytorch_model-00003-of-00005.bin",
"model.layers.46.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.46.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.47.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.48.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.49.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.50.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.50.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.51.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.52.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.53.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.54.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.55.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.56.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.57.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.58.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.input_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.mlp.down_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.mlp.gate_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.mlp.up_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.post_attention_layernorm.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.self_attn.k_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.self_attn.o_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.self_attn.q_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.59.self_attn.v_proj.weight": "pytorch_model-00004-of-00005.bin",
"model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00005.bin",
"model.norm.weight": "pytorch_model-00004-of-00005.bin"
}
}
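The weight map above is what lets `transformers` resolve each parameter to exactly one of the five re-sharded files, so a single tensor can be read without loading the whole checkpoint. Note that a layer can straddle a shard boundary: layer 45's layernorm and MLP weights live in shard 4 while its attention projections stayed in shard 3. Below is a minimal sketch of consuming the index by hand; the checkpoint directory path is a placeholder for a local checkout of this repo, and loading a full shard with `torch.load` mirrors what `transformers` does internally when it reads `pytorch_model.bin.index.json`.

import json
import torch

CKPT_DIR = "./internlm-checkpoint"  # placeholder path, not part of this repo

# The index maps every parameter name to the shard file that stores it.
with open(f"{CKPT_DIR}/pytorch_model.bin.index.json") as f:
    weight_map = json.load(f)["weight_map"]

# Layer 45 straddles shards: attention projections are in shard 3,
# layernorms and MLP in shard 4 (see the entries above).
name = "model.layers.45.self_attn.k_proj.weight"
shard_file = weight_map[name]  # -> "pytorch_model-00003-of-00005.bin"

# Load only that shard and pull out the one tensor we need.
state_dict = torch.load(f"{CKPT_DIR}/{shard_file}", map_location="cpu")
print(name, tuple(state_dict[name].shape))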
tokenization_internlm.py
CHANGED

@@ -1,7 +1,10 @@
 # coding=utf-8
-# Copyright
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +18,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""Tokenization classes for
+"""Tokenization classes for IntermLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
@@ -32,7 +35,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}

 PRETRAINED_VOCAB_FILES_MAP = {}

-
+
 class InternLMTokenizer(PreTrainedTokenizer):
     """
     Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
@@ -78,6 +81,8 @@ class InternLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )

+        """ Initialization"""
+
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:
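Because this tokenizer class ships inside the repository rather than in `transformers` itself, it is loaded through the remote-code path. A minimal usage sketch follows; the repository id is an assumption for illustration, so substitute the id of this model card.

from transformers import AutoTokenizer

# trust_remote_code=True makes transformers use the tokenization_internlm.py
# shown above; the repo id here is a hypothetical placeholder.
tokenizer = AutoTokenizer.from_pretrained(
    "internlm/internlm-chat-20b", trust_remote_code=True
)

ids = tokenizer("Hello, InternLM!").input_ids
print(ids)
print(tokenizer.decode(ids))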