Upload model

Browse files

Files changed (5) hide show

BranchyModel.py +224 -233
model-00001-of-00003.safetensors +2 -2
model-00002-of-00003.safetensors +2 -2
model-00003-of-00003.safetensors +2 -2
model.safetensors.index.json +469 -469

BranchyModel.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import torch
 import logging
 import torch.nn as nn
@@ -13,7 +15,10 @@ from transformers import AutoModelForCausalLM, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import ModelOutput
 from transformers.cache_utils import Cache, DynamicCache
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -85,236 +90,55 @@ class Branch(nn.Module):
         return x
-class BranchyModel(PreTrainedModel):
-    """
-    A wrapper class for transformer causal models, introducing branch functionality to enable conditional computation and
-    reduce computational load by selectively processing parts of the input through different branches.
-    The BranchyModel class allows for the addition of branches at specified layers within the transformer model. Each branch
-    can output predictions independently, enabling early exits or auxiliary tasks. This class supports different loss
-    functions for training these branches in a self-supervised manner, with optional penalties to encourage diversity
-    or reduce complexity in the branches' outputs.
-    Parameters:
-        config (BranchyModelConfig): Configuration class for BranchyModel. It contains all necessary parameters for
-            the model's architecture, branching locations, loss types, etc.
-        model (PreTrainedModel): The underlying transformer model around which the BranchyModel is built. This model
-            should be an instance of a class derived from `transformers.PreTrainedModel`.
-    Attributes:
-        model (PreTrainedModel): The underlying transformer model provided during initialization.
-        branch_locations (List[int]): Indices indicating the transformer layers after which branches are added.
-        penalty_weight (Optional[float]): The weight of the penalty term in the "penalized_cross_entropy" loss. This
-            argument must be provided and greater than 0 if "penalized_cross_entropy" is used.
-        window_size (int): The size of the token window that each branch processes. This allows branches to only
-            consider a subset of the most recent tokens, reducing the computational requirements.
-    Examples:
-        config = BranchyModelConfig(
-            branch_locations=[2, 4, 7],
-            window_size=256
-        )
-        underlying_model = AutoModelForCausalLM.from_pretrained('gpt2')
-        branchy_model = BranchyModel(config, underlying_model)
-        # For inference
-        inputs = tokenizer("Example input text", return_tensors="pt")
-        outputs = branchy_model(**inputs, fixed_output_head=2)  # Use the output from the branch after the 2nd layer
-        # For training with self-supervision
-        branchy_model.train()
-        outputs = branchy_model(**inputs, self_supervision=True)
-    Note:
-        This class is designed to work seamlessly with the Hugging Face Transformers library. It requires a model
-        configuration (`BranchyModelConfig`) that extends the base configuration class from the Transformers library.
     """
     config_class = BranchyModelConfig
     def __init__(self,
                  config: BranchyModelConfig):
-        """
-        Initializes the BranchyModel.
-        Precisely: Get the number of layers in the underlying model, check that specified branch locations are within the range of the model's layers, and initialize branches at specified locations.
-        Args:
-            config (BranchyModelConfig): Configuration object for the branchy model, containing settings such as
-                branch locations, loss types, and window sizes.
-            model (PreTrainedModel): The underlying transformer model to which branching functionality will be added.
-        """
         super().__init__(config)
         self.model = AutoModelForCausalLM.from_pretrained(config.model_str)
-        # Get the number of layers in the underlying model
-        if hasattr(self.model.config, "n_layer") or hasattr(
-            self.model.config, "num_hidden_layers"
-        ):  # If there is no n_layer in the config, there might be ways to get it from the model itself
             self.num_layers = (
                 self.model.config.n_layer
                 if hasattr(self.model.config, "n_layer")
                 else self.model.config.num_hidden_layers
             )
             assert self.num_layers is not None and self.num_layers > 0, "n_layer must be a positive integer."
-            logger.debug(f"Number of layers in the model: {self.num_layers}")
         else:
             raise ValueError("cannot find n_layer in config")
-        assert config.branch_number > 0 and config.branch_number < self.num_layers, "branch_number must be a positive integer less than the number of layers in the model."
         # If we provide only the number of branches, we will distribute them evenly across the model
         if config.branch_locations is None:
             interval = self.num_layers // (config.branch_number + 1)
             config.branch_locations = [i * interval for i in range(1, config.branch_number+1)]
         # Check that specified branch locations are within the range of the model's layers
         if any([loc >= self.num_layers for loc in config.branch_locations]):
             raise ValueError("Branch location exceeds the number of layers in the model.")
-        # Ensure the model's parameters are frozen
-        for param in self.model.parameters():
-            param.requires_grad = False
-        # Initialize branches at specified locations
         self.branches = torch.nn.ModuleList()
-        # if copy_lm_head  is True, we copy the last lm_head of the model instead of initializing new ones
         if config.copy_lm_head:
             logger.info("Fine-tuning branches")
             for branch in config.branch_locations:
-                self.branches.append(copy.deepcopy(self.model.lm_head))
         else:
             for _ in config.branch_locations:
                 new_branch = Branch(self.model.config)
                 new_branch.apply(self.model._init_weights)
                 self.branches.append(new_branch)
-        for param in self.branches.parameters():
-            param.requires_grad = True
-        self.post_init()
-    def get_num_params(self,
-                       return_dict: bool = True):
-        """
-        Get the number of parameters in the model.
-        Args:
-            return_dict (bool): Whether to return the number of parameters in a dictionary format. Defaults to True.
-        Returns:
-            int: The number of parameters in the model.
-        """
-        num_params = sum(p.numel() for p in self.parameters())
-        if return_dict:
-            return {"backbone": sum(p.numel() for p in self.model.parameters()), "branches": sum(p.numel() for p in self.branches.parameters()), "total": num_params}
-        return num_params
-    def forward(self,
-                input_ids: torch.LongTensor = None,
-                attention_mask: Optional[torch.Tensor] = None,
-                position_ids: Optional[torch.LongTensor] = None,
-                past_key_values: Optional[List[torch.FloatTensor]] = None,
-                inputs_embeds: Optional[torch.FloatTensor] = None,
-                labels: Optional[torch.LongTensor] = None,
-                use_cache: Optional[bool] = None,
-                output_attentions: Optional[bool] = None,
-                output_hidden_states: Optional[bool] = None,
-                return_dict: Optional[bool] = None,
-                head_window_size: Optional[int] = None,
-                ):
-        output_hidden_states = True
-        if labels is not None:
-            raise NotImplementedError("BranchyLLM only supports self-supervision")
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        if not hasattr(outputs, "hidden_states") or outputs.hidden_states is None:
-            raise ValueError("The model must return hidden states")
-        heads_logits = []
-        for i, branch in enumerate(self.config.branch_locations):
-            if head_window_size is not None:
-                current_hidden_state = outputs.hidden_states[branch, :, -head_window_size:, :]
-            else:
-                current_hidden_state = outputs.hidden_states[branch]
-            heads_logits.append(self.branches[i](current_hidden_state))
-        heads_logits = torch.stack(heads_logits, dim=0)
-        losses_dict = self.compute_self_supervision_loss(
-            heads_logits, outputs.logits
-        )
-        return CausalBranchyLLMOutputWithPast(
-            loss=losses_dict["loss"],
-            head_loss=losses_dict["head_losses"],
-            entropy=losses_dict["entropy"],
-            entropies=losses_dict["entropies"],
-            logits=outputs.logits, # shape (batch_size, seq_len, vocab_size)
-            head_logits=heads_logits, # shape (num_branches, batch_size, seq_len, vocab_size)
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-    def compute_self_supervision_loss(self,
-                                      aux_logits: torch.Tensor,
-                                      lm_logits: torch.Tensor,
-                                      ) -> Dict[str, torch.Tensor]:
-        last_aux_logits = aux_logits[..., -1, :]
-        last_lm_logits = lm_logits[..., -1, :]
-        losses = []
-        entropies = []
-        # Can be useful to have detailed loss per head for comparison of performance
-        for head_logit in last_aux_logits:
-            ce_loss = nn.CrossEntropyLoss(reduction="mean")(
-                head_logit, torch.argmax(last_lm_logits, dim=-1)
-            )
-            probas = F.softmax(head_logit, dim=-1)
-            log_probas = torch.log(probas + 1e-8)
-            assert not torch.isnan(log_probas).any(), "NaNs found in log_probas"
-            entropy = -torch.sum(probas * log_probas, dim=-1)
-            assert not torch.isnan(entropy).any(), "NaNs found in entropy before mean"
-            entropy = torch.mean(entropy)
-            entropies.append(entropy)
-            losses.append((1 - self.config.penalty_weight) * ce_loss - self.config.penalty_weight * entropy)
-        loss = torch.stack(losses, dim=0).mean(dim=-1) # TODO does it change training dynamics between mean and sum?
-        entropy = torch.stack(entropies, dim=0).mean(dim=-1)
-        return {"loss": loss,
-                "head_losses": torch.stack(losses, dim=0),
-                "entropies": torch.stack(entropies, dim=0),
-                "entropy": entropy
-                }
-class BranchyCausalModel(PreTrainedModel):
-    """A class for Causal branchy Model, this one integrate the early exit mechanism and only output one logit on each step as a conventional model.
-    """
-    config_class = BranchyModelConfig
-    def __init__(self,
-                 config: BranchyModelConfig):
-        super().__init__(config)
-        self.model = BranchyModel(config)
-        self.head_thresholds = torch.tensor(config.head_thresholds)
-        if config.confidence_metric == "breaking_ties":
-            self.confidence_metric_fn = breaking_ties
-        elif config.confidence_metric == "max":
-            self.confidence_metric_fn = lambda x: torch.max(x, dim=-1).values
-        else:
-            raise ValueError("confidence_metric must be 'breaking_ties' or 'max'.")
         self.post_init()
     def to(self, *args, **kwargs):
@@ -368,7 +192,7 @@ class BranchyCausalModel(PreTrainedModel):
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids}
         model_inputs.update(
             {
                 "position_ids": position_ids,
@@ -377,55 +201,218 @@ class BranchyCausalModel(PreTrainedModel):
                 "attention_mask": attention_mask,
             }
         )
         return model_inputs
     def forward(self,
                 input_ids: torch.LongTensor = None,
                 attention_mask: Optional[torch.Tensor] = None,
                 position_ids: Optional[torch.LongTensor] = None,
                 past_key_values: Optional[List[torch.FloatTensor]] = None,
                 inputs_embeds: Optional[torch.FloatTensor] = None,
-                labels: Optional[torch.LongTensor] = None,
                 use_cache: Optional[bool] = None,
                 output_attentions: Optional[bool] = None,
                 output_hidden_states: Optional[bool] = None,
                 return_dict: Optional[bool] = None,
                 head_window_size: Optional[int] = None,
                 ):
-        # TODO Only POC, actual early exit implementation should unwrap the self.model call, which means specific integration for each supported model
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            head_window_size=head_window_size
-        )
-        end_logits = None
-        scores = self.confidence_metric_fn(outputs.head_logits)[..., -1] # shape [branches, batch]
-        is_early_exited = self.head_thresholds[:, None] < scores # shape [branches, batch]
-        is_early_exited = F.pad(is_early_exited, (0, 0, 0, 1), value=True) # shape [branches+1, batch] -> Adds a row of True at the bottom. i.e the last head is right
-        head_indices = torch.argmax(is_early_exited.int(), dim=0) # shape [batch]
-        full_logits = torch.cat([outputs.head_logits, outputs.logits.unsqueeze(0)], dim=0) # shape [branches+1, batch, seq_len, vocab_size]
-        #logger.info(full_logits[:,:,-1,0])
-        end_logits = full_logits[head_indices, torch.arange(full_logits.shape[1]), :, :] # shape [batch, seq, vocab_size]
-        #logger.info(full_logits[head_indices, torch.arange(full_logits.shape[1]), -1, 0])
-        logger.debug(f"Batch early exit heads : {head_indices}")
-        return CausalLMOutputWithPastAndHead(
-            loss=outputs.loss,
-            logits=end_logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            head_indices=head_indices
         )
 @dataclass
@@ -439,7 +426,11 @@ class CausalBranchyLLMOutputWithPast(ModelOutput):
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
-class CausalLMOutputWithPastAndHead(CausalLMOutputWithPast):
-    head_indices: Optional[torch.Tensor] = None

+from collections import OrderedDict
+from hamcrest import is_
 import torch
 import logging
 import torch.nn as nn
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import ModelOutput
 from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
         return x
+class BranchyCausalModel(PreTrainedModel):
+    """Causal branchy model: integrates the early-exit mechanism and outputs a single set of logits at each step, like a conventional model.
     """
     config_class = BranchyModelConfig
     def __init__(self,
                  config: BranchyModelConfig):
+        """Build the early-exit model described by ``config``.
+
+        Loads the causal LM named by ``config.model_str``, keeps its ``lm_head``
+        and its inner ``.model`` as separate attributes, and creates one branch
+        head per entry of ``config.branch_locations``.
+
+        Args:
+            config: Read here for ``model_str``, ``head_thresholds``,
+                ``branch_number``, ``branch_locations`` and ``copy_lm_head``.
+                ``branch_locations`` is written back onto ``config`` when it is
+                ``None``.
+
+        Raises:
+            ValueError: If the loaded model's config exposes neither ``n_layer``
+                nor ``num_hidden_layers``, or if a branch location is not
+                strictly below the number of layers.
+            AssertionError: If the layer count is not positive or
+                ``config.branch_number`` is not below it.
+        """
         super().__init__(config)
         self.model = AutoModelForCausalLM.from_pretrained(config.model_str)
+        # Grab the head and vocab size first: `self.model` is rebound to the inner
+        # `.model` attribute two lines below, after which they are no longer reachable
+        # through `self.model`.
+        self.lm_head = self.model.lm_head
+        self.vocab_size = self.model.vocab_size
+        self.model = self.model.model
+        self.head_thresholds = torch.tensor(config.head_thresholds)
+        # The confidence metric is fixed to `breaking_ties` here; it is not read from config.
+        self.confidence_metric_fn = breaking_ties
+        # Get number of layer from main model
+        if hasattr(self.model.config, "n_layer") or hasattr(self.model.config, "num_hidden_layers"):
             self.num_layers = (
                 self.model.config.n_layer
                 if hasattr(self.model.config, "n_layer")
                 else self.model.config.num_hidden_layers
             )
             assert self.num_layers is not None and self.num_layers > 0, "n_layer must be a positive integer."
         else:
             raise ValueError("cannot find n_layer in config")
+        # NOTE(review): only the upper bound is checked; the "positive" part of the
+        # message is not enforced by this condition.
+        assert config.branch_number < self.num_layers , "branch_number must be a positive integer less than the number of layers in the model."
         # If we provide only the number of branches, we will distribute them evenly across the model
         if config.branch_locations is None:
             interval = self.num_layers // (config.branch_number + 1)
             config.branch_locations = [i * interval for i in range(1, config.branch_number+1)]
         # Check that specified branch locations are within the range of the model's layers
         if any([loc >= self.num_layers for loc in config.branch_locations]):
             raise ValueError("Branch location exceeds the number of layers in the model.")
         self.branches = torch.nn.ModuleList()
         if config.copy_lm_head:
             logger.info("Fine-tuning branches")
             for branch in config.branch_locations:
+                # Each branch starts as an independent copy of the final LM head.
+                self.branches.append(copy.deepcopy(self.lm_head))
         else:
             for _ in config.branch_locations:
                 new_branch = Branch(self.model.config)
                 new_branch.apply(self.model._init_weights)
                 self.branches.append(new_branch)
+        self.gradient_checkpointing = False
         self.post_init()
     def to(self, *args, **kwargs):
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids}
         model_inputs.update(
             {
                 "position_ids": position_ids,
                 "attention_mask": attention_mask,
             }
         )
         return model_inputs
+    def model_pre_forward(self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """Prepare the inputs consumed by the decoder-layer loop in ``forward``.
+
+        Resolves unset flags from ``self.model.config``, validates that exactly
+        one of ``input_ids`` / ``inputs_embeds`` is given, sets up the cache and
+        position ids, embeds the tokens, and builds the attention mask in the
+        form expected by the active attention implementation.
+
+        Returns:
+            A 9-tuple, in this order: ``inputs_embeds``, ``use_legacy_cache``,
+            ``attention_mask``, ``position_ids``, ``past_key_values``,
+            ``use_cache``, ``output_attentions``, ``output_hidden_states``,
+            ``return_dict``.
+
+        Raises:
+            ValueError: If both or neither of ``input_ids`` and
+                ``inputs_embeds`` are provided.
+        """
+        # Any flag left as None falls back to the wrapped model's config.
+        output_attentions = output_attentions if output_attentions is not None else self.model.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.model.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.model.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.model.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        past_key_values_length = 0
+        if self.model.gradient_checkpointing and self.model.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # Stays None when caching is off; otherwise records whether the caller passed a
+        # non-`Cache` object that was wrapped into a `DynamicCache` below.
+        use_legacy_cache = None
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+        if position_ids is None:
+            # Default positions continue after whatever is already cached.
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+        if inputs_embeds is None:
+            inputs_embeds = self.model.embed_tokens(input_ids)
+        # Dropout is applied to caller-supplied embeddings as well as looked-up ones.
+        inputs_embeds = self.model.embed_dropout(inputs_embeds)
+        # Attention mask.
+        if self.model._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self.model._use_sdpa and not output_attentions:
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+        return inputs_embeds, use_legacy_cache, attention_mask, position_ids, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict
     def forward(self,
                 input_ids: torch.LongTensor = None,
                 attention_mask: Optional[torch.Tensor] = None,
                 position_ids: Optional[torch.LongTensor] = None,
                 past_key_values: Optional[List[torch.FloatTensor]] = None,
                 inputs_embeds: Optional[torch.FloatTensor] = None,
                 use_cache: Optional[bool] = None,
                 output_attentions: Optional[bool] = None,
                 output_hidden_states: Optional[bool] = None,
                 return_dict: Optional[bool] = None,
                 head_window_size: Optional[int] = None,
                 ):
+        """Run the decoder layer by layer, evaluating branch heads on the way.
+
+        In eval mode the loop stops at the first branch whose confidence score
+        exceeds its threshold, and that branch's logits are returned. In
+        training mode no early exit happens: every branch's logits are
+        collected and the self-supervision loss is computed against the final
+        LM-head logits.
+
+        Args:
+            use_cache: Currently ignored; caching is forced off.
+            head_window_size: Accepted for interface compatibility; not used here.
+            (remaining arguments are forwarded to ``model_pre_forward``)
+
+        Returns:
+            CausalBranchyLLMOutputWithPast. The loss fields are ``None`` outside
+            training. ``head_indices`` is the index of the last decoder layer
+            that was executed.
+
+        Raises:
+            NotImplementedError: If ``return_dict`` resolves to a false value.
+        """
+        use_cache = False # Disable it for now TODO Update how cache is handled to allow early exits
+        inputs_embeds, use_legacy_cache, attention_mask, position_ids, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict = self.model_pre_forward(input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_logits = ()
+        is_early_exited = False
+        next_decoder_cache = None
+        for layer, decoder_layer in enumerate(self.model.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.model.gradient_checkpointing and self.model.training:
+                # The checkpoint wrapper returns the layer's own output tuple, so it
+                # must not be unpacked into two names.
+                layer_outputs = self.model._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+            # Branch heads are evaluated on both code paths, so training with gradient
+            # checkpointing still collects the head logits needed for the loss.
+            if layer in self.config.branch_locations:
+                branch_idx = self.config.branch_locations.index(layer)
+                logits = self.branches[branch_idx](hidden_states)
+                if not self.training:
+                    # During inference, calculate score on the fly to decide if we should early exit
+                    score = self.confidence_metric_fn(logits)[..., -1] # score for the classified token TODO might be interesting to take score from whole vector ?
+                    # Reduce over the batch: exit only when every sequence clears the
+                    # threshold (same result as before for a single-sequence batch).
+                    if torch.all(score > self.head_thresholds[branch_idx]):
+                        is_early_exited = True
+                        logger.debug(f"Early exit at layer {layer} with score {score}")
+                        break
+                else:
+                    # if in training we return full logits
+                    all_logits += (logits,)
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        if not is_early_exited:
+            logger.debug(f"No early exit")
+            hidden_states = self.model.final_layernorm(hidden_states)
+            logits = self.lm_head(hidden_states)
+            logits = logits.float()
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        loss = head_loss = entropies = entropy = None
+        if self.training:
+            # The teacher signal is the final LM-head logits (never an early exit in
+            # training), not the hidden states.
+            loss_output = self.compute_self_supervision_loss(
+                torch.stack(all_logits), logits
+            )
+            # Read fields by name: positional order is loss, head_losses, entropy, entropies.
+            loss = loss_output.loss
+            head_loss = loss_output.head_losses
+            entropies = loss_output.entropies
+            entropy = loss_output.entropy
+        if not return_dict:
+            raise NotImplementedError("return_dict=False is not implemented")
+        return CausalBranchyLLMOutputWithPast(
+            loss=loss,
+            head_loss=head_loss,
+            entropies=entropies,
+            entropy=entropy,
+            logits=logits, # shape (batch_size, seq_len, vocab_size)
+            head_logits=all_logits, # tuple of per-branch logits; empty outside training
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            head_indices=layer,
+        )
+    def compute_self_supervision_loss(self,
+                                      aux_logits: torch.Tensor,
+                                      lm_logits: torch.Tensor,
+                                      return_dict: bool = True
+                                      ) -> Dict[str, torch.Tensor]:
+        """Compute the per-branch self-supervision loss on the last position.
+
+        For each branch (iterating the first dimension of ``aux_logits``), the
+        target is the argmax of ``lm_logits`` at the last sequence position.
+        The branch loss is
+        ``(1 - penalty_weight) * cross_entropy - penalty_weight * entropy``,
+        where ``entropy`` is the mean entropy of the branch's softmax output
+        and ``penalty_weight`` comes from ``self.config``.
+
+        Args:
+            aux_logits: Stacked branch logits; the first dimension indexes the
+                branches, the second-to-last the sequence position.
+            lm_logits: Reference logits whose last-dimension argmax is the
+                target class (presumably the final LM-head output; its last
+                dimension must match the branches' class dimension).
+            return_dict: When false, return a plain tuple
+                ``(loss, losses, entropy, entropies)`` instead.
+
+        Returns:
+            ``SelfSupervisedLossOutput`` (or the tuple above). ``loss`` and
+            ``entropy`` are means over branches; ``head_losses`` and
+            ``entropies`` are tuples with one scalar tensor per branch.
+            NOTE(review): the ``Dict[str, torch.Tensor]`` annotation does not
+            match either returned type.
+        """
+        # Only the last sequence position contributes to the loss.
+        last_aux_logits = aux_logits[..., -1, :]
+        last_lm_logits = lm_logits[..., -1, :]
+        losses = ()
+        entropies = ()
+        # Can be useful to have detailed loss per head for comparison of performance
+        for head_logit in last_aux_logits:
+            ce_loss = nn.CrossEntropyLoss(reduction="mean")(
+                head_logit, torch.argmax(last_lm_logits, dim=-1)
+            )
+            probas = F.softmax(head_logit, dim=-1)
+            # The small epsilon keeps log() finite when a probability underflows to 0.
+            log_probas = torch.log(probas + 1e-8)
+            assert not torch.isnan(log_probas).any(), "NaNs found in log_probas"
+            entropy = -torch.sum(probas * log_probas, dim=-1)
+            assert not torch.isnan(entropy).any(), "NaNs found in entropy before mean"
+            entropy = torch.mean(entropy)
+            entropies += (entropy,)
+            losses += ((1 - self.config.penalty_weight) * ce_loss - self.config.penalty_weight * entropy,)
+        # Aggregate across branches.
+        loss = torch.stack(losses, dim=0).mean(dim=-1)
+        entropy = torch.stack(entropies, dim=0).mean(dim=-1)
+        if not return_dict:
+            return tuple(v for v in (loss, losses, entropy, entropies) if v is not None)
+        return SelfSupervisedLossOutput(
+                loss=loss,
+                head_losses= losses,
+                entropies= entropies,
+                entropy= entropy
         )
 @dataclass
     past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
+    head_indices: Optional[torch.Tensor] = None
 @dataclass
+class SelfSupervisedLossOutput(ModelOutput):
+    """Container for the values returned by ``compute_self_supervision_loss``.
+
+    The declaration order below (loss, head_losses, entropy, entropies) is the
+    order seen by positional/integer indexing, so prefer access by field name.
+    """
+    # Mean of the per-branch losses.
+    loss: torch.Tensor = None
+    # One loss per branch; the producer passes a tuple of scalar tensors here.
+    head_losses: torch.Tensor = None
+    # Mean of the per-branch entropies.
+    entropy: torch.Tensor = None
+    # One entropy per branch; the producer passes a tuple of scalar tensors here.
+    entropies: torch.Tensor = None

model-00001-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cff2a27adc1e9a8965a31a8406a6bee8df4ea5bdf2df018a460218abba1ac64d
-size 4982357920

 version https://git-lfs.github.com/spec/v1
+oid sha256:de9690424cd10d30cc5bbbf31b5ba7149fe2d1b4d1c9b3e28378c37496dfddcc
+size 4982355512

model-00002-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2b7879268f3bd6382559c9cbbb7d7252381329e068804ec6ebc6d21ee2995e5b
-size 4982544624

 version https://git-lfs.github.com/spec/v1
+oid sha256:526f616bb5753775548b200b2d5afffa862bf3a17cf53c004b1ba8d702fb5890
+size 4982541984

model-00003-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3adcc60cfcc21b897661ad4e89202f22c88b47682f665c8aa664cb0f6a044edc
-size 3251942824

 version https://git-lfs.github.com/spec/v1
+oid sha256:963120cc9ecfbdd250e505d8a33ef881aa1cc393b06fe3bc9a7b7be286c3c242
+size 3251942344

model.safetensors.index.json CHANGED Viewed

@@ -3,474 +3,474 @@
     "total_size": 13216788480
   },
   "weight_map": {
-    "model.branches.0.layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.branches.0.layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.branches.0.lm_head.bias": "model-00003-of-00003.safetensors",
-    "model.branches.0.lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.branches.1.layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.branches.1.layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.branches.1.lm_head.bias": "model-00003-of-00003.safetensors",
-    "model.branches.1.lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.branches.2.layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.branches.2.layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.branches.2.lm_head.bias": "model-00003-of-00003.safetensors",
-    "model.branches.2.lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.branches.3.layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.branches.3.layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.branches.3.lm_head.bias": "model-00003-of-00003.safetensors",
-    "model.branches.3.lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.model.lm_head.bias": "model-00003-of-00003.safetensors",
-    "model.model.lm_head.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.embed_tokens.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.final_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.final_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
-    "model.model.model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
-    "model.model.model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
-    "model.model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors"
   }
 }

     "total_size": 13216788480
   },
   "weight_map": {
+    "branches.0.layernorm.bias": "model-00003-of-00003.safetensors",
+    "branches.0.layernorm.weight": "model-00003-of-00003.safetensors",
+    "branches.0.lm_head.bias": "model-00003-of-00003.safetensors",
+    "branches.0.lm_head.weight": "model-00003-of-00003.safetensors",
+    "branches.1.layernorm.bias": "model-00003-of-00003.safetensors",
+    "branches.1.layernorm.weight": "model-00003-of-00003.safetensors",
+    "branches.1.lm_head.bias": "model-00003-of-00003.safetensors",
+    "branches.1.lm_head.weight": "model-00003-of-00003.safetensors",
+    "branches.2.layernorm.bias": "model-00003-of-00003.safetensors",
+    "branches.2.layernorm.weight": "model-00003-of-00003.safetensors",
+    "branches.2.lm_head.bias": "model-00003-of-00003.safetensors",
+    "branches.2.lm_head.weight": "model-00003-of-00003.safetensors",
+    "branches.3.layernorm.bias": "model-00003-of-00003.safetensors",
+    "branches.3.layernorm.weight": "model-00003-of-00003.safetensors",
+    "branches.3.lm_head.bias": "model-00003-of-00003.safetensors",
+    "branches.3.lm_head.weight": "model-00003-of-00003.safetensors",
+    "lm_head.bias": "model-00003-of-00003.safetensors",
+    "lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.final_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.final_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors"
   }
 }