Upload model

Browse files

Files changed (8) hide show

BranchyModel.py +381 -0
BranchyModelConfig.py +78 -0
README.md +199 -0
config.json +31 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +476 -0

BranchyModel.py ADDED Viewed

	@@ -0,0 +1,381 @@

+import torch
+import logging
+import torch.nn as nn
+import torch.nn.functional as F
+import copy
+from dataclasses import dataclass
+from torch import Tensor
+from .BranchyModelConfig import BranchyModelConfig
+from typing import List, Optional, Dict, Tuple
+from transformers import AutoModelForCausalLM, PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.utils import ModelOutput
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
def breaking_ties(tensor: torch.Tensor):
    """
    Compute the "breaking ties" confidence score of a tensor of scores.

    The score is the margin between the largest and the second largest value along the last
    dimension. A large margin means the best candidate clearly dominates the runner-up; a margin
    of 0 means the two best candidates are tied.

    Args:
        tensor (torch.Tensor): Scores (e.g. logits). shape [..., vocab_size]; the last dimension
            must hold at least two elements, otherwise `torch.topk` raises.
    Returns:
        torch.Tensor: top-1 value minus top-2 value. shape [...]
    Example:
    Input : Tensor of shape [head_number, batch, seq_len, vocab_size]
    Output: Tensor of shape [head_number, batch, seq_len]
    """
    # A single top-k pass is enough: the values come back sorted in descending order.
    top_two = torch.topk(tensor, 2, dim=-1).values
    return top_two[..., 0] - top_two[..., 1]
class Branch(nn.Module):
    """
    Auxiliary output head used by BranchyModel.

    A branch takes the hidden states produced at some depth of a transformer and turns them into
    vocabulary logits, so that a prediction can be read before the full network has run
    (early exit) or used as an auxiliary training signal.

    Attributes:
        layernorm (torch.nn.LayerNorm): Layer normalization applied to the incoming hidden states.
        lm_head (torch.nn.Linear): Linear projection (with bias) from `hidden_size` to `vocab_size`.
    Example Usage:
        # `config` exposes `hidden_size` and `vocab_size` (and, optionally, a layer-norm epsilon).
        branch = Branch(config)
        # `x` is the output of a transformer layer, shaped [batch_size, seq_length, hidden_size]
        output_logits = branch(x)
    """

    # Attribute names under which configurations store the normalization epsilon, tried in order.
    _EPS_ATTRIBUTES = ("layer_norm_eps", "layer_norm_epsilon", "rms_norm_eps")
    # Fallback epsilon, identical to the default of torch.nn.LayerNorm.
    _DEFAULT_EPS = 1e-5

    def __init__(self, config):
        """
        Initializes the Branch module.
        Args:
            config: Configuration of the underlying transformer (this is what BranchyModel passes
                in). It must expose `hidden_size` and `vocab_size`. The layer-norm epsilon is read
                from `layer_norm_eps` when present, otherwise from `layer_norm_epsilon` or
                `rms_norm_eps`, and finally falls back to 1e-5.
        """
        super().__init__()
        self.layernorm: nn.LayerNorm = nn.LayerNorm(config.hidden_size, eps=self._resolve_eps(config))
        self.lm_head: nn.Linear = nn.Linear(config.hidden_size, config.vocab_size, bias=True)

    @classmethod
    def _resolve_eps(cls, config) -> float:
        """Return the normalization epsilon declared by `config`, or the LayerNorm default."""
        for attribute in cls._EPS_ATTRIBUTES:
            eps = getattr(config, attribute, None)
            if eps is not None:
                return eps
        return cls._DEFAULT_EPS

    def forward(self, x: Tensor) -> Tensor:
        """
        Forward pass through the Branch module.
        Args:
            x (Tensor): Hidden states of shape [batch_size, seq_length, hidden_size].
        Returns:
            Tensor: Logits of shape [batch_size, seq_length, vocab_size], obtained by normalizing
            the input and projecting it onto the vocabulary.
        """
        return self.lm_head(self.layernorm(x))
class BranchyModel(PreTrainedModel):
    """
    Wrapper around a causal transformer that attaches auxiliary output heads ("branches") to
    intermediate layers.

    The backbone is loaded from `config.model_str` and frozen; only the branches are trainable.
    Every forward pass runs the full backbone, applies each branch to the hidden states found at
    its location, and computes a self-supervised loss that trains each branch to reproduce the
    backbone's own next-token prediction.

    Parameters:
        config (BranchyModelConfig): Configuration holding the backbone identifier (`model_str`),
            the branch locations (or the number of branches), `penalty_weight`, `copy_lm_head`, etc.
    Attributes:
        model (PreTrainedModel): The frozen backbone, loaded with `AutoModelForCausalLM`.
        num_layers (int): Number of layers of the backbone, read from its configuration.
        branches (torch.nn.ModuleList): One head per entry of `config.branch_locations`.
    Examples:
        config = BranchyModelConfig(
            model_str='gpt2',
            branch_locations=[2, 4, 7],
            penalty_weight=0.9,
        )
        branchy_model = BranchyModel(config)
        inputs = tokenizer("Example input text", return_tensors="pt")
        outputs = branchy_model(**inputs)
        outputs.logits       # backbone logits, [batch, seq_len, vocab_size]
        outputs.head_logits  # branch logits, [num_branches, batch, seq_len, vocab_size]
        outputs.loss         # self-supervised loss averaged over the branches
    Note:
        Supervised training is not supported: passing `labels` to `forward` raises
        NotImplementedError.
    """
    config_class = BranchyModelConfig

    def __init__(self,
                 config: BranchyModelConfig):
        """
        Initializes the BranchyModel.
        Loads the backbone, reads its number of layers, resolves and validates the branch
        locations, freezes the backbone and creates one trainable branch per location.
        Args:
            config (BranchyModelConfig): Configuration object for the branchy model. When
                `config.branch_locations` is None it is filled in here (in place) with
                `config.branch_number` evenly spaced locations.
        Raises:
            ValueError: If the backbone configuration exposes neither `n_layer` nor
                `num_hidden_layers`, or if a branch location is not below the number of layers.
            AssertionError: If the number of layers or `config.branch_number` is out of range.
        """
        super().__init__(config)
        self.model = AutoModelForCausalLM.from_pretrained(config.model_str)

        # Get the number of layers in the underlying model; `n_layer` takes precedence.
        backbone_config = self.model.config
        if hasattr(backbone_config, "n_layer"):
            self.num_layers = backbone_config.n_layer
        elif hasattr(backbone_config, "num_hidden_layers"):
            self.num_layers = backbone_config.num_hidden_layers
        else:
            raise ValueError("cannot find n_layer in config")
        assert self.num_layers is not None and self.num_layers > 0, "n_layer must be a positive integer."
        logger.debug("Number of layers in the model: %s", self.num_layers)

        assert config.branch_number > 0 and config.branch_number < self.num_layers, "branch_number must be a positive integer less than the number of layers in the model."

        # If we provide only the number of branches, we will distribute them evenly across the model
        if config.branch_locations is None:
            interval = self.num_layers // (config.branch_number + 1)
            config.branch_locations = [i * interval for i in range(1, config.branch_number + 1)]

        # Check that specified branch locations are within the range of the model's layers
        if any(loc >= self.num_layers for loc in config.branch_locations):
            raise ValueError("Branch location exceeds the number of layers in the model.")

        # Ensure the model's parameters are frozen
        for param in self.model.parameters():
            param.requires_grad = False

        # Initialize branches at specified locations
        self.branches = torch.nn.ModuleList()
        if config.copy_lm_head:
            # Start every branch from a copy of the backbone's final lm_head.
            logger.info("Fine-tuning branches")
            for _ in config.branch_locations:
                self.branches.append(copy.deepcopy(self.model.lm_head))
        else:
            # Fresh heads, initialized with the backbone's own weight-init routine.
            for _ in config.branch_locations:
                new_branch = Branch(self.model.config)
                new_branch.apply(self.model._init_weights)
                self.branches.append(new_branch)

        # Copied heads inherit requires_grad=False from the frozen backbone, so re-enable it.
        for param in self.branches.parameters():
            param.requires_grad = True
        self.post_init()

    def get_num_params(self,
                       return_dict: bool = True):
        """
        Get the number of parameters in the model.
        Args:
            return_dict (bool): Whether to return a per-component breakdown. Defaults to True.
        Returns:
            dict | int: With `return_dict=True`, a dictionary with the keys "backbone", "branches"
            and "total"; otherwise the total number of parameters as an int.
        """
        num_params = sum(p.numel() for p in self.parameters())
        if not return_dict:
            return num_params
        return {
            "backbone": sum(p.numel() for p in self.model.parameters()),
            "branches": sum(p.numel() for p in self.branches.parameters()),
            "total": num_params,
        }

    def forward(self,
                input_ids: torch.LongTensor = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[List[torch.FloatTensor]] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                head_window_size: Optional[int] = None,
                ):
        """
        Run the backbone, then every branch, and compute the self-supervised loss.
        Args:
            input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache,
            output_attentions: Forwarded unchanged to the backbone.
            labels: Must be None; supervised training is not implemented.
            output_hidden_states: Ignored. Hidden states are always requested because the
                branches read them.
            return_dict: Ignored. The backbone is always asked for a ModelOutput because its
                fields are accessed by name, and this method always returns a ModelOutput.
            head_window_size (Optional[int]): When given, each branch only processes the last
                `head_window_size` positions of the sequence; `head_logits` is then limited to
                that window while `logits` still covers the whole sequence.
        Returns:
            CausalBranchyLLMOutputWithPast: Backbone logits, stacked branch logits, losses and
            entropies, plus the backbone's cache, hidden states and attentions.
        Raises:
            NotImplementedError: If `labels` is provided.
            ValueError: If the backbone does not return hidden states.
        """
        if labels is not None:
            raise NotImplementedError("BranchyLLM only supports self-supervision")
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=True,
            return_dict=True,
        )
        if not hasattr(outputs, "hidden_states") or outputs.hidden_states is None:
            raise ValueError("The model must return hidden states")

        heads_logits = []
        for head, location in zip(self.branches, self.config.branch_locations):
            # `hidden_states` is a tuple with one tensor per depth: select the tensor first,
            # then (optionally) keep only the most recent positions.
            current_hidden_state = outputs.hidden_states[location]
            if head_window_size is not None:
                current_hidden_state = current_hidden_state[:, -head_window_size:, :]
            heads_logits.append(head(current_hidden_state))
        heads_logits = torch.stack(heads_logits, dim=0)

        losses_dict = self.compute_self_supervision_loss(
            heads_logits, outputs.logits
        )
        return CausalBranchyLLMOutputWithPast(
            loss=losses_dict["loss"],
            head_loss=losses_dict["head_losses"],
            entropy=losses_dict["entropy"],
            entropies=losses_dict["entropies"],
            logits=outputs.logits,  # shape (batch_size, seq_len, vocab_size)
            head_logits=heads_logits,  # shape (num_branches, batch_size, seq_len or window, vocab_size)
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def compute_self_supervision_loss(self,
                                      aux_logits: torch.Tensor,
                                      lm_logits: torch.Tensor,
                                      ) -> Dict[str, torch.Tensor]:
        """
        Compute the self-supervised loss of every branch on the last position of the sequence.

        For each branch the loss is
            (1 - penalty_weight) * cross_entropy(branch, argmax(backbone)) - penalty_weight * entropy(branch)
        i.e. the branch is trained to reproduce the backbone's greedy next-token prediction, and
        the (weighted) entropy of its own distribution is subtracted from the objective.
        Args:
            aux_logits (torch.Tensor): Branch logits, shape [num_branches, batch, seq_len, vocab_size].
            lm_logits (torch.Tensor): Backbone logits, shape [batch, seq_len, vocab_size].
        Returns:
            Dict[str, torch.Tensor]: "loss" (mean of the branch losses), "head_losses"
            (shape [num_branches]), "entropies" (shape [num_branches]) and "entropy" (their mean).
        Raises:
            AssertionError: If NaNs appear while computing the entropies.
        """
        last_aux_logits = aux_logits[..., -1, :]  # [num_branches, batch, vocab_size]
        last_lm_logits = lm_logits[..., -1, :]  # [batch, vocab_size]
        # The target is the same for every branch: the backbone's greedy prediction.
        targets = torch.argmax(last_lm_logits, dim=-1)
        penalty_weight = self.config.penalty_weight

        losses = []
        entropies = []
        # Can be useful to have detailed loss per head for comparison of performance
        for head_logit in last_aux_logits:
            ce_loss = F.cross_entropy(head_logit, targets, reduction="mean")
            probas = F.softmax(head_logit, dim=-1)
            log_probas = torch.log(probas + 1e-8)  # epsilon keeps log() finite for zero probabilities
            assert not torch.isnan(log_probas).any(), "NaNs found in log_probas"
            entropy = -torch.sum(probas * log_probas, dim=-1)
            assert not torch.isnan(entropy).any(), "NaNs found in entropy before mean"
            entropy = torch.mean(entropy)
            entropies.append(entropy)
            losses.append((1 - penalty_weight) * ce_loss - penalty_weight * entropy)

        head_losses = torch.stack(losses, dim=0)
        head_entropies = torch.stack(entropies, dim=0)
        return {"loss": head_losses.mean(dim=-1),  # TODO does it change training dynamics between mean and sum?
                "head_losses": head_losses,
                "entropies": head_entropies,
                "entropy": head_entropies.mean(dim=-1)
                }
class BranchyCausalModel(PreTrainedModel):
    """
    Causal language model with early exit on top of BranchyModel.

    For every sequence of the batch, the confidence of each branch is measured on the last
    position and compared with that branch's threshold. The logits returned for the sequence are
    those of the first (shallowest) branch whose confidence exceeds its threshold, or those of
    the backbone's final head when no branch is confident enough. The model therefore outputs a
    single set of logits per step, like a conventional causal model.

    Note:
        This is a proof of concept: the full backbone is still executed on every call, so no
        computation is actually saved.
    """
    config_class = BranchyModelConfig

    def __init__(self,
                 config: BranchyModelConfig):
        """
        Args:
            config (BranchyModelConfig): Must define `head_thresholds` (one value per branch) and
                `confidence_metric` ("breaking_ties" or "max").
        Raises:
            ValueError: If `head_thresholds` is missing or `confidence_metric` is unknown.
        """
        super().__init__(config)
        self.model = BranchyModel(config)
        if config.head_thresholds is None:
            raise ValueError("head_thresholds must be provided to use BranchyCausalModel.")
        # Created on CPU; `forward` moves it to wherever the logits live. This avoids relying on
        # a device string stored in the configuration, which may not exist on the current machine.
        self.head_thresholds = torch.tensor(config.head_thresholds)
        if config.confidence_metric == "breaking_ties":
            self.confidence_metric_fn = breaking_ties
        elif config.confidence_metric == "max":
            self.confidence_metric_fn = lambda x: torch.max(x, dim=-1).values
        else:
            raise ValueError("confidence_metric must be 'breaking_ties' or 'max'.")
        self.post_init()

    def forward(self,
                input_ids: torch.LongTensor = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[List[torch.FloatTensor]] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                return_dict: Optional[bool] = None,
                head_window_size: Optional[int] = None,
                ):
        """
        Run the branchy model and select, per sequence, the logits of the exit head.
        Args:
            All arguments are forwarded unchanged to `BranchyModel.forward`.
        Returns:
            CausalLMOutputWithPastAndHead: `logits` has shape [batch, seq_len, vocab_size] (or
            [batch, window, vocab_size] when the branches only cover a window of the sequence);
            `head_indices` has shape [batch] and holds the index of the selected head, the value
            `num_branches` designating the backbone's final head.
        """
        # TODO Only POC, actual early exit implementation should unwrap the self.model call, which means specific integration for each supported model
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            head_window_size=head_window_size
        )
        head_logits = outputs.head_logits  # shape [branches, batch, window, vocab_size]
        scores = self.confidence_metric_fn(head_logits)[..., -1]  # shape [branches, batch]

        # Keep the thresholds on the same device as the scores (moved at most once per device).
        if self.head_thresholds.device != scores.device:
            self.head_thresholds = self.head_thresholds.to(scores.device)
        is_early_exited = self.head_thresholds[:, None] < scores  # shape [branches, batch]

        # Append a row of True: the final head always "exits", so argmax below always finds one.
        always_exit = torch.ones_like(is_early_exited[:1])
        is_early_exited = torch.cat([is_early_exited, always_exit], dim=0)  # shape [branches+1, batch]
        # argmax returns the first maximal entry, i.e. the shallowest confident head.
        head_indices = torch.argmax(is_early_exited.int(), dim=0)  # shape [batch]

        # The branches may only cover the last positions; align the final logits on that window
        # so that all heads can be stacked together.
        final_logits = outputs.logits
        window = head_logits.shape[2]
        if final_logits.shape[1] != window:
            final_logits = final_logits[:, -window:, :]
        full_logits = torch.cat([head_logits, final_logits.unsqueeze(0)], dim=0)  # shape [branches+1, batch, window, vocab_size]

        batch_indices = torch.arange(full_logits.shape[1], device=full_logits.device)
        end_logits = full_logits[head_indices, batch_indices, :, :]  # shape [batch, window, vocab_size]
        logger.debug(f"Batch early exit heads : {head_indices}")
        return CausalLMOutputWithPastAndHead(
            loss=outputs.loss,
            logits=end_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            head_indices=head_indices
        )
@dataclass
class CausalBranchyLLMOutputWithPast(ModelOutput):
    """
    Output of `BranchyModel.forward`: the backbone's outputs plus the per-branch logits and the
    self-supervised training signals.
    """
    # Main loss: mean of the per-branch losses (scalar).
    loss: Optional[torch.Tensor] = None
    # Loss of each branch, shape [num_branches].
    head_loss: Optional[torch.Tensor] = None
    # Mean of the per-branch entropies (scalar).
    entropy: Optional[torch.Tensor] = None
    # Entropy of each branch's last-position distribution, stacked into shape [num_branches].
    entropies: Optional[torch.Tensor] = None
    # Logits of the backbone's final head, shape [batch, seq_len, vocab_size].
    logits: Optional[torch.Tensor] = None
    # Logits of every branch, shape [num_branches, batch, seq_len or window, vocab_size].
    head_logits: Optional[torch.Tensor] = None
    # The three fields below are passed through from the backbone's output.
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class CausalLMOutputWithPastAndHead(CausalLMOutputWithPast):
    """
    Standard causal-LM output extended with the identity of the head that produced the logits.
    Returned by `BranchyCausalModel.forward`.
    """

    # Shape [batch]: index of the head selected for each sequence. Values below the number of
    # branches designate a branch; the value `num_branches` designates the backbone's final head.
    head_indices: Optional[torch.Tensor] = None

BranchyModelConfig.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from typing import List, Optional
+from transformers import PretrainedConfig
+import logging
+logger = logging.getLogger(__name__)
class BranchyModelConfig(PretrainedConfig):
    """
    Configuration class for BranchyModel. This class extends the PretrainedConfig class from the Transformers
    library, providing configuration specific to models with branch functionality.
    Attributes:
        model_str (Optional[str]): Identifier of the underlying causal model, as understood by
            `AutoModelForCausalLM.from_pretrained`.
        head_thresholds (Optional[List[float]]): One confidence threshold per branch, used for early exit.
        confidence_metric (str): "breaking_ties" or "max"; it should match the metric used to find the thresholds.
        branch_locations (Optional[List[int]]): Non-negative indices of the layers at which branches are attached.
        branch_number (int): Number of branches. Always equal to `len(branch_locations)` when
            locations are given.
        penalty_weight (float): Weight of the entropy term in the branch loss, in the range [0, 1].
        head_window_size (Optional[int]): Number of most recent tokens each branch considers; None
            means no limit.
        copy_lm_head (bool): Whether branches start as copies of the model's final lm_head instead of
            freshly initialized heads.
    Example:
        config = BranchyModelConfig(
            model_str="gpt2",
            branch_locations=[2, 4, 6],
            head_window_size=512
        )
    Note:
        This configuration class is specifically designed for use with the BranchyModel class, enabling flexible
        and customizable branching within transformer models.
    """
    model_type = "branchy"  # Optional, but useful for identifying the model type in the Transformers library

    def __init__(
        self,
        model_str: Optional[str] = None,
        head_thresholds: Optional[List[float]] = None,
        confidence_metric: Optional[str] = "breaking_ties",
        branch_locations: Optional[List[int]] = None,
        branch_number: Optional[int] = 3,
        penalty_weight: Optional[float] = 0,
        head_window_size: int = 512,
        copy_lm_head: Optional[bool] = False,
        **kwargs
    ):
        """
        Initializes the BranchyModelConfig.
        Args:
            model_str (str): The model string to be used for the model. From Huggingface's model hub.
            head_thresholds (List[float], optional): Early-exit threshold of each branch. Defaults to None.
            confidence_metric (str, optional): "breaking_ties" or "max". Defaults to "breaking_ties".
            branch_locations (List[int], optional): Locations of the branches. Defaults to None, in which
                case `branch_number` branches are used and their locations are chosen by the model.
            branch_number (Optional[int], optional): Number of branches if branch_locations is not provided.
                Ignored (and replaced by `len(branch_locations)`) otherwise. Defaults to 3.
            penalty_weight (Optional[float], optional): Weight for the penalty in loss calculation, in [0, 1].
                Defaults to 0.
            head_window_size (int, optional): Number of tokens each branch can see. Defaults to 512.
            copy_lm_head (bool, optional): Copy the final lm_head into each branch. Defaults to False.
            **kwargs: Forwarded to PretrainedConfig.
        Raises:
            AssertionError: If any argument is out of its documented domain.
        """
        self.model_str = model_str
        self.head_thresholds = head_thresholds
        self.confidence_metric = confidence_metric
        assert self.confidence_metric in ["breaking_ties", "max"], "confidence_metric must be 'breaking_ties' or 'max'. It should depend on how you found the thresholds."
        self.branch_locations = branch_locations
        self.penalty_weight = penalty_weight
        self.head_window_size = head_window_size
        if branch_locations is None:
            self.branch_number = branch_number
        else:
            # Only warn when the two values actually disagree: a saved configuration stores both,
            # and `branch_number` has a non-None default, so warning on mere presence would fire
            # on every load.
            if branch_number is not None and branch_number != len(branch_locations):
                logger.warning("Both branch_locations and branch_number are provided. Using branch_locations.")
            self.branch_number = len(branch_locations)
        self.copy_lm_head = copy_lm_head
        #assert self.model_str is not None, "model_str must be provided."
        assert isinstance(self.branch_number, int) and self.branch_number > 0, "branch_number must be a positive integer."
        assert isinstance(self.penalty_weight, (int, float)), "penalty_weight must be a float or an integer."
        assert 0 <= self.penalty_weight <= 1, "penalty_weight must be in the range [0, 1]."
        if branch_locations is not None:
            assert all(isinstance(loc, int) for loc in self.branch_locations), "Branch locations must be integers."
            assert all(loc >= 0 for loc in self.branch_locations), "Branch locations must be non-negative."
        if self.head_window_size is not None:
            assert self.head_window_size > 0, "head_window_size must be a positive integer or None."
        if isinstance(self.head_thresholds, (list, tuple)):
            assert len(self.head_thresholds) == self.branch_number, "Number of thresholds must match number of branches."
            # Integers are valid thresholds too (e.g. 10 instead of 10.0); booleans are not.
            assert all(
                isinstance(threshold, (int, float)) and not isinstance(threshold, bool)
                for threshold in self.head_thresholds
            ), "Thresholds must be floats."
        super().__init__(**kwargs)  # Initialize with base class parameters

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "BranchyCausalModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "BranchyModelConfig.BranchyModelConfig",
+    "AutoModelForCausalLM": "BranchyModel.BranchyCausalModel"
+  },
+  "branch_locations": [
+    6,
+    12,
+    18,
+    24
+  ],
+  "branch_number": 4,
+  "confidence_metric": "breaking_ties",
+  "copy_lm_head": false,
+  "device": "cuda:0",
+  "head_thresholds": [
+    10.0,
+    10.0,
+    10.0,
+    10.0
+  ],
+  "head_window_size": 512,
+  "model_str": "microsoft/phi-2",
+  "model_type": "branchy",
+  "penalty_weight": 0.9,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2"
+}

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cff2a27adc1e9a8965a31a8406a6bee8df4ea5bdf2df018a460218abba1ac64d
+size 4982357920

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b7879268f3bd6382559c9cbbb7d7252381329e068804ec6ebc6d21ee2995e5b
+size 4982544624

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3adcc60cfcc21b897661ad4e89202f22c88b47682f665c8aa664cb0f6a044edc
+size 3251942824

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,476 @@

+{
+  "metadata": {
+    "total_size": 13216788480
+  },
+  "weight_map": {
+    "model.branches.0.layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.branches.0.layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.branches.0.lm_head.bias": "model-00003-of-00003.safetensors",
+    "model.branches.0.lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.branches.1.layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.branches.1.layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.branches.1.lm_head.bias": "model-00003-of-00003.safetensors",
+    "model.branches.1.lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.branches.2.layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.branches.2.layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.branches.2.lm_head.bias": "model-00003-of-00003.safetensors",
+    "model.branches.2.lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.branches.3.layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.branches.3.layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.branches.3.lm_head.bias": "model-00003-of-00003.safetensors",
+    "model.branches.3.lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.model.lm_head.bias": "model-00003-of-00003.safetensors",
+    "model.model.lm_head.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.embed_tokens.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.final_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.final_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
+    "model.model.model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
+    "model.model.model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
+    "model.model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors"
+  }
+}