Upload FalconH1MoEForCausalLM
- README.md +199 -0
- config.json +70 -0
- configuration_falcon_h1_moe.py +17 -0
- generation_config.json +7 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +0 -0
- modeling_falcon_h1_moe.py +186 -0
README.md
ADDED
@@ -0,0 +1,199 @@
+---
+library_name: transformers
+tags: []
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
config.json
ADDED
@@ -0,0 +1,70 @@
+{
+  "architectures": [
+    "FalconH1MoEForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_in_multiplier": 1.0,
+  "attention_out_multiplier": 0.9375,
+  "attn_layer_indices": null,
+  "auto_map": {
+    "AutoConfig": "configuration_falcon_h1_moe.FalconH1MoEConfig",
+    "AutoModel": "modeling_falcon_h1_moe.FalconH1MoEForCausalLM"
+  },
+  "bos_token_id": 1,
+  "embedding_multiplier": 5.656854249492381,
+  "eos_token_id": 11,
+  "expert_num": 8,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "key_multiplier": 0.39062499999999994,
+  "lm_head_multiplier": 0.0390625,
+  "mamba_chunk_size": 128,
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_head": 64,
+  "mamba_d_ssm": 1536,
+  "mamba_d_state": 128,
+  "mamba_expand": 2,
+  "mamba_n_groups": 1,
+  "mamba_n_heads": 24,
+  "mamba_norm_before_gate": false,
+  "mamba_proj_bias": false,
+  "mamba_rms_norm": false,
+  "mamba_use_mlp": true,
+  "max_position_embeddings": 16384,
+  "mlp_bias": false,
+  "mlp_expansion_factor": 8,
+  "mlp_multipliers": [
+    0.8838834764831844,
+    0.5859375
+  ],
+  "model_type": "falcon_h1",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 2,
+  "num_logits_to_keep": 1,
+  "pad_token_id": 0,
+  "projectors_bias": false,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 100000000000.0,
+  "ssm_in_multiplier": 1.25,
+  "ssm_multipliers": [
+    0.3535533905932738,
+    0.25,
+    0.3535533905932738,
+    0.5,
+    0.3535533905932738
+  ],
+  "ssm_out_multiplier": 0.23570226039551587,
+  "tie_word_embeddings": false,
+  "topk": 2,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.2",
+  "use_cache": true,
+  "vocab_size": 32784
+}
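The `auto_map` entries above point `AutoConfig` and `AutoModel` at the bundled `configuration_falcon_h1_moe.py` and `modeling_falcon_h1_moe.py`, so the checkpoint can be loaded with `trust_remote_code=True`. A minimal loading sketch, assuming a hypothetical repository id (`your-org/falcon-h1-moe`) and that tokenizer files are available alongside this upload:

```python
from transformers import AutoModel, AutoTokenizer

repo_id = "your-org/falcon-h1-moe"  # hypothetical placeholder, not the actual repo id

# trust_remote_code=True lets transformers import the custom config/model classes
# declared in auto_map; AutoModel resolves to FalconH1MoEForCausalLM here.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("The capital of France is", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```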
configuration_falcon_h1_moe.py
ADDED
@@ -0,0 +1,17 @@
+from transformers import FalconH1Config
+
+
+class FalconH1MoEConfig(FalconH1Config):
+    """FalconH1MoE model configuration."""
+
+    def __init__(
+        self,
+        expert_num=8,
+        topk=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.expert_num = expert_num  # number of experts per MoE layer
+        self.topk = topk  # number of experts activated per token
+
+
+__all__ = ["FalconH1MoEConfig"]
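For illustration only (not part of the upload), the two new fields can be set directly when building a config; everything else is forwarded to `FalconH1Config` through `**kwargs`. A sketch assuming it is run from a checkout of this repository so the local module is importable:

```python
from configuration_falcon_h1_moe import FalconH1MoEConfig  # local file from this repo

config = FalconH1MoEConfig(
    hidden_size=1024,
    num_hidden_layers=36,
    expert_num=8,  # experts per MoE layer
    topk=2,        # experts activated per token
)
print(config.expert_num, config.topk)  # -> 8 2
```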
generation_config.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 11,
+  "pad_token_id": 0,
+  "transformers_version": "4.55.2"
+}
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8511f8dcbafd15cc0a878e826192af5f50ee1622047e24645027d29cf4157474
+size 4995103432
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6a2cf8841920aed4698fd782c5750be0209e2c6d2a82aee56e23a6489f6cac6
+size 3433677328
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
modeling_falcon_h1_moe.py
ADDED
@@ -0,0 +1,186 @@
+import torch
+from torch import nn
+
+from transformers import FalconH1ForCausalLM, FalconH1Model
+from transformers.models.falcon_h1.modeling_falcon_h1 import FalconH1DecoderLayer, FalconH1MLP, compute_mup_vector
+
+from .configuration_falcon_h1_moe import FalconH1MoEConfig
+
+
+class FalconH1MoEModel(FalconH1Model):
+    def __init__(self, config: FalconH1MoEConfig):
+        super().__init__(config)
+        # Replace the dense decoder layers built by FalconH1Model with MoE decoder layers.
+        decoder_layers = []
+        for i in range(config.num_hidden_layers):
+            decoder_layers.append(FalconH1MoEDecoderLayer(config, layer_idx=i))
+        self.layers = nn.ModuleList(decoder_layers)
+        # Re-register the muP scaling vector on the rebuilt Mamba mixers.
+        mup_vector = compute_mup_vector(config)
+        for layer in self.layers:
+            layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False)
+
+
+class FalconH1MoEMLP(nn.Module):
+    def __init__(self, config: FalconH1MoEConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.entropy = []
+        self.num_local_experts = config.expert_num
+        self.topk = config.topk
+
+        # Build the experts: each expert is a standard FalconH1 MLP.
+        self.experts = torch.nn.ModuleList()
+        for _ in range(self.num_local_experts):
+            expert = FalconH1MLP(config)
+            self.experts.append(expert)
+
+        # Build the router: a linear projection from hidden states to expert logits.
+        self.weight = torch.nn.Parameter(
+            torch.empty((self.num_local_experts, self.config.hidden_size), dtype=torch.float32)
+        )
+        torch.nn.init.xavier_uniform_(self.weight)
+
+    def forward(self, x):
+        # x: [batch_size, seq_len, hidden_size] -> [seq_len, batch_size, hidden_size]
+        x = x.transpose(0, 1).contiguous()
+        inp_shape = x.shape
+        num_tokens = inp_shape[0] * inp_shape[1]
+        hidden = inp_shape[-1]
+        num_experts = self.num_local_experts
+        x = x.view(-1, inp_shape[-1])  # x: [num_tokens, hidden_size]
+        restore_shape = x.shape
+
+        """Routing: compute each token's expert weights; every following step is token level.
+
+        Produces:
+            routing_probs, token -> expert probability, e.g.
+                [[0.0000, 0.0000, 0.4006, 0.5994],
+                 ...,
+                 [0.0373, 0.0000, 0.9627, 0.0000]]
+            routing_map, token -> selected experts, e.g.
+                [[False, False, True, True],
+                 ...,
+                 [ True, False, True, False]]
+        """
+        y = torch.mm(x, self.weight.to(x.dtype).t())  # y: [num_tokens, num_experts]
+        scores, top_indices = torch.topk(y, k=self.topk, dim=1)
+        probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(y)
+        routing_probs = torch.zeros_like(y).scatter(1, top_indices, probs)
+        routing_map = torch.zeros_like(y).int().scatter(1, top_indices, 1).bool()
+
+        """Dispatch: group tokens by expert.
+
+        Produces:
+            probs: [expert0{token4_prob, token2_prob, token8_prob}, ..., expert_n{...}]
+            x:     [expert0{token4, token2, token8}, ..., expert_n{...}]
+        """
+        num_local_tokens_per_expert = routing_map.sum(dim=0).long()  # [tokens_for_e0, ..., tokens_for_en]
+        routing_map = routing_map.T.contiguous()  # expert-to-token mask, [num_experts, num_tokens], e.g.
+        # [[False, False, False, ..., False,  True,  True],
+        #  [False, False, False, ...,  True, False, False],
+        #  [ True,  True,  True, ...,  True,  True,  True],
+        #  [ True,  True,  True, ..., False, False, False]]
+        token_indices = (
+            torch.arange(num_tokens, device=routing_map.device).unsqueeze(0).expand(num_experts, -1)
+        )  # [num_experts, num_tokens], e.g.
+        # [[0, 1, 2, ..., 1021, 1022, 1023],
+        #  [0, 1, 2, ..., 1021, 1022, 1023],
+        #  ...]
+        sorted_indices = token_indices.masked_select(routing_map)  # [topk * num_tokens]
+        # e.g. [8, 9, 12, ..., 1015, 1016, 1017]:
+        # sorted_indices[:idx_1] -> expert0, sorted_indices[idx_1:idx_2] -> expert1, ...
+        probs = routing_probs.T.contiguous().masked_select(routing_map)  # [topk * num_tokens]
+        # e.g. [0.6458, 0.6458, 0.5577, ..., 0.4983, 0.0520, 0.0520]
+        x = x.index_select(0, sorted_indices)  # [topk * num_tokens, hidden]
+
+        # Compute: run each expert on its tokens, scaled by the routing probability,
+        # and track the mean entropy of each expert's output for logging.
+        tokens_list = torch.split(x, num_local_tokens_per_expert.tolist())
+        probs_list = torch.split(probs, num_local_tokens_per_expert.tolist())
+
+        output_local_list = []
+        self.entropy = []
+        for expert, tokens, prob in zip(self.experts, tokens_list, probs_list):
+            output = expert(tokens) * prob.unsqueeze(-1)
+            pd = torch.nn.functional.softmax(output, dim=-1)
+            entropy = torch.logsumexp(output, dim=-1) - torch.sum(pd * output, dim=-1)
+            entropy_mean = entropy.mean(dim=0).item()
+            output_local_list.append(output)
+            self.entropy.append((entropy_mean, tokens.shape[0]))
+
+        permuted_tokens = torch.cat(output_local_list, dim=0)
+
+        # Scatter-add the permuted expert outputs back to the original token positions.
+        output_tokens = torch.zeros(
+            restore_shape, dtype=permuted_tokens.dtype, device=permuted_tokens.device
+        )
+        output_tokens.scatter_add_(0, sorted_indices.unsqueeze(1).expand(-1, hidden), permuted_tokens)
+        output = output_tokens.view(inp_shape).transpose(0, 1)
+
+        return output
+
+
+class FalconH1MoEDecoderLayer(FalconH1DecoderLayer):
+    def __init__(self, config: FalconH1MoEConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        # Swap the dense feed-forward block for the MoE feed-forward block.
+        self.feed_forward = FalconH1MoEMLP(config, layer_idx)
+
+
+class FalconH1MoEForCausalLM(FalconH1ForCausalLM):
+    def __init__(self, config: FalconH1MoEConfig):
+        super().__init__(config)
+        self.model = FalconH1MoEModel(config)
+
+
+__all__ = ["FalconH1MoEForCausalLM"]
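To make the routing and dispatch step concrete, here is a small standalone sketch (toy sizes, not part of the repository) that reproduces the router math from `FalconH1MoEMLP.forward`: each token's expert logits are pruned to the top-k entries, softmax-normalized over just those k scores, and turned into a dispatch mask whose column sums give the per-expert token counts.

```python
import torch

torch.manual_seed(0)
num_tokens, hidden, num_experts, topk = 6, 16, 4, 2

x = torch.randn(num_tokens, hidden)        # flattened tokens, as after x.view(-1, hidden)
weight = torch.randn(num_experts, hidden)  # router weight, same shape as FalconH1MoEMLP.weight

y = x @ weight.t()                                  # [num_tokens, num_experts] router logits
scores, top_indices = torch.topk(y, k=topk, dim=1)  # keep the top-k experts per token
probs = torch.softmax(scores, dim=-1, dtype=torch.float32).type_as(y)
routing_probs = torch.zeros_like(y).scatter(1, top_indices, probs)        # dense token -> expert probs
routing_map = torch.zeros_like(y).int().scatter(1, top_indices, 1).bool() # token -> expert mask

print(routing_probs)            # two non-zero probabilities per row, each row sums to 1
print(routing_map.sum(dim=0))   # number of tokens dispatched to each expert
```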