Commit 3d9d407
Parent(s): 0730aee

Upload model

Files changed:
- config.json  +32 -0
- configuration_custom4.py  +182 -0
- modeling_custom4.py  +56 -0
- pytorch_model.bin  +3 -0
config.json ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "EleutherAI/pythia-160m",
+  "architectures": [
+    "CustomModel4"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoModel": "modeling_custom4.CustomModel4"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "rope_scaling": null,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.31.0",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
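The `auto_map` entry above is what lets `AutoModel` resolve the custom class shipped with the checkpoint, so loading needs `trust_remote_code=True`. A minimal loading sketch (the repo id `user/custom4-model` is a placeholder, not the actual repo name):

```python
from transformers import AutoModel

# Placeholder repo id; trust_remote_code allows transformers to import
# modeling_custom4.CustomModel4 from the files uploaded in this commit.
model = AutoModel.from_pretrained("user/custom4-model", trust_remote_code=True)
```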
configuration_custom4.py ADDED
@@ -0,0 +1,182 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# """ GPTNeoX model configuration"""
+
+# from ...configuration_utils import PretrainedConfig
+# from ...utils import logging
+
+
+# logger = logging.get_logger(__name__)
+
+# GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+#     "EleutherAI/gpt-neox-20b": "https://huggingface.co/EleutherAI/gpt-neox-20b/resolve/main/config.json",
+#     # See all GPTNeoX models at https://huggingface.co/models?filter=gpt_neox
+# }
+
+
+# class GPTNeoXConfig(PretrainedConfig):
+#     r"""
+#     This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an
+#     GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration
+#     with the defaults will yield a similar configuration to that of the GPTNeoX
+#     [EleutherAI/gpt-neox-20b](https://huggingface.co/EleutherAI/gpt-neox-20b) architecture.
+
+#     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+#     documentation from [`PretrainedConfig`] for more information.
+
+
+#     Args:
+#         vocab_size (`int`, *optional*, defaults to 50432):
+#             Vocabulary size of the GPTNeoX model. Defines the number of different tokens that can be represented by the
+#             `inputs_ids` passed when calling [`GPTNeoXModel`].
+#         hidden_size (`int`, *optional*, defaults to 6144):
+#             Dimension of the encoder layers and the pooler layer.
+#         num_hidden_layers (`int`, *optional*, defaults to 44):
+#             Number of hidden layers in the Transformer encoder.
+#         num_attention_heads (`int`, *optional*, defaults to 64):
+#             Number of attention heads for each attention layer in the Transformer encoder.
+#         intermediate_size (`int`, *optional*, defaults to 24576):
+#             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+#         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+#             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+#             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+#         rotary_pct (`float`, *optional*, defaults to 0.25):
+#             percentage of hidden dimensions to allocate to rotary embeddings
+#         rotary_emb_base (`int`, *optional*, defaults to 10000):
+#             base for computing rotary embeddings frequency
+#         attention_dropout (`float`, *optional*, defaults to 0.0):
+#             The dropout ratio probability of the attention score.
+#         hidden_dropout (`float`, *optional*, defaults to 0.0):
+#             The dropout ratio of (1) the word embeddings, (2) the post-attention hidden states, and (3) the post-mlp
+#             hidden states.
+#         classifier_dropout (`float`, *optional*, defaults to 0.1):
+#             Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`].
+
+#             The dropout ratio for the hidden layer.
+#         max_position_embeddings (`int`, *optional*, defaults to 2048):
+#             The maximum sequence length that this model might ever be used with. Typically set this to something large
+#             just in case (e.g., 512 or 1024 or 2048).
+#         initializer_range (`float`, *optional*, defaults to 1e-5):
+#             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+#         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+#             The epsilon used by the layer normalization layers.
+#         use_cache (`bool`, *optional*, defaults to `True`):
+#             Whether or not the model should return the last key/values attentions (not used by all models). Only
+#             relevant if `config.is_decoder=True`.
+#         use_parallel_residual (`bool`, *optional*, defaults to `True`):
+#             Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training
+#             speedup at large scales (e.g. 20B).
+#         rope_scaling (`Dict`, *optional*):
+#             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+#             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
+#             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+#             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+#             these scaling strategies behave:
+#             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+#             experimental feature, subject to breaking API changes in future versions.
+
+#     Example:
+
+#     ```python
+#     >>> from transformers import GPTNeoXConfig, GPTNeoXModel
+
+#     >>> # Initializing a GPTNeoX gpt-neox-20b style configuration
+#     >>> configuration = GPTNeoXConfig()
+
+#     >>> # Initializing a model (with random weights) from the gpt-neox-20b style configuration
+#     >>> model = GPTNeoXModel(configuration)  # doctest: +SKIP
+
+#     >>> # Accessing the model configuration
+#     >>> configuration = model.config  # doctest: +SKIP
+#     ```"""
+#     model_type = "gpt_neox"
+
+from transformers import PretrainedConfig
+
+class CustomConfig4(PretrainedConfig):
+    model_type = "custom4"
+
+    def __init__(
+        self,
+        vocab_size=50432,
+        hidden_size=6144,
+        num_hidden_layers=44,
+        num_attention_heads=64,
+        intermediate_size=24576,
+        hidden_act="gelu",
+        rotary_pct=0.25,
+        rotary_emb_base=10000,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,
+        classifier_dropout=0.1,
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        use_parallel_residual=True,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.rotary_pct = rotary_pct
+        self.rotary_emb_base = rotary_emb_base
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+        self.classifier_dropout = classifier_dropout
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        self.use_parallel_residual = use_parallel_residual
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size is not divisible by the number of attention heads! Make sure to update them!"
+            )
+
+    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
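A short sketch of how `CustomConfig4` behaves, assuming the file above is importable locally; the sizes mirror the pythia-160m geometry from config.json:

```python
from configuration_custom4 import CustomConfig4

# Pythia-160m-sized configuration (values taken from config.json above).
cfg = CustomConfig4(
    vocab_size=50304,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)
print(cfg.model_type)  # -> "custom4"

# _rope_scaling_validation() runs inside __init__ and rejects bad settings:
try:
    CustomConfig4(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)  # the factor field must be a float > 1
```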
modeling_custom4.py ADDED
@@ -0,0 +1,56 @@
+# https://huggingface.co/docs/transformers/custom_models
+
+from transformers import PreTrainedModel, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from torch.nn.functional import log_softmax
+from torch.nn import CrossEntropyLoss, ModuleList
+from .configuration_custom4 import CustomConfig4
+
+class CustomModel4(PreTrainedModel):
+    config_class = CustomConfig4
+
+    def __init__(self, config):
+        super().__init__(config)  # submodels are attached afterwards by combine_models()
+
+    def forward(self, *args, labels=None, **kwargs):
+        loss = None
+        logits = None
+        for model, coeff in zip(self.models, self.coeffs):
+            logp = log_softmax(model.forward(*args, **kwargs).logits, dim=-1)
+            logits = coeff * logp if logits is None else logits + coeff * logp  # weighted sum of log-probs
+        # The rest copied from modeling_llama.py:
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
+
+
+    @classmethod
+    def combine_models(cls, *args, coeffs=[], **kwargs):
+        models = []  # one eval-mode causal LM per checkpoint name in args
+        for model in args:
+            models.append(AutoModelForCausalLM.from_pretrained(model, **kwargs).eval())
+        if coeffs == []:
+            coeffs = [1 / len(args)] * len(args)  # default: uniform weights
+        m = cls(models[0].config)
+        m.models = ModuleList(models)
+        m.coeffs = coeffs
+        return m
+
+
+
+CustomConfig4.register_for_auto_class()
+CustomModel4.register_for_auto_class('AutoModelForCausalLM')
+CustomModel4.register_for_auto_class('AutoModel')
+AutoConfig.register("custom4", CustomConfig4)
+AutoModel.register(CustomConfig4, CustomModel4)
+AutoModelForCausalLM.register(CustomConfig4, CustomModel4)
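`forward` combines the submodels by taking a coefficient-weighted sum of per-model log-probabilities (a log-linear ensemble), and `combine_models` assembles that ensemble from ordinary causal-LM checkpoints. A usage sketch; the two Pythia checkpoints are an illustrative choice (any models sharing a tokenizer and vocabulary should work), not something taken from this commit:

```python
import torch
from transformers import AutoTokenizer
from modeling_custom4 import CustomModel4

# Build a uniform two-model ensemble (coeffs defaults to 1/n each).
model = CustomModel4.combine_models(
    "EleutherAI/pythia-160m",
    "EleutherAI/pythia-160m-deduped",
)

tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m")
batch = tok("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
print(out.loss.item(), out.logits.shape)
```

The `register_for_auto_class` calls at the bottom of the file are what let `save_pretrained`/`push_to_hub` copy these two `.py` files into the uploaded repo and emit the `auto_map` entry seen in config.json.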
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ffa03b589263eccf2e09157196fab7b2abdaece84c8ed0f4b18f06540f48fd0
+size 465579541
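pytorch_model.bin is stored as a Git LFS pointer: the ~466 MB weight blob lives out of band and is identified by its SHA-256 and byte size. A small sketch for checking a downloaded copy against the pointer (local path assumed):

```python
import hashlib
import os

EXPECTED_OID = "3ffa03b589263eccf2e09157196fab7b2abdaece84c8ed0f4b18f06540f48fd0"
EXPECTED_SIZE = 465579541

def sha256_file(path, chunk_size=1 << 20):
    # Stream in 1 MiB chunks so the full file never sits in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while chunk := fh.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

path = "pytorch_model.bin"  # assumed local download location
assert os.path.getsize(path) == EXPECTED_SIZE
assert sha256_file(path) == EXPECTED_OID
```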