fhai50032 committed
Commit 94c09b9 · verified · 1 Parent(s): e71f236

Create configuration_bibo.py

Files changed (1)
  1. configuration_bibo.py  +164 -0
configuration_bibo.py ADDED
@@ -0,0 +1,164 @@
# coding=utf-8
# Copyright 2024 The BiBo Authors and The HuggingFace Inc. team. All rights reserved.


from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

BIBO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    # not now
}


class BiBoConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BiBoModel`]. It is used to
    instantiate a BiBo model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read
    the documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 128000):
            Vocabulary size of the BiBo model.
        hidden_size (`int`, *optional*, defaults to 1536):
            Dimension of the hidden states.
        intermediate_size (`int`, *optional*, defaults to 8960):
            Dimension of the MLP representations in Dense layers.
        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            Number of key and value heads for Grouped Query Attention.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/value attentions.
        pad_token_id (`int`, *optional*):
            The index of the padding token in the vocabulary. Defaults to None.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 0):
            The id of the end-of-sequence token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the weights of the input embeddings and the output embeddings.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 32768):
            Sliding window attention window size. If `use_sliding_window` is `False`, this value is ignored
            and the window falls back to `max_position_embeddings`.
        max_window_layers (`int`, *optional*, defaults to 21):
            The number of layers that use sliding window attention.

        # --- MoE Specific Parameters ---
        moe_intermediate_size (`int`, *optional*, defaults to 1024):
            Dimension of the MLP representations in MoE layers.
        num_routed_experts (`int`, *optional*, defaults to 11):
            Total number of routed experts (MLP + Identity) in MoE layers.
        num_shared_experts (`int`, *optional*, defaults to 1):
            Total number of shared experts (Convolutional) in MoE layers.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            Number of routed experts to select per token (Top-K).
        router_temperature (`float`, *optional*, defaults to 1.3):
            Temperature used to scale the router logits.
        router_noise (`float`, *optional*, defaults to 0.5):
            Scale of the noise added to the router logits.
        bias_update_factor (`float`, *optional*, defaults to 1e-4):
            Update factor for the router bias.
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size of the convolutional shared experts.

        # --- Hybrid Layer Control ---
        # Implicitly defined: First (idx=0) and Last (idx=N-1) layers are Dense, others are MoE.
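
    Example (an illustrative sketch of the usual `PretrainedConfig` workflow; it assumes this file is
    importable as `configuration_bibo`):

        >>> from configuration_bibo import BiBoConfig
        >>> # Instantiate a configuration with the default BiBo hyperparameters
        >>> configuration = BiBoConfig()
        >>> configuration.num_routed_experts, configuration.num_experts_per_tok
        (11, 2)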
    """
    model_type = "bibo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=128000,
        hidden_size=1536,
        intermediate_size=8960,
        num_hidden_layers=28,
        num_attention_heads=12,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-06,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=True,
        rope_theta=1000000.0,
        rope_scaling=None,
        attention_dropout=0.0,
        use_sliding_window=False,
        sliding_window=32768,
        max_window_layers=21,
        # MoE defaults
        moe_intermediate_size=1024,
        num_routed_experts=11,
        num_shared_experts=1,
        num_experts_per_tok=2,
        router_temperature=1.3,
        bias_update_factor=1e-4,
        router_noise=0.5,
        kernel_size=3,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

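        # If no key/value head count is given, fall back to one KV head per attention head,
        # i.e. standard multi-head attention rather than grouped-query attention.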
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout

        # MoE parameters
        self.moe_intermediate_size = moe_intermediate_size
        self.num_routed_experts = num_routed_experts
        self.num_shared_experts = num_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.router_temperature = router_temperature
        self.router_noise = router_noise
        self.bias_update_factor = bias_update_factor
        self.kernel_size = kernel_size

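        # When sliding-window attention is disabled, `sliding_window` falls back to the full
        # `max_position_embeddings` context length.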
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else self.max_position_embeddings
        self.max_window_layers = max_window_layers

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


# from transformers import AutoConfig
# AutoConfig.register("bibo", BiBoConfig)
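
For reference, the following is a minimal illustrative sketch (not part of this commit) of how the commented-out registration above could be used and what the default values imply. It assumes configuration_bibo.py is importable from the working directory; the dense/MoE layout below simply restates the rule from the class docstring.

from transformers import AutoConfig

from configuration_bibo import BiBoConfig

# Register the config class so the Auto classes can resolve model_type "bibo".
AutoConfig.register("bibo", BiBoConfig)

config = BiBoConfig()

# Hybrid layout implied by the docstring: first and last layers are Dense, the rest are MoE.
layer_types = [
    "dense" if i in (0, config.num_hidden_layers - 1) else "moe"
    for i in range(config.num_hidden_layers)
]
assert layer_types.count("moe") == 26  # 26 of the 28 default layers are MoE

# With use_sliding_window=False (the default), sliding_window falls back to the full context length.
assert config.sliding_window == config.max_position_embeddings == 32768

Whether the modeling code actually follows this layer layout is determined by the companion modeling file, which is not part of this commit.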