| """TensorMind model configuration"""
|
|
|
| try:
|
| from transformers.configuration_utils import PreTrainedConfig
|
| from transformers.modeling_rope_utils import RopeParameters
|
| except ImportError:
|
| from transformers.configuration_utils import PretrainedConfig as PreTrainedConfig
|
| RopeParameters = None
|
|
|
|
|
class TensorMindConfig(PreTrainedConfig):
    """Configuration class for TensorMind models.

    Stores the hyperparameters used to instantiate a TensorMind model and
    inherits serialization and loading behavior from the transformers base
    configuration class.
    """

    model_type = "tensormind"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Tensor-parallel sharding plan: column-wise for input projections,
    # row-wise for output projections.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel plan: expected input and output tensor names per module.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size: int | None = 32768,
        hidden_size: int | None = 1024,
        intermediate_size: int | None = 4096,
        num_hidden_layers: int | None = 32,
        num_attention_heads: int | None = 16,
        num_key_value_heads: int | None = 8,
        hidden_act: str | None = "silu",
        max_position_embeddings: int | None = 32768,
        initializer_range: float | None = 0.02,
        rms_norm_eps: float | None = 1e-6,
        use_cache: bool | None = True,
        tie_word_embeddings: bool | None = True,
        attention_bias: bool | None = False,
        attention_dropout: float | None = 0.0,
        pad_token_id: int | None = None,
        bos_token_id: int | None = None,
        eos_token_id: int | None = None,
        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Avoid a shared mutable default argument: fall back to plain RoPE
        # settings when the caller provides none.
        if rope_parameters is None:
            rope_parameters = {"rope_type": "default", "rope_theta": 10000.0}
        self.rope_parameters = rope_parameters
        # tie_word_embeddings and the special token ids are consumed by the
        # parent constructor below; assigning them here would be overwritten.
        # Default to SDPA attention unless the caller explicitly overrides it.
        kwargs.setdefault("attn_implementation", "sdpa")
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["TensorMindConfig"]
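

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the configuration API): shows
# how the config might be instantiated with a few overrides, including a RoPE
# parameter dict in the same shape as the default above. The hyperparameter
# values here are hypothetical examples, not defaults shipped anywhere.
if __name__ == "__main__":
    config = TensorMindConfig(
        hidden_size=2048,
        num_hidden_layers=24,
        num_attention_heads=32,
        num_key_value_heads=8,
        rope_parameters={"rope_type": "default", "rope_theta": 1_000_000.0},
    )
    print(config.model_type)  # -> "tensormind"
    print(config.num_key_value_heads)  # -> 8
    # The base transformers config provides JSON serialization out of the box.
    print(config.to_json_string())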