# ymodel3-n1 / configuration_ymodel3.py
# SnifferCaptain's picture
# Upload 7 files
# 47afa41 verified
from transformers import PretrainedConfig
class YConfig3(PretrainedConfig):
    """Configuration object registered under the ``ynet3`` model type.

    All settings are given as keyword arguments. Each recognised key is
    removed from ``kwargs`` and stored as an instance attribute of the same
    name; when a key is absent the default from the table in ``__init__`` is
    used. Anything left in ``kwargs`` is handed to
    ``PretrainedConfig.__init__`` together with the three special-token ids.
    """

    model_type = "ynet3"

    def __init__(self, **kwargs):
        """Fill in the configuration attributes from ``kwargs``.

        Args:
            **kwargs: Any of the option names listed in ``option_defaults``;
                keys not listed there are forwarded to the parent
                constructor unchanged.
        """
        # Option name -> value used when the caller does not supply it.
        # Dict order is the order in which the attributes get assigned.
        # The group headings below are inferred from the option names only.
        option_defaults = {
            "dropout": 0.0,
            # Special token ids (also forwarded to the parent constructor).
            "bos_token_id": 151644,
            "eos_token_id": 151645,
            "pad_token_id": 151643,
            # Core model dimensions.
            "hidden_act": "silu",
            "hidden_size": 768,
            "num_hidden_layers": 8,
            "max_position_embeddings": 8192,
            "vocab_size": 6400,
            "rms_norm_eps": 1e-6,
            "rope_theta": 5e4,
            "rope_scaling": None,
            # NOTE(review): newer transformers releases may assign their own
            # `dtype` attribute inside PretrainedConfig.__init__ -- confirm
            # this value survives the super() call on the targeted version.
            "dtype": "float32",
            "self_distill": True,
            "intermediate_size": 1536,
            # Placeholder; resolved against intermediate_size after the loop.
            "expert_intermediate_size": None,
            # Expert / router options.
            "n_routed_experts": 0,
            "moe_topk": 2,
            "score_func": "softmax",
            "n_shared_experts": 0,
            "top_k_layer_dense": 1,
            "aux_loss_alpha": 0.02,
            "seq_aux": False,
            "norm_topk_prob": True,
            "noisy_expert": 0.0,
            "moe_backend": "compact",
            "router_bias_enabled": True,
            "router_bias_update_rate": 1e-3,
            "router_bias_clamp": 5.0,
            # Attention options.
            "num_heads": 12,
            "mla_kv_lora_rank": 64,
            "mla_qk_nope_head_dim": 64,
            "mla_qk_rope_head_dim": 32,
            "mla_attn_impl": "absorb",
            "qkv_lora": False,
        }
        for option, fallback in option_defaults.items():
            setattr(self, option, kwargs.pop(option, fallback))

        # A missing or falsy expert size (None, 0, ...) falls back to
        # whatever intermediate_size ended up being.
        if not self.expert_intermediate_size:
            self.expert_intermediate_size = self.intermediate_size

        # The token ids were popped above, so pass them on explicitly; the
        # remaining kwargs are whatever this class did not consume.
        super().__init__(
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            **kwargs,
        )