# GLM-4.7-Flash-SCM / configuration_glm_scm.py
# Configuration for GLM-4.7-Flash ScatterMoE (SCM) variant
# Based on Glm4MoeLiteConfig from transformers
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class Glm4MoeLiteSCMConfig(PretrainedConfig):
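    """
    Configuration for the GLM-4.7-Flash ScatterMoE (SCM) variant, mirroring the
    fields of Glm4MoeLiteConfig from transformers. The defaults below are the
    values used by this repository; any extra keyword arguments are forwarded
    to PretrainedConfig via **kwargs.
    """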
model_type = "glm4_moe_lite"
keys_to_ignore_at_inference = ["past_key_values"]
    def __init__(
        self,
        vocab_size=154880,
        hidden_size=2048,
        intermediate_size=10240,
        moe_intermediate_size=1536,
        num_hidden_layers=47,
        num_attention_heads=20,
        num_key_value_heads=20,
        n_shared_experts=1,
        n_routed_experts=64,
        routed_scaling_factor=1.8,
        kv_lora_rank=512,
        q_lora_rank=768,
        qk_rope_head_dim=64,
        v_head_dim=256,
        qk_nope_head_dim=192,
        n_group=1,
        topk_group=1,
        num_experts_per_tok=4,
        norm_topk_prob=True,
        topk_method="noaux_tc",
        first_k_dense_replace=1,
        num_nextn_predict_layers=1,
        hidden_act="silu",
        max_position_embeddings=202752,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=1000000,
        rope_scaling=None,
        rope_interleave=True,
        attention_bias=False,
        attention_dropout=0.0,
        scoring_func="sigmoid",
        mlp_layer_types=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.head_dim = qk_rope_head_dim  # Used for RoPE computation
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.norm_topk_prob = norm_topk_prob
        self.topk_method = topk_method
        self.first_k_dense_replace = first_k_dense_replace
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.rope_interleave = rope_interleave
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.scoring_func = scoring_func
        # MLP layer types: the first `first_k_dense_replace` layers are dense,
        # the remaining layers are sparse (MoE).
        if mlp_layer_types is not None:
            self.mlp_layer_types = mlp_layer_types
        else:
            self.mlp_layer_types = (
                ["dense"] * first_k_dense_replace
                + ["sparse"] * (num_hidden_layers - first_k_dense_replace)
            )
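        # With the defaults above (first_k_dense_replace=1, num_hidden_layers=47)
        # this yields ["dense"] + ["sparse"] * 46.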
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
__all__ = ["Glm4MoeLiteSCMConfig"]
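
# A minimal usage sketch (an illustration only, not part of the upstream transformers
# API): instantiate the config with its defaults and inspect the fields derived in
# __init__, namely qk_head_dim and mlp_layer_types.
if __name__ == "__main__":
    config = Glm4MoeLiteSCMConfig()
    # qk_head_dim = qk_nope_head_dim + qk_rope_head_dim = 192 + 64 = 256
    print("qk_head_dim:", config.qk_head_dim)
    # With the defaults: 1 dense layer followed by 46 sparse (MoE) layers.
    print(
        "mlp layers:",
        config.mlp_layer_types.count("dense"), "dense /",
        config.mlp_layer_types.count("sparse"), "sparse",
    )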