from transformers import PretrainedConfig


class QMoEConfig(PretrainedConfig):
    """Configuration for a decoder-only Mixture-of-Experts language model."""

    model_type = "qmoe"

    def __init__(
        self,
        vocab_size=50257,   # GPT-2 BPE vocabulary size
        d_model=768,        # hidden (embedding) dimension
        num_layers=12,      # number of transformer blocks
        num_heads=16,       # attention heads per layer (768 / 16 = 48-dim heads)
        max_seq_len=512,    # maximum context length
        num_experts=8,      # experts per MoE layer
        moe_top_k=2,        # experts routed to per token
        ffn_dim=2048,       # hidden dimension of each expert's FFN
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.max_seq_len = max_seq_len
        self.num_experts = num_experts
        self.moe_top_k = moe_top_k
        self.ffn_dim = ffn_dim
        # Fixed decoder-only settings; set after super().__init__ so they
        # always hold these values, even if overrides appear in kwargs.
        self.is_decoder = True
        self.add_cross_attention = False
        self.use_cache = False
        self.tie_word_embeddings = False
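
# Because QMoEConfig subclasses PretrainedConfig, it inherits the standard
# Hugging Face JSON serialization API for free. A minimal sketch of a
# save/load round trip; "qmoe-demo" is a hypothetical directory name.
config = QMoEConfig(num_experts=16)
config.save_pretrained("qmoe-demo")  # writes qmoe-demo/config.json
reloaded = QMoEConfig.from_pretrained("qmoe-demo")
assert reloaded.num_experts == 16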