#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import Optional

from transformers import PretrainedConfig


class MiniMindConfig(PretrainedConfig):
    model_type = "minimind"

    def __init__(
            self,
            dropout: float = 0.0,
            bos_token_id: int = 1,
            eos_token_id: int = 2,
            hidden_act: str = 'silu',
            hidden_size: int = 512,
            intermediate_size: Optional[int] = None,
            max_position_embeddings: int = 32768,
            num_attention_heads: int = 8,
            num_hidden_layers: int = 8,
            num_key_value_heads: int = 2,
            vocab_size: int = 6400,
            rms_norm_eps: float = 1e-05,
            rope_theta: float = 1000000.0,
            inference_rope_scaling: bool = False,
            flash_attn: bool = True,
            ####################################################
            # MoE-specific configuration.
            # When use_moe is False, the fields below are ignored.
            ####################################################
            use_moe: bool = False,
            num_experts_per_tok: int = 2,
            n_routed_experts: int = 4,
            n_shared_experts: int = 1,
            scoring_func: str = 'softmax',
            aux_loss_alpha: float = 0.01,
            seq_aux: bool = True,
            norm_topk_prob: bool = True,
            **kwargs
    ):
        super().__init__(**kwargs)
        self.dropout = dropout
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.hidden_act = hidden_act
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.num_key_value_heads = num_key_value_heads
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.inference_rope_scaling = inference_rope_scaling
        # Extrapolated length = factor * original_max_position_embeddings = 16 * 2048 = 32768
        self.rope_scaling = {
            "beta_fast": 32,
            "beta_slow": 1,
            "factor": 16,
            "original_max_position_embeddings": 2048,
            "attention_factor": 1.0,
            "type": "yarn"
        } if self.inference_rope_scaling else None
        self.flash_attn = flash_attn
        ####################################################
        # MoE-specific configuration.
        # When use_moe is False, the fields below are ignored.
        ####################################################
        self.use_moe = use_moe
        self.num_experts_per_tok = num_experts_per_tok  # number of experts selected per token
        self.n_routed_experts = n_routed_experts  # total number of routed experts
        self.n_shared_experts = n_shared_experts  # number of shared experts
        self.scoring_func = scoring_func  # routing score function, 'softmax' by default
        self.aux_loss_alpha = aux_loss_alpha  # weight of the auxiliary load-balancing loss
        self.seq_aux = seq_aux  # whether to compute the auxiliary loss at the sequence level
        self.norm_topk_prob = norm_topk_prob  # whether to normalize the top-k probabilities
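
# Hedged illustration (not part of the original file; the helper name is
# hypothetical): with grouped-query attention, the num_attention_heads query
# heads share the num_key_value_heads key/value heads, so each KV head serves
# a group of query heads. With the defaults above: 8 // 2 = 4 queries per KV head.
def gqa_queries_per_kv_head(config: MiniMindConfig) -> int:
    assert config.num_attention_heads % config.num_key_value_heads == 0
    return config.num_attention_heads // config.num_key_value_heads
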

if __name__ == "__main__":
    pass
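    # Hedged usage sketch (not part of the original file): build a config with
    # YaRN scaling enabled and check the extrapolation arithmetic from the
    # comment above, factor * original_max_position_embeddings = 16 * 2048 = 32768.
    cfg = MiniMindConfig(inference_rope_scaling=True)
    assert cfg.rope_scaling is not None
    assert (cfg.rope_scaling["factor"]
            * cfg.rope_scaling["original_max_position_embeddings"]
            == cfg.max_position_embeddings)

    # MoE variant: per token, num_experts_per_tok=2 of the n_routed_experts=4
    # routed experts are active, alongside n_shared_experts=1 always-on expert.
    moe_cfg = MiniMindConfig(use_moe=True)
    print(moe_cfg.num_experts_per_tok, moe_cfg.n_routed_experts, moe_cfg.n_shared_experts)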