| """ |
| MiniMind Max2 Model Configuration |
| Inspired by MiniMax M2's efficient activated parameters design |
| """ |
|
|
| from dataclasses import dataclass |
| from typing import Optional, Dict, Any |
|
|
|
|
| @dataclass |
| class Max2Config: |
| """Configuration for MiniMind Max2 models.""" |

    # Model identity
    model_name: str = "max2-lite"
    model_version: str = "1.0.0"

    # Transformer backbone
    hidden_size: int = 1536
    intermediate_size: int = 4096
    num_hidden_layers: int = 24
    num_attention_heads: int = 12
    num_key_value_heads: int = 3  # grouped-query attention (GQA)

    # Vocabulary and positions
    vocab_size: int = 32000
    max_position_embeddings: int = 8192
    rope_theta: float = 10000.0

    # Mixture-of-experts FFN
    use_moe: bool = True
    num_experts: int = 8
    num_experts_per_tok: int = 2  # experts activated per token (top-k routing)
    expert_hidden_size: int = 1024
    router_aux_loss_coef: float = 0.01  # weight of the router load-balancing loss

    # Normalization and activation
    rms_norm_eps: float = 1e-6
    hidden_act: str = "silu"

    # Dropout
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0

    # Special token ids
    pad_token_id: int = 0
    bos_token_id: int = 1
    eos_token_id: int = 2

    # Weight initialization
    initializer_range: float = 0.02

    # Runtime options
    use_cache: bool = True
    use_flash_attention: bool = True
    gradient_checkpointing: bool = False
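
    def __post_init__(self) -> None:
        # Minimal sanity-check sketch for invariants assumed elsewhere in
        # this module: estimate_params() derives head_dim by integer
        # division, and grouped-query attention requires the query heads to
        # split evenly across the key/value heads.
        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError("hidden_size must be divisible by num_attention_heads")
        if self.num_attention_heads % self.num_key_value_heads != 0:
            raise ValueError("num_attention_heads must be divisible by num_key_value_heads")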

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the configuration to a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any]) -> "Max2Config":
        """Build a configuration from a dict, ignoring unknown keys."""
        return cls(**{k: v for k, v in config_dict.items() if k in cls.__dataclass_fields__})
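
# Example round trip (hypothetical override values):
#   cfg = Max2Config(model_name="max2-custom", hidden_size=768)
#   assert Max2Config.from_dict(cfg.to_dict()) == cfg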


MAX2_CONFIGS = {
    "max2-nano": Max2Config(
        model_name="max2-nano",
        hidden_size=768,
        intermediate_size=2048,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_key_value_heads=3,
        num_experts=4,
        num_experts_per_tok=1,
        expert_hidden_size=512,
        max_position_embeddings=4096,
    ),
    "max2-lite": Max2Config(
        model_name="max2-lite",
        hidden_size=1536,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=12,
        num_key_value_heads=3,
        num_experts=8,
        num_experts_per_tok=2,
        expert_hidden_size=1024,
        max_position_embeddings=8192,
    ),
    "max2-pro": Max2Config(
        model_name="max2-pro",
        hidden_size=2560,
        intermediate_size=6912,
        num_hidden_layers=32,
        num_attention_heads=20,
        num_key_value_heads=4,
        num_experts=8,
        num_experts_per_tok=2,
        expert_hidden_size=1728,
        max_position_embeddings=16384,
    ),
}


# Backwards-compatible aliases.
Mind2Config = Max2Config
MIND2_CONFIGS = MAX2_CONFIGS


def get_config(model_name: str) -> Max2Config:
    """Get a predefined configuration by name."""
    if model_name not in MAX2_CONFIGS:
        raise ValueError(f"Unknown model: {model_name}. Available: {list(MAX2_CONFIGS.keys())}")
    return MAX2_CONFIGS[model_name]


def estimate_params(config: Max2Config) -> Dict[str, Any]:
    """Estimate total and activated parameter counts for a configuration."""
    embed_params = config.vocab_size * config.hidden_size
    head_dim = config.hidden_size // config.num_attention_heads

    # Attention projections (no biases assumed). Q and O are square, while
    # K and V shrink with the number of key/value heads under grouped-query
    # attention.
    q_params = config.hidden_size * config.hidden_size
    kv_params = 2 * config.hidden_size * (config.num_key_value_heads * head_dim)
    o_params = config.hidden_size * config.hidden_size
    attn_params_per_layer = q_params + kv_params + o_params

    # FFN parameters. The factor of 3 covers the gate/up/down projections of
    # a gated (SwiGLU-style) block. With MoE, every expert counts toward the
    # total, but only the top-k routed experts count as activated.
    if config.use_moe:
        router_params = config.hidden_size * config.num_experts
        expert_params = 3 * config.hidden_size * config.expert_hidden_size
        ffn_params_per_layer = router_params + (config.num_experts * expert_params)
        active_ffn_params = router_params + (config.num_experts_per_tok * expert_params)
    else:
        ffn_params_per_layer = 3 * config.hidden_size * config.intermediate_size
        active_ffn_params = ffn_params_per_layer

    # Two RMSNorms per layer (pre-attention and pre-FFN).
    norm_params_per_layer = 2 * config.hidden_size
    layer_params = attn_params_per_layer + ffn_params_per_layer + norm_params_per_layer
    active_layer_params = attn_params_per_layer + active_ffn_params + norm_params_per_layer

    # Embeddings are counted twice: once for the input embedding and once
    # for an untied output head.
    total_params = embed_params + (config.num_hidden_layers * layer_params) + embed_params
    active_params = embed_params + (config.num_hidden_layers * active_layer_params) + embed_params

    return {
        "total_params": total_params,
        "active_params": active_params,
        "activation_ratio": active_params / total_params,
        "total_params_b": total_params / 1e9,
        "active_params_b": active_params / 1e9,
        "estimated_size_fp16_gb": (total_params * 2) / (1024**3),
        "estimated_size_int4_gb": (total_params * 0.5) / (1024**3),
    }
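

if __name__ == "__main__":
    # Illustrative smoke test (uses only names defined in this module):
    # print the total-vs-activated parameter trade-off for each preset.
    for name in MAX2_CONFIGS:
        stats = estimate_params(get_config(name))
        print(
            f"{name}: total={stats['total_params_b']:.2f}B, "
            f"active={stats['active_params_b']:.2f}B "
            f"({stats['activation_ratio']:.0%} activated)"
        )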