#!/usr/bin/env python3
"""AETHER-Micro configuration (Hugging Face standard).

Defines every AETHER-Micro hyperparameter as an HF ``PretrainedConfig``
subclass so the model interoperates with the ``transformers`` ecosystem
(and Unsloth). All custom features — Wu-Xing routing, heterogeneous MoE,
the Latent Thought Loop and RLP — are preserved as first-class config
fields.
"""

from transformers import PretrainedConfig


class AETHERMicroConfig(PretrainedConfig):
    """Configuration class for the AETHER-Micro model.

    Every custom field is stored as a plain instance attribute, so the
    inherited ``to_dict`` / ``save_pretrained`` / ``from_pretrained``
    machinery serializes and restores all of them without extra code.

    Args (defaults follow the 0.5B validation strategy):
        vocab_size: Vocabulary size (64K tokenizer). Default 64000.
        hidden_size: Hidden dimension. Default 1024 (raised from 896).
        intermediate_size: FFN inner dimension (hidden_size x 4).
            Default 4096.
        num_hidden_layers: Number of transformer layers. Default 24.
        num_attention_heads: Attention heads. Default 16 (raised from 14).
        num_key_value_heads: GQA key/value heads. Default 4 (GQA-4).
        max_position_embeddings: Maximum sequence length. Default 8192.
        rms_norm_eps: RMSNorm epsilon. Default 1e-6.
        rope_theta: RoPE (rotary position embedding) theta. Default 10000.0.
        attention_dropout: Attention dropout rate. Default 0.0.

        enable_hetero_moe: Enable heterogeneous MoE. Default True.
        num_大_experts: Large experts (2048 dim; one per Wu-Xing agent
            金/水/木/火/土). Default 5.
        num_小_experts: Small experts (1024 dim; three per agent).
            Default 15.
        num_shared_experts: Always-active shared experts (1536 dim).
            Default 2.
        top_k: MoE top-k routing count. Default 2.

        enable_wuxing: Enable the Wu-Xing (five-elements) router.
        enable_magic_init: Enable Magic Square initialization.
        enable_annealing: Enable the Wu-Xing annealing scheduler, which
            interpolates the router mixture from
            ``(alpha_start, beta_start, gamma_start)`` (warmup phase) to
            ``(alpha_end, beta_end, gamma_end)`` (mature phase). Each
            triple weights the Magic Square / learned matrix / history
            buffer and must sum to 1.

        enable_latent_thought: Enable the Latent Thought Loop.
        num_latents: Number of latent tokens. Default 8.
        latent_dim: Latent dimension (must be < hidden_size). Default 512.
        max_k: Maximum thinking depth (0: direct, 1: shallow, 2: deep).

        enable_rlp: Enable RLP (Reinforcement as Pretraining). Off by
            default; enable explicitly for training. Related knobs:
            ``rlp_quality_weight``, ``rlp_info_gain_clip`` (information
            gain clipping range), ``rlp_ntp_weight`` (next-token
            prediction loss weight), ``rlp_target_reward_weight``
            (annealed 0 -> 0.3), ``rlp_warmup_steps`` (~10% of total).

        enable_self_eval: Enable the Self-Evaluation head (required when
            RLP training is on).
        self_eval_dims: Quality dimensions (factuality, coherence,
            completeness, specificity). Default 4.

    Example:
        ```python
        from transformers import AutoConfig

        # Default configuration
        config = AETHERMicroConfig()

        # Custom configuration
        config = AETHERMicroConfig(
            vocab_size=64000,
            hidden_size=1024,
            num_hidden_layers=24,
            enable_wuxing=True,
            enable_hetero_moe=True,
            enable_latent_thought=True,
            enable_rlp=False,
        )

        # Save / load round-trip
        config.save_pretrained("./aether-micro-config")
        config = AutoConfig.from_pretrained("./aether-micro-config")
        ```
    """

    model_type = "aether_micro"

    def __init__(
        self,
        # ========================================
        # Base architecture
        # ========================================
        vocab_size: int = 64000,
        hidden_size: int = 1024,            # 0.5B validation strategy: 896 -> 1024
        intermediate_size: int = 4096,      # hidden_size x 4
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,      # 0.5B validation strategy: 14 -> 16
        num_key_value_heads: int = 4,       # GQA-4 (0.5B validation strategy)
        max_position_embeddings: int = 8192,
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 10000.0,
        attention_dropout: float = 0.0,
        # ========================================
        # Tokenizer & generation
        # ========================================
        use_cache: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        # ========================================
        # Heterogeneous MoE
        # ========================================
        enable_hetero_moe: bool = True,
        num_大_experts: int = 5,
        num_小_experts: int = 15,
        num_shared_experts: int = 2,
        大_intermediate_size: int = 2048,
        小_intermediate_size: int = 1024,
        shared_intermediate_size: int = 1536,
        top_k: int = 2,
        num_experts_per_tok: int = 2,
        # ========================================
        # Wu-Xing router
        # ========================================
        enable_wuxing: bool = True,
        enable_magic_init: bool = True,
        # ========================================
        # Wu-Xing annealing
        # ========================================
        enable_annealing: bool = True,
        alpha_start: float = 0.5,
        beta_start: float = 0.3,
        gamma_start: float = 0.2,
        alpha_end: float = 0.1,
        beta_end: float = 0.2,
        gamma_end: float = 0.7,
        # ========================================
        # Latent Thought Loop
        # ========================================
        enable_latent_thought: bool = True,
        num_latents: int = 8,
        latent_dim: int = 512,
        max_k: int = 2,
        # ========================================
        # RLP (Reinforcement as Pretraining)
        # ========================================
        enable_rlp: bool = False,
        rlp_quality_weight: float = 1.0,
        rlp_info_gain_clip: float = 5.0,
        rlp_ntp_weight: float = 0.7,
        rlp_target_reward_weight: float = 0.3,
        rlp_warmup_steps: int = 1500,
        # ========================================
        # Quality head (Block 3)
        # ========================================
        enable_quality_head: bool = True,
        quality_head_dim: int = 4,          # coherence, relevance, specificity, helpfulness
        # ========================================
        # Self-Evaluation head
        # ========================================
        enable_self_eval: bool = True,
        self_eval_dims: int = 4,
        # ========================================
        # MTP loss (multi-token prediction) (Block 5)
        # ========================================
        enable_mtp_loss: bool = True,
        mtp_num_predictions: int = 4,       # predict the next 4 tokens
        # ========================================
        # Magic Square data scheduler (Block 6)
        # ========================================
        enable_magic_square: bool = True,
        **kwargs,
    ):
        # NOTE(review): every attribute is assigned exactly once here.
        # The previous implementation assigned each field, popped the
        # same names from **kwargs (dead code: named parameters never
        # land in **kwargs), then re-assigned almost everything after
        # super().__init__() — and the re-assignment pass silently
        # skipped the quality-head / MTP / magic-square fields.

        # Base architecture
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # Generation behavior (HF reads this attribute directly)
        self.use_cache = use_cache

        # Heterogeneous MoE
        self.enable_hetero_moe = enable_hetero_moe
        self.num_大_experts = num_大_experts
        self.num_小_experts = num_小_experts
        self.num_shared_experts = num_shared_experts
        self.大_intermediate_size = 大_intermediate_size
        self.小_intermediate_size = 小_intermediate_size
        self.shared_intermediate_size = shared_intermediate_size
        self.top_k = top_k
        self.num_experts_per_tok = num_experts_per_tok

        # Wu-Xing router
        self.enable_wuxing = enable_wuxing
        self.enable_magic_init = enable_magic_init

        # Wu-Xing annealing
        self.enable_annealing = enable_annealing
        self.alpha_start = alpha_start
        self.beta_start = beta_start
        self.gamma_start = gamma_start
        self.alpha_end = alpha_end
        self.beta_end = beta_end
        self.gamma_end = gamma_end

        # Latent Thought Loop
        self.enable_latent_thought = enable_latent_thought
        self.num_latents = num_latents
        self.latent_dim = latent_dim
        self.max_k = max_k

        # RLP
        self.enable_rlp = enable_rlp
        self.rlp_quality_weight = rlp_quality_weight
        self.rlp_info_gain_clip = rlp_info_gain_clip
        self.rlp_ntp_weight = rlp_ntp_weight
        self.rlp_target_reward_weight = rlp_target_reward_weight
        self.rlp_warmup_steps = rlp_warmup_steps

        # Quality head (Block 3)
        self.enable_quality_head = enable_quality_head
        self.quality_head_dim = quality_head_dim

        # Self-Evaluation head
        self.enable_self_eval = enable_self_eval
        self.self_eval_dims = self_eval_dims

        # MTP loss (Block 5)
        self.enable_mtp_loss = enable_mtp_loss
        self.mtp_num_predictions = mtp_num_predictions

        # Magic Square data scheduler (Block 6)
        self.enable_magic_square = enable_magic_square

        # Token ids and embedding tying are owned by PretrainedConfig:
        # forward them as keyword arguments per the HF convention.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    # NOTE(review): no to_dict override is needed. PretrainedConfig.to_dict()
    # serializes the instance __dict__, which already contains every custom
    # field set above; the old override duplicated that work (and omitted
    # the quality-head / MTP / magic-square fields).


# ========================================
# Predefined presets
# ========================================

def get_aether_micro_config_full():
    """Full preset: every custom feature enabled.

    Follows the 0.5B validation strategy: 1024 hidden, 16 heads.
    """
    return AETHERMicroConfig(
        vocab_size=64000,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=4,
        max_position_embeddings=8192,
        enable_wuxing=True,
        enable_magic_init=True,
        enable_hetero_moe=True,
        enable_latent_thought=True,
        enable_rlp=True,
        enable_annealing=True,
        num_大_experts=5,
        num_小_experts=15,
        num_shared_experts=2,
        top_k=2,
        num_latents=8,
        latent_dim=512,
        max_k=2,
        alpha_start=0.5,
        beta_start=0.3,
        gamma_start=0.2,
        alpha_end=0.1,
        beta_end=0.2,
        gamma_end=0.7,
    )


def get_aether_micro_config_baseline():
    """Baseline preset: all custom features disabled.

    0.5B validation strategy, Ablation Run 0.
    """
    return AETHERMicroConfig(
        vocab_size=64000,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=4,
        max_position_embeddings=8192,
        enable_wuxing=False,
        enable_magic_init=False,
        enable_hetero_moe=False,
        enable_latent_thought=False,
        enable_rlp=False,
        enable_annealing=False,
        # Baseline uses uniform experts (20 x 1536 dim)
        num_大_experts=0,
        num_小_experts=20,
        num_shared_experts=2,
        top_k=2,
    )


# ========================================
# Configuration validation
# ========================================

def validate_aether_micro_config(config: AETHERMicroConfig) -> bool:
    """Validate an AETHER-Micro configuration.

    Args:
        config: An ``AETHERMicroConfig`` instance.

    Returns:
        bool: True when validation succeeds.

    Raises:
        ValueError: If an invalid setting is detected.
    """
    # 1. Attention heads must evenly divide the hidden dimension.
    if config.hidden_size % config.num_attention_heads != 0:
        raise ValueError(
            f"hidden_size ({config.hidden_size}) must be divisible by "
            f"num_attention_heads ({config.num_attention_heads})"
        )

    # 2. Wu-Xing annealing: both the start and end mixtures must be
    #    convex combinations (each weight in [0, 1], triple sums to 1).
    #    (Fix: the original checked only the start triple.)
    if config.enable_annealing:
        triples = (
            ("start", config.alpha_start, config.beta_start, config.gamma_start),
            ("end", config.alpha_end, config.beta_end, config.gamma_end),
        )
        for phase, alpha, beta, gamma in triples:
            for name, value in (("alpha", alpha), ("beta", beta), ("gamma", gamma)):
                if not (0 <= value <= 1):
                    raise ValueError(
                        f"{name}_{phase} must be in [0, 1], got {value}"
                    )
            total = alpha + beta + gamma
            if abs(total - 1.0) > 1e-6:
                raise ValueError(
                    f"alpha_{phase} + beta_{phase} + gamma_{phase} must sum to 1.0, "
                    f"got {total}"
                )

    # 3. Heterogeneous MoE: one large expert per Wu-Xing agent, and the
    #    small experts must split evenly across the five agents.
    if config.enable_hetero_moe:
        if config.num_大_experts != 5:
            raise ValueError(
                f"Heterogeneous MoE requires num_大_experts=5 (one per Wu-Xing Agent), "
                f"got {config.num_大_experts}"
            )
        if config.num_小_experts % 5 != 0:
            raise ValueError(
                f"num_小_experts must be divisible by 5 (for 5 Agents), "
                f"got {config.num_小_experts}"
            )

    # 4. Latent Thought Loop
    if config.enable_latent_thought:
        if config.latent_dim >= config.hidden_size:
            raise ValueError(
                f"latent_dim ({config.latent_dim}) must be less than "
                f"hidden_size ({config.hidden_size})"
            )
        if config.max_k < 0 or config.max_k > 2:
            raise ValueError(f"max_k must be in [0, 2], got {config.max_k}")

    # 5. RLP requires the self-evaluation head and a bounded loss mix.
    if config.enable_rlp:
        if not config.enable_self_eval:
            raise ValueError("RLP requires enable_self_eval=True")
        if config.rlp_ntp_weight + config.rlp_target_reward_weight > 1.0:
            raise ValueError(
                f"rlp_ntp_weight + rlp_target_reward_weight must be <= 1.0, "
                f"got {config.rlp_ntp_weight + config.rlp_target_reward_weight}"
            )

    return True


if __name__ == "__main__":
    # Smoke test
    print("=" * 80)
    print("AETHER-Micro Configuration Test")
    print("=" * 80)
    print()

    # Create the full preset
    config_full = get_aether_micro_config_full()
    print("✅ Full config created")
    print(f"   Model type: {config_full.model_type}")
    print(f"   Parameters: {config_full.vocab_size} vocab, {config_full.hidden_size} hidden")
    print(f"   Wu-Xing: {config_full.enable_wuxing}")
    print(f"   Hetero MoE: {config_full.enable_hetero_moe} (大{config_full.num_大_experts} + 小{config_full.num_小_experts})")
    print(f"   Latent Thought: {config_full.enable_latent_thought}")
    print(f"   RLP: {config_full.enable_rlp}")
    print()

    # Create the baseline preset
    config_baseline = get_aether_micro_config_baseline()
    print("✅ Baseline config created")
    print(f"   All custom features: {config_baseline.enable_wuxing}")
    print()

    # Validation
    validate_aether_micro_config(config_full)
    print("✅ Full config validation passed")
    validate_aether_micro_config(config_baseline)
    print("✅ Baseline config validation passed")
    print()

    # Save test
    config_full.save_pretrained("/tmp/aether-micro-config-test")
    print("✅ Config saved to /tmp/aether-micro-config-test")
    print()

    # Load test. Register the custom model_type so AutoConfig can resolve
    # it (fix: an unregistered model_type would make from_pretrained fail).
    from transformers import AutoConfig

    AutoConfig.register("aether_micro", AETHERMicroConfig)
    config_loaded = AutoConfig.from_pretrained("/tmp/aether-micro-config-test", trust_remote_code=True)
    print("✅ Config loaded from /tmp/aether-micro-config-test")
    print(f"   Loaded model type: {config_loaded.model_type}")
    print()

    print("=" * 80)
    print("All tests passed!")
    print("=" * 80)