# NOTE: The four lines below are Hugging Face Hub page residue that was
# accidentally captured with the download; they are kept as comments for
# provenance so the module parses as valid Python.
# AETHER-Micro-0.5B / configuration_aether_micro.py
# Be2Jay's picture
# Upload AETHER-Micro 0.5B Phase 1 checkpoint (Step 57000)
# de40e7d verified
#!/usr/bin/env python3
"""
AETHER-Micro Configuration (Hugging Face Standard)
기존 AETHER-Micro의 모든 하이퍼파라미터를 HF PretrainedConfig 형식으로 정의합니다.
Wu-Xing, Heterogeneous MoE, Latent Thought Loop, RLP 등 모든 커스텀 기능을 보존합니다.
"""
from transformers import PretrainedConfig
from typing import Optional
class AETHERMicroConfig(PretrainedConfig):
    """
    Configuration class for the AETHER-Micro model.

    Inherits Hugging Face ``PretrainedConfig`` and stores every AETHER-Micro
    hyperparameter, preserving all custom features: Wu-Xing router,
    heterogeneous MoE, latent thought loop, RLP, quality head, MTP loss and
    the magic-square data scheduler. Fully compatible with Unsloth.

    Args:
        vocab_size (int, defaults to 64000): vocabulary size (64K tokenizer).
        hidden_size (int, defaults to 1024): hidden dimension
            (0.5B validation strategy: 896 -> 1024).
        intermediate_size (int, defaults to 4096): FFN intermediate dimension
            (hidden_size x 4).
        num_hidden_layers (int, defaults to 24): number of transformer layers.
        num_attention_heads (int, defaults to 16): number of attention heads
            (0.5B validation strategy: 14 -> 16).
        num_key_value_heads (int, defaults to 4): K/V heads for GQA
            (grouped query attention).
        max_position_embeddings (int, defaults to 8192): maximum sequence length.
        rms_norm_eps (float, defaults to 1e-6): RMSNorm epsilon.
        rope_theta (float, defaults to 10000.0): RoPE (rotary position
            embedding) theta.
        attention_dropout (float, defaults to 0.0): attention dropout rate.
        use_cache (bool, defaults to True): whether to use the KV cache.
        pad_token_id / bos_token_id / eos_token_id (int, defaults 0 / 1 / 2):
            special token ids.
        tie_word_embeddings (bool, defaults to False): tie input and output
            embeddings.
        enable_hetero_moe (bool, defaults to True): enable heterogeneous MoE.
        num_大_experts (int, defaults to 5): number of large experts
            (one controlled by each Wu-Xing agent: metal/water/wood/fire/earth).
        num_小_experts (int, defaults to 15): number of small experts
            (three controlled by each agent).
        num_shared_experts (int, defaults to 2): shared experts, always active.
        大_intermediate_size (int, defaults to 2048): large-expert FFN dim.
        小_intermediate_size (int, defaults to 1024): small-expert FFN dim.
        shared_intermediate_size (int, defaults to 1536): shared-expert FFN dim.
        top_k (int, defaults to 2): MoE top-k routing count.
        num_experts_per_tok (int, defaults to 2): experts activated per token.
        enable_wuxing (bool, defaults to True): enable the Wu-Xing (five
            elements) router.
        enable_magic_init (bool, defaults to True): enable magic-square
            initialization.
        enable_annealing (bool, defaults to True): enable the Wu-Xing
            annealing scheduler.
        alpha_start / beta_start / gamma_start (float, defaults 0.5/0.3/0.2):
            warmup-phase weights for magic square / learned matrix /
            history buffer.
        alpha_end / beta_end / gamma_end (float, defaults 0.1/0.2/0.7):
            mature-phase weights for the same three components.
        enable_latent_thought (bool, defaults to True): enable the latent
            thought loop.
        num_latents (int, defaults to 8): number of latent tokens.
        latent_dim (int, defaults to 512): latent dimension (smaller than
            hidden_size).
        max_k (int, defaults to 2): maximum thinking depth
            (K=0: direct, K=1: shallow, K=2: deep).
        enable_rlp (bool, defaults to False): enable RLP (Reinforcement as
            Pretraining); off by default, enabled explicitly for training.
        rlp_quality_weight (float, defaults to 1.0): RLP quality weight.
        rlp_info_gain_clip (float, defaults to 5.0): information-gain
            clipping range.
        rlp_ntp_weight (float, defaults to 0.7): next-token-prediction loss
            weight.
        rlp_target_reward_weight (float, defaults to 0.3): target RLP reward
            weight (annealed 0 -> 0.3).
        rlp_warmup_steps (int, defaults to 1500): RLP reward warmup steps
            (~10% of training).
        enable_quality_head (bool, defaults to True): enable the quality head
            (coherence, relevance, specificity, helpfulness).
        quality_head_dim (int, defaults to 4): number of quality dimensions.
        enable_self_eval (bool, defaults to True): enable the self-evaluation
            head (required when RLP training is on).
        self_eval_dims (int, defaults to 4): quality dimensions
            (factuality, coherence, completeness, specificity).
        enable_mtp_loss (bool, defaults to True): enable multi-token
            prediction loss.
        mtp_num_predictions (int, defaults to 4): number of future tokens
            predicted by MTP.
        enable_magic_square (bool, defaults to True): enable the magic-square
            data scheduler.

    Example:
        ```python
        from transformers import AutoConfig

        # Default configuration
        config = AETHERMicroConfig()

        # Custom configuration
        config = AETHERMicroConfig(
            vocab_size=64000,
            hidden_size=1024,
            num_hidden_layers=24,
            enable_wuxing=True,
            enable_hetero_moe=True,
            enable_latent_thought=True,
            enable_rlp=False,
        )

        # Save and reload
        config.save_pretrained("./aether-micro-config")
        config = AutoConfig.from_pretrained("./aether-micro-config")
        ```
    """

    model_type = "aether_micro"

    def __init__(
        self,
        # Base architecture
        vocab_size: int = 64000,
        hidden_size: int = 1024,  # 0.5B validation strategy: 896 -> 1024
        intermediate_size: int = 4096,  # hidden_size x 4
        num_hidden_layers: int = 24,
        num_attention_heads: int = 16,  # 0.5B validation strategy: 14 -> 16
        num_key_value_heads: int = 4,  # GQA-4 (0.5B validation strategy)
        max_position_embeddings: int = 8192,
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 10000.0,
        attention_dropout: float = 0.0,
        # Tokenizer & generation
        use_cache: bool = True,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        # Heterogeneous MoE
        enable_hetero_moe: bool = True,
        num_大_experts: int = 5,
        num_小_experts: int = 15,
        num_shared_experts: int = 2,
        大_intermediate_size: int = 2048,
        小_intermediate_size: int = 1024,
        shared_intermediate_size: int = 1536,
        top_k: int = 2,
        num_experts_per_tok: int = 2,
        # Wu-Xing router
        enable_wuxing: bool = True,
        enable_magic_init: bool = True,
        # Wu-Xing annealing
        enable_annealing: bool = True,
        alpha_start: float = 0.5,
        beta_start: float = 0.3,
        gamma_start: float = 0.2,
        alpha_end: float = 0.1,
        beta_end: float = 0.2,
        gamma_end: float = 0.7,
        # Latent thought loop
        enable_latent_thought: bool = True,
        num_latents: int = 8,
        latent_dim: int = 512,
        max_k: int = 2,
        # RLP (Reinforcement as Pretraining)
        enable_rlp: bool = False,
        rlp_quality_weight: float = 1.0,
        rlp_info_gain_clip: float = 5.0,
        rlp_ntp_weight: float = 0.7,
        rlp_target_reward_weight: float = 0.3,
        rlp_warmup_steps: int = 1500,
        # Quality head (Block 3)
        enable_quality_head: bool = True,
        quality_head_dim: int = 4,  # Coherence, Relevance, Specificity, Helpfulness
        # Self-evaluation head
        enable_self_eval: bool = True,
        self_eval_dims: int = 4,
        # MTP loss (Multi-Token Prediction) (Block 5)
        enable_mtp_loss: bool = True,
        mtp_num_predictions: int = 4,  # predict the next 4 tokens
        # Magic-square data scheduler (Block 6)
        enable_magic_square: bool = True,
        **kwargs,
    ):
        # Base architecture
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.use_cache = use_cache
        # Heterogeneous MoE
        self.enable_hetero_moe = enable_hetero_moe
        self.num_大_experts = num_大_experts
        self.num_小_experts = num_小_experts
        self.num_shared_experts = num_shared_experts
        self.大_intermediate_size = 大_intermediate_size
        self.小_intermediate_size = 小_intermediate_size
        self.shared_intermediate_size = shared_intermediate_size
        self.top_k = top_k
        self.num_experts_per_tok = num_experts_per_tok
        # Wu-Xing router
        self.enable_wuxing = enable_wuxing
        self.enable_magic_init = enable_magic_init
        # Wu-Xing annealing
        self.enable_annealing = enable_annealing
        self.alpha_start = alpha_start
        self.beta_start = beta_start
        self.gamma_start = gamma_start
        self.alpha_end = alpha_end
        self.beta_end = beta_end
        self.gamma_end = gamma_end
        # Latent thought loop
        self.enable_latent_thought = enable_latent_thought
        self.num_latents = num_latents
        self.latent_dim = latent_dim
        self.max_k = max_k
        # RLP
        self.enable_rlp = enable_rlp
        self.rlp_quality_weight = rlp_quality_weight
        self.rlp_info_gain_clip = rlp_info_gain_clip
        self.rlp_ntp_weight = rlp_ntp_weight
        self.rlp_target_reward_weight = rlp_target_reward_weight
        self.rlp_warmup_steps = rlp_warmup_steps
        # Quality head (Block 3)
        self.enable_quality_head = enable_quality_head
        self.quality_head_dim = quality_head_dim
        # Self-evaluation head
        self.enable_self_eval = enable_self_eval
        self.self_eval_dims = self_eval_dims
        # MTP loss (Block 5)
        self.enable_mtp_loss = enable_mtp_loss
        self.mtp_num_predictions = mtp_num_predictions
        # Magic-square data scheduler (Block 6)
        self.enable_magic_square = enable_magic_square

        # Canonical HF pattern: route the generation-related settings through
        # the parent constructor. PretrainedConfig.__init__ pops these from
        # its kwargs and assigns them itself, so setting them only as plain
        # attributes beforehand would be silently overwritten (the old code
        # worked around this by re-assigning every attribute after the super
        # call — and missed some). Our custom parameters are explicit named
        # parameters of this __init__, so they can never appear in **kwargs
        # and need no filtering before the super call.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def to_dict(self):
        """Serialize the configuration to a plain dict (used when saving).

        Every AETHER-Micro hyperparameter is stored as an instance attribute
        in ``__init__``, so the parent implementation already includes all of
        them; re-inserting each key by hand (as the previous version did) was
        redundant.
        """
        return super().to_dict()
# ========================================
# Predefined configurations (presets)
# ========================================
def get_aether_micro_config_full():
    """Build the full preset with every custom feature enabled.

    Follows the 0.5B validation strategy: 1024 hidden dim, 16 heads.
    """
    architecture = {
        "vocab_size": 64000,
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "num_key_value_heads": 4,
        "max_position_embeddings": 8192,
    }
    features = {
        "enable_wuxing": True,
        "enable_magic_init": True,
        "enable_hetero_moe": True,
        "enable_latent_thought": True,
        "enable_rlp": True,
        "enable_annealing": True,
    }
    moe = {
        "num_大_experts": 5,
        "num_小_experts": 15,
        "num_shared_experts": 2,
        "top_k": 2,
    }
    latent = {"num_latents": 8, "latent_dim": 512, "max_k": 2}
    annealing = {
        "alpha_start": 0.5,
        "beta_start": 0.3,
        "gamma_start": 0.2,
        "alpha_end": 0.1,
        "beta_end": 0.2,
        "gamma_end": 0.7,
    }
    return AETHERMicroConfig(
        **architecture, **features, **moe, **latent, **annealing
    )
def get_aether_micro_config_baseline():
    """Build the baseline preset with all custom features disabled.

    Used as Ablation Run 0 in the 0.5B validation strategy. The baseline
    uses uniform experts: 20 small experts plus 2 shared experts.
    """
    architecture = {
        "vocab_size": 64000,
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "num_key_value_heads": 4,
        "max_position_embeddings": 8192,
    }
    # Every custom feature switched off for the ablation baseline.
    features = {
        flag: False
        for flag in (
            "enable_wuxing",
            "enable_magic_init",
            "enable_hetero_moe",
            "enable_latent_thought",
            "enable_rlp",
            "enable_annealing",
        )
    }
    experts = {
        "num_大_experts": 0,
        "num_小_experts": 20,
        "num_shared_experts": 2,
        "top_k": 2,
    }
    return AETHERMicroConfig(**architecture, **features, **experts)
# ========================================
# Configuration validation helpers
# ========================================
def validate_aether_micro_config(config: "AETHERMicroConfig") -> bool:
    """Validate an AETHER-Micro configuration.

    Args:
        config: an ``AETHERMicroConfig`` instance (any object exposing the
            same attributes is accepted).

    Returns:
        bool: True when every check passes.

    Raises:
        ValueError: on the first invalid setting found.
    """
    # 1. Attention heads: hidden size must split evenly across heads.
    if config.hidden_size % config.num_attention_heads != 0:
        raise ValueError(
            f"hidden_size ({config.hidden_size}) must be divisible by "
            f"num_attention_heads ({config.num_attention_heads})"
        )
    # 2. Wu-Xing annealing: every weight must lie in [0, 1] and each phase
    #    must sum to 1.  Fix: the previous version checked the warmup-phase
    #    (start) weights only and never validated the mature-phase (end)
    #    weights, although the α + β + γ = 1 invariant applies to both.
    if config.enable_annealing:
        for name in (
            "alpha_start", "beta_start", "gamma_start",
            "alpha_end", "beta_end", "gamma_end",
        ):
            value = getattr(config, name)
            if not (0 <= value <= 1):
                raise ValueError(f"{name} must be in [0, 1], got {value}")
        total_start = config.alpha_start + config.beta_start + config.gamma_start
        if abs(total_start - 1.0) > 1e-6:
            raise ValueError(
                f"alpha_start + beta_start + gamma_start must sum to 1.0, "
                f"got {total_start}"
            )
        total_end = config.alpha_end + config.beta_end + config.gamma_end
        if abs(total_end - 1.0) > 1e-6:
            raise ValueError(
                f"alpha_end + beta_end + gamma_end must sum to 1.0, "
                f"got {total_end}"
            )
    # 3. MoE: the heterogeneous layout is tied to the five Wu-Xing agents.
    if config.enable_hetero_moe:
        if config.num_大_experts != 5:
            raise ValueError(
                f"Heterogeneous MoE requires num_大_experts=5 (one per Wu-Xing Agent), "
                f"got {config.num_大_experts}"
            )
        if config.num_小_experts % 5 != 0:
            raise ValueError(
                f"num_小_experts must be divisible by 5 (for 5 Agents), "
                f"got {config.num_小_experts}"
            )
    # 4. Latent thought: latent space must be a compression of the hidden
    #    space, and thinking depth is limited to K in {0, 1, 2}.
    if config.enable_latent_thought:
        if config.latent_dim >= config.hidden_size:
            raise ValueError(
                f"latent_dim ({config.latent_dim}) must be less than "
                f"hidden_size ({config.hidden_size})"
            )
        if config.max_k < 0 or config.max_k > 2:
            raise ValueError(f"max_k must be in [0, 2], got {config.max_k}")
    # 5. RLP: needs the self-evaluation head, and the loss weights must not
    #    exceed a total of 1.0.
    if config.enable_rlp:
        if not config.enable_self_eval:
            raise ValueError("RLP requires enable_self_eval=True")
        if config.rlp_ntp_weight + config.rlp_target_reward_weight > 1.0:
            raise ValueError(
                f"rlp_ntp_weight + rlp_target_reward_weight must be <= 1.0, "
                f"got {config.rlp_ntp_weight + config.rlp_target_reward_weight}"
            )
    return True
if __name__ == "__main__":
    # Smoke test: build, validate, save, and reload both presets.
    banner = "=" * 80
    print(banner)
    print("AETHER-Micro Configuration Test")
    print(banner)
    print()

    # Create the full preset (all custom features on).
    config_full = get_aether_micro_config_full()
    print("✅ Full config created")
    print(f" Model type: {config_full.model_type}")
    print(f" Parameters: {config_full.vocab_size} vocab, {config_full.hidden_size} hidden")
    print(f" Wu-Xing: {config_full.enable_wuxing}")
    print(f" Hetero MoE: {config_full.enable_hetero_moe} (大{config_full.num_大_experts} + 小{config_full.num_小_experts})")
    print(f" Latent Thought: {config_full.enable_latent_thought}")
    print(f" RLP: {config_full.enable_rlp}")
    print()

    # Create the baseline preset (all custom features off).
    config_baseline = get_aether_micro_config_baseline()
    print("✅ Baseline config created")
    print(f" All custom features: {config_baseline.enable_wuxing}")
    print()

    # Validate both presets (raises on any invalid setting).
    validate_aether_micro_config(config_full)
    print("✅ Full config validation passed")
    validate_aether_micro_config(config_baseline)
    print("✅ Baseline config validation passed")
    print()

    # Round-trip: save to disk ...
    config_full.save_pretrained("/tmp/aether-micro-config-test")
    print("✅ Config saved to /tmp/aether-micro-config-test")
    print()

    # ... then reload through AutoConfig.
    from transformers import AutoConfig
    config_loaded = AutoConfig.from_pretrained("/tmp/aether-micro-config-test", trust_remote_code=True)
    print("✅ Config loaded from /tmp/aether-micro-config-test")
    print(f" Loaded model type: {config_loaded.model_type}")
    print()

    print(banner)
    print("All tests passed!")
    print(banner)