""" AETHER-Net Configuration Adaptive Elemental Transformer-Hybrid Efficient Recurrent Network 5×5 Latin Orthogonal Magic Square Layout + Oheng(五行) MoE Routing """ from dataclasses import dataclass, field from typing import List, Tuple # ── 5×5 Latin Orthogonal Magic Square ── # Each row (element group) and each column (phase) contains # exactly one of each attention type → zero carry-over bias. MAGIC_SQUARE = [ # Phase1 Phase2 Phase3 Phase4 Phase5 ["gdn", "full", "mamba2", "slide", "cross"], # 木 Wood ["slide", "gdn", "full", "cross", "mamba2"], # 火 Fire ["full", "cross", "slide", "mamba2", "gdn"], # 土 Earth ["mamba2", "slide", "cross", "gdn", "full"], # 金 Metal ["cross", "mamba2", "gdn", "full", "slide"], # 水 Water ] # Flatten to 25-layer sequence (row-major) LAYER_TYPES = [t for row in MAGIC_SQUARE for t in row] # ── Oheng (五行) Element System ── ELEMENTS = ["wood", "fire", "earth", "metal", "water"] # 상생 (Generate): 木→火→土→金→水→木 GENERATE = {"wood": "fire", "fire": "earth", "earth": "metal", "metal": "water", "water": "wood"} GENERATE_REVERSE = {v: k for k, v in GENERATE.items()} # 상극 (Overcome): 木⊣土, 土⊣水, 水⊣火, 火⊣金, 金⊣木 OVERCOME = {"wood": "earth", "earth": "water", "water": "fire", "fire": "metal", "metal": "wood"} OVERCOME_REVERSE = {v: k for k, v in OVERCOME.items()} # Element → Layer indices (0-based) ELEMENT_LAYERS = { "wood": [0, 1, 2, 3, 4], "fire": [5, 6, 7, 8, 9], "earth": [10, 11, 12, 13, 14], "metal": [15, 16, 17, 18, 19], "water": [20, 21, 22, 23, 24], } # Element → Expert indices (0-based, 5 experts per element) ELEMENT_EXPERTS = { "wood": [0, 1, 2, 3, 4], "fire": [5, 6, 7, 8, 9], "earth": [10, 11, 12, 13, 14], "metal": [15, 16, 17, 18, 19], "water": [20, 21, 22, 23, 24], } # Layer index → element name LAYER_TO_ELEMENT = {} for elem, indices in ELEMENT_LAYERS.items(): for idx in indices: LAYER_TO_ELEMENT[idx] = elem @dataclass class AetherNetConfig: """Configuration for AETHER-Net model.""" # ── Model dimensions ── hidden_size: int = 4096 intermediate_size: int = 11008 # FFN intermediate (SwiGLU) num_layers: int = 25 num_attention_heads: int = 32 num_kv_heads: int = 8 # GQA for Full Attention layers head_dim: int = 128 # hidden_size // num_attention_heads vocab_size: int = 151936 # Qwen tokenizer max_position_embeddings: int = 262144 rope_theta: float = 10000000.0 # ── Layer schedule (from magic square) ── layer_types: List[str] = field(default_factory=lambda: LAYER_TYPES) # ── MoE Configuration ── num_experts: int = 25 num_experts_per_group: int = 5 num_element_groups: int = 5 top_k: int = 2 num_shared_experts: int = 1 expert_intermediate_size: int = 2752 # intermediate_size // 4 (per expert) moe_jitter_eps: float = 0.01 # ── Oheng (五行) routing ── use_generate_boost: bool = True use_overcome_gate: bool = True generate_alpha_init: float = 0.1 # learnable soft scalar overcome_gate_hidden: int = 256 # critic head hidden dim # ── Attention-specific ── sliding_window_size: int = 4096 gdn_state_size: int = 128 # Gated DeltaNet state dimension mamba2_state_size: int = 128 mamba2_conv_size: int = 4 mamba2_expand: int = 2 # ── Training / Inference ── rms_norm_eps: float = 1e-6 initializer_range: float = 0.02 tie_word_embeddings: bool = False use_cache: bool = True torch_dtype: str = "bfloat16" # ── Donor transplant info (metadata) ── primary_donor: str = "Qwen/Qwen3.5-27B" secondary_donor: str = "meta-llama/Llama-3.1-8B" def get_layer_type(self, layer_idx: int) -> str: return self.layer_types[layer_idx] def get_layer_element(self, layer_idx: int) -> str: return LAYER_TO_ELEMENT[layer_idx] def get_element_expert_range(self, element: str) -> Tuple[int, int]: indices = ELEMENT_EXPERTS[element] return (indices[0], indices[-1] + 1) def summary(self) -> str: type_counts = {} for t in self.layer_types: type_counts[t] = type_counts.get(t, 0) + 1 total_params_b = ( self.num_experts * self.expert_intermediate_size * self.hidden_size * 3 * 2 # experts + self.num_layers * self.hidden_size * self.hidden_size * 4 # attention projections + self.vocab_size * self.hidden_size * 2 # embeddings ) / 1e9 active_params_b = total_params_b * (self.top_k + self.num_shared_experts) / self.num_experts_per_group lines = [ "═" * 60, " AETHER-Net Architecture Summary", "═" * 60, f" Layers: {self.num_layers} (5×5 magic square)", f" Hidden dim: {self.hidden_size}", f" Attention mix: {type_counts}", f" MoE: {self.num_experts} experts / {self.num_element_groups} groups / top-{self.top_k}", f" Est. total: ~{total_params_b:.1f}B params", f" Est. active: ~{active_params_b:.1f}B params", f" Context: {self.max_position_embeddings:,} tokens", f" Oheng generate: {self.use_generate_boost} (α={self.generate_alpha_init})", f" Oheng overcome: {self.use_overcome_gate}", f" Primary donor: {self.primary_donor}", f" Secondary donor:{self.secondary_donor}", "═" * 60, ] return "\n".join(lines)