Kiy-K
/

KiyEngine-V3

@@ -1,185 +1,194 @@
 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel
-from transformers.modeling_outputs import BaseModelOutput
 from .configuration_kiyengine import KiyEngineConfig
 class MambaBlock(nn.Module):
-    """Simplified Mamba-style block: a gated depthwise convolution (no state-space scan)."""
-    def __init__(self, d_model, d_state, d_conv, expansion_factor):
         super().__init__()
-        self.d_model = d_model
-        self.d_state = d_state
-        self.d_conv = d_conv
-        self.expansion = expansion_factor
-        # Simplified Mamba components
-        # in_proj emits two halves of width d_model * expansion_factor: the conv branch and the gate
-        self.in_proj = nn.Linear(d_model, d_model * expansion_factor * 2)
-        # groups equals the channel count, i.e. one filter per channel
         self.conv1d = nn.Conv1d(
-            d_model * expansion_factor,
-            d_model * expansion_factor,
-            kernel_size=d_conv,
-            padding=d_conv - 1,
-            groups=d_model * expansion_factor
         )
-        # x_proj and dt_proj are registered here but are never called in forward()
-        self.x_proj = nn.Linear(d_model * expansion_factor, d_state * 2)
-        self.dt_proj = nn.Linear(d_model * expansion_factor, d_model)
-        self.out_proj = nn.Linear(d_model * expansion_factor, d_model)
-    def forward(self, x):
-        # Simplified forward pass; x is unpacked as (batch, length, d_model)
-        b, l, d = x.shape
-        # Input projection, split into the conv branch (x) and the gate branch (res)
-        x_and_res = self.in_proj(x)
-        x, res = x_and_res.split(self.d_model * self.expansion, dim=-1)
-        # Conv1d pads both ends by d_conv - 1; keeping the first l steps drops the extra
-        # trailing outputs, so each kept step depends only on current and earlier positions
-        x = x.transpose(1, 2)  # (B, D, L)
-        x = self.conv1d(x)[:, :, :l]
-        x = x.transpose(1, 2)  # (B, L, D)
-        # Activation only: no selective scan is computed in this simplified block
-        x = nn.functional.silu(x)
-        # Output projection of the SiLU-gated features
-        y = self.out_proj(x * nn.functional.silu(res))
-        return y
 class MoELayer(nn.Module):
-    """Mixture of Experts Layer (Fixed with Safe Routing)"""
-    def __init__(self, d_model, n_experts, top_k):
         super().__init__()
-        self.n_experts = n_experts
-        self.top_k = top_k
-        # Router: one logit per expert for every token
-        self.gate = nn.Linear(d_model, n_experts)
-        # Experts: independent two-layer MLPs with a 4x hidden width
-        self.experts = nn.ModuleList([
-            nn.Sequential(
-                nn.Linear(d_model, d_model * 4),
-                nn.GELU(),
-                nn.Linear(d_model * 4, d_model)
-            )
-            for _ in range(n_experts)
-        ])
-    def forward(self, x):
-        b, l, d = x.shape
-        # Flatten for routing
-        x_flat = x.view(-1, d)
-        # Route to experts
-        router_logits = self.gate(x_flat)
-        router_probs = nn.functional.softmax(router_logits, dim=-1)
-        # --- FIX: SAFE ROUTING LOGIC ---
-        # Check how many experts the router output actually contains
-        num_available_experts = router_probs.size(-1)
-        # Take the min so that k can never exceed the number of available experts
-        k_safe = min(self.top_k, num_available_experts)
-        # Select top-k experts using k_safe
-        top_k_probs, top_k_indices = torch.topk(router_probs, k_safe, dim=-1)
-        # Normalize probabilities
-        top_k_probs = top_k_probs / (top_k_probs.sum(dim=-1, keepdim=True) + 1e-9) # epsilon avoids division by zero
-        # Combine expert outputs
-        expert_outputs = torch.zeros_like(x_flat)
-        # Loop over k_safe instead of self.top_k
         for i in range(k_safe):
             expert_idx = top_k_indices[:, i]
-            expert_prob = top_k_probs[:, i:i+1]
-            for expert_id in range(self.n_experts):
-                # Tokens whose i-th routing choice is this expert
-                mask = (expert_idx == expert_id)
                 if mask.any():
-                    expert_input = x_flat[mask]
-                    expert_output = self.experts[expert_id](expert_input)
-                    expert_outputs[mask] += expert_prob[mask] * expert_output
-        return expert_outputs.view(b, l, d)
-class KiyEngineMambaBlock(nn.Module):
-    """Combined Mamba + MoE block: two residual sub-layers, each fed through its own LayerNorm."""
-    def __init__(self, config):
-        super().__init__()
-        self.mamba = MambaBlock(
-            config.d_model,
-            config.d_state,
-            config.d_conv,
-            config.expansion_factor
-        )
-        self.moe = MoELayer(config.d_model, config.n_experts, config.top_k)
-        self.norm1 = nn.LayerNorm(config.d_model)
-        self.norm2 = nn.LayerNorm(config.d_model)
-    def forward(self, x):
-        # Mamba branch: normalize, transform, then add back to the input
-        x = x + self.mamba(self.norm1(x))
-        # MoE branch: same residual pattern with the second norm
-        x = x + self.moe(self.norm2(x))
-        return x
 class KiyEngineModel(PreTrainedModel):
     """
-    KiyEngine V3: Mamba-MoE Chess Evaluation Model
-    Embeds input_ids, applies config.n_layers KiyEngineMambaBlock layers and a
-    final LayerNorm, and returns the resulting hidden states.
     """
     config_class = KiyEngineConfig
     def __init__(self, config):
         super().__init__(config)
         self.config = config
-        # Embedding layer
-        self.embeddings = nn.Embedding(config.vocab_size, config.d_model)
-        # Mamba-MoE blocks
-        self.layers = nn.ModuleList([
-            KiyEngineMambaBlock(config)
-            for _ in range(config.n_layers)
-        ])
-        # Final layer norm
-        self.norm = nn.LayerNorm(config.d_model)
         # Initialize weights
         self.post_init()
     def forward(
         self,
-        input_ids=None,
-        attention_mask=None,
-        return_dict=None,
         **kwargs
     ):
-        # attention_mask and **kwargs are accepted but never read in this method
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # Embed input
-        hidden_states = self.embeddings(input_ids)
-        # Pass through layers
         for layer in self.layers:
-            hidden_states = layer(hidden_states)
-        # Final norm
-        hidden_states = self.norm(hidden_states)
-        # The tuple form carries only the final hidden states
-        if not return_dict:
-            return (hidden_states,)
-        return BaseModelOutput(
-            last_hidden_state=hidden_states,
-            hidden_states=None,
-            attentions=None,
         )

+"""
+KiyEngine V3: Mamba-MoE Chess Model
+Matched exactly with standalone_train.py structure for 100% weight compatibility.
+"""
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from transformers import PreTrainedModel
+from transformers.modeling_outputs import ModelOutput
+from dataclasses import dataclass
+from typing import Optional, Tuple
 from .configuration_kiyengine import KiyEngineConfig
+# === Helper Classes (Copied & Adapted from Training Script) ===
+class GaussianNoise(nn.Module):
+    """Adds standard-normal noise scaled by sigma to the input, in training mode only."""
+    def __init__(self, sigma: float = 0.01):
+        super().__init__()
+        self.sigma = sigma
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Noise is always off at inference: it is skipped when sigma == 0 or the module is in eval mode
+        if self.training and self.sigma != 0:
+            return x + torch.randn_like(x) * self.sigma
+        return x
+class RMSNorm(nn.Module):
+    """Root-mean-square normalization over the last dimension with a learned per-feature scale."""
+    def __init__(self, d_model: int, eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(d_model))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # ||x||_2 / sqrt(d) is the root mean square of x along the last dimension
+        norm = x.norm(2, dim=-1, keepdim=True) * (x.shape[-1] ** -0.5)
+        # eps is added to the RMS itself (not under the square root) to avoid dividing by zero
+        return x / (norm + self.eps) * self.weight
 class MambaBlock(nn.Module):
+    """Simplified Mamba-style block whose forward pass is a gated depthwise convolution."""
+    def __init__(self, config):
         super().__init__()
+        # Read the hyperparameters from the config object
+        d_model = config.d_model
+        d_state = config.d_state
+        d_conv = config.d_conv
+        exp_factor = config.expansion_factor
+        d_inner = d_model * exp_factor
+        # Defined exactly as in the training script so that the parameter keys match
+        self.in_proj = nn.Linear(d_model, 2 * d_inner, bias=False)
         self.conv1d = nn.Conv1d(
+            in_channels=d_inner,
+            out_channels=d_inner,
+            kernel_size=d_conv,
+            bias=True,
+            groups=d_inner,
+            padding=d_conv - 1
         )
+        # x_proj, dt_proj and A_log are not read by forward() below;
+        # presumably they are kept only so checkpoint keys line up -- confirm against the training script
+        self.x_proj = nn.Linear(d_inner, d_inner + 2 * d_state, bias=False)
+        self.dt_proj = nn.Linear(d_inner, d_inner, bias=True)
+        self.A_log = nn.Parameter(torch.randn(d_inner, d_state))
+        self.D = nn.Parameter(torch.ones(d_inner))
+        self.out_proj = nn.Linear(d_inner, d_model, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # The forward logic mirrors the training script.
+        # Note: the training script uses a simplified model (gated CNN),
+        # so the same logic must be followed here to reproduce its results.
+        _, L, C = x.shape
+        xz = self.in_proj(x)
+        x_inner, z = xz.chunk(2, dim=-1)
+        # Conv1d expects (B, C, L); the [:L] slice drops the extra trailing steps produced by the padding
+        x_conv = self.conv1d(x_inner.transpose(1, 2))[:, :, :L].transpose(1, 2)
+        x_activated = F.silu(x_conv)
+        # Per-channel scaling by D, followed by the SiLU gate computed from z
+        y = x_activated * self.D.unsqueeze(0)
+        y = y * F.silu(z)
+        return self.out_proj(y)
 class MoELayer(nn.Module):
+    """Top-k mixture-of-experts layer whose experts are MambaBlock instances."""
+    def __init__(self, config):
         super().__init__()
+        self.n_experts = config.n_experts
+        self.top_k = config.top_k
+        self.router = nn.Linear(config.d_model, self.n_experts)
+        self.experts = nn.ModuleList([MambaBlock(config) for _ in range(self.n_experts)])
+    def forward(self, x: torch.Tensor):
+        B, L, C = x.shape
+        # Flatten (B, L, C) to (B * L, C) so that every token is routed on its own
+        # NOTE(review): .view() needs a compatible memory layout -- confirm callers never pass a non-contiguous x
+        x_flat = x.view(-1, C)
+        router_logits = self.router(x_flat)
+        router_probs = F.softmax(router_logits, dim=1)
+        # --- SAFE ROUTING FIX ---
+        # Keep this fix to avoid a crash when the config is inconsistent (top_k larger than the expert count)
+        num_available = router_probs.size(-1)
+        k_safe = min(self.top_k, num_available)
+        top_k_weights, top_k_indices = torch.topk(router_probs, k_safe, dim=-1)
+        # Renormalize the kept weights per token; the epsilon guards against division by zero
+        top_k_weights = top_k_weights / (top_k_weights.sum(dim=-1, keepdim=True) + 1e-9)
+        final_output = torch.zeros_like(x_flat)
         for i in range(k_safe):
             expert_idx = top_k_indices[:, i]
+            weight = top_k_weights[:, i].unsqueeze(-1)
+            for j in range(self.n_experts):
+                # Tokens whose i-th routing choice is expert j
+                mask = expert_idx == j
                 if mask.any():
+                    # Logic: Input (N, D) -> Unsqueeze(1) -> (N, 1, D) -> Expert -> Squeeze(1)
+                    # i.e. each routed token reaches the expert as a length-1 sequence
+                    inp = x_flat[mask].unsqueeze(1)
+                    out = self.experts[j](inp).squeeze(1)
+                    final_output[mask] += out * weight[mask]
+        return final_output.view(B, L, C)
+# === Output Class for Hugging Face ===
+@dataclass
+class KiyEngineOutput(ModelOutput):
+    """Output container returned by KiyEngineModel.forward when return_dict is true."""
+    # Not populated by KiyEngineModel.forward in this file; it stays None there
+    loss: Optional[torch.Tensor] = None
+    # policy_head applied to the hidden state of the last sequence position
+    policy_logits: Optional[torch.Tensor] = None
+    # tanh of value_head applied to the hidden state of the last sequence position
+    value: Optional[torch.Tensor] = None
+    # Hidden states of every position after the final norm
+    last_hidden_state: Optional[torch.Tensor] = None
+# === Main Model Class ===
 class KiyEngineModel(PreTrainedModel):
     """
+    KiyEngine V3: Matches exactly the structure of 'standalone_train.py'
+    Embeds input_ids, applies config.n_layers residual MoELayer blocks that share
+    one RMSNorm, then computes policy logits and a tanh-bounded value from the
+    last sequence position.
     """
     config_class = KiyEngineConfig
     def __init__(self, config):
         super().__init__(config)
         self.config = config
+        # --- MATCHING KEYS WITH TRAIN SCRIPT ---
+        # Train script: self.embedding (NOT embeddings)
+        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
+        self.noise = GaussianNoise(sigma=0.0) # Inference mode: sigma=0 makes this layer a pass-through
+        # Train script: self.layers = ModuleList of MoELayer
+        self.layers = nn.ModuleList([MoELayer(config) for _ in range(config.n_layers)])
+        # A single norm instance, reused before every layer and again after the last one (see forward)
+        self.norm = RMSNorm(config.d_model)
+        # Train script has heads built-in
+        self.policy_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        self.value_head = nn.Sequential(
+            nn.Linear(config.d_model, 128),
+            nn.ReLU(),
+            nn.Linear(128, 1)
+        )
         # Initialize weights
         self.post_init()
     def forward(
         self,
+        input_ids: torch.Tensor,
+        return_dict: Optional[bool] = None,
         **kwargs
     ):
+        # **kwargs is accepted but never read in this method
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Forward pass matching training logic
+        x = self.noise(self.embedding(input_ids))
         for layer in self.layers:
+            # Training script logic: x = x + layer(norm(x))[0]
+            # Our MoELayer returns just the tensor (we dropped aux_loss return for inference clean-up)
+            x = x + layer(self.norm(x))
+        x = self.norm(x)
+        # Last token logic: both heads read only the final sequence position
+        # NOTE(review): presumably inputs are never right-padded, otherwise position -1 is a pad token -- confirm
+        last_token_state = x[:, -1, :]
+        policy_logits = self.policy_head(last_token_state)
+        value = torch.tanh(self.value_head(last_token_state))
+        # Tuple order when return_dict is false: (policy_logits, value, hidden states)
+        if not return_dict:
+            return (policy_logits, value, x)
+        return KiyEngineOutput(
+            policy_logits=policy_logits,
+            value=value,
+            last_hidden_state=x
         )