|
|
""" |
|
|
KiyEngine V3: Mamba-MoE Chess Model |
|
|
Matched exactly with standalone_train.py structure for 100% weight compatibility. |
|
|
""" |
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
from transformers import PreTrainedModel |
|
|
from transformers.modeling_outputs import ModelOutput |
|
|
from dataclasses import dataclass |
|
|
from typing import Optional, Tuple |
|
|
|
|
|
from .configuration_kiyengine import KiyEngineConfig |
|
|
|
|
|
|
|
|
|
|
|
class GaussianNoise(nn.Module):
    """Additive zero-mean Gaussian noise, applied only while training.

    In eval mode, or when ``sigma`` is zero, the input passes through
    unchanged.
    """

    def __init__(self, sigma: float = 0.01):
        super().__init__()
        # Standard deviation of the injected noise.
        self.sigma = sigma

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Identity unless we are in training mode with a non-zero sigma.
        if not self.training or self.sigma == 0:
            return x
        return x + self.sigma * torch.randn_like(x)
|
|
|
|
|
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learned per-channel gain."""

    def __init__(self, d_model: int, eps: float = 1e-5):
        super().__init__()
        # Small constant keeping the division stable for near-zero inputs.
        self.eps = eps
        # Per-channel scale, initialized to identity.
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # RMS over the last dimension: L2 norm scaled by 1/sqrt(d_model).
        rms = x.norm(2, dim=-1, keepdim=True) * (x.shape[-1] ** -0.5)
        return self.weight * (x / (rms + self.eps))
|
|
|
|
|
class MambaBlock(nn.Module):
    """Simplified Mamba-style block: gated causal depthwise conv over an
    expanded inner dimension.

    NOTE: ``x_proj``, ``dt_proj`` and ``A_log`` are registered as parameters
    but are never referenced by ``forward`` in this file — they appear to be
    kept solely so the state_dict matches standalone_train.py checkpoints.
    Attribute names and registration order must therefore not change.
    """

    def __init__(self, config):
        super().__init__()

        d_model = config.d_model
        d_state = config.d_state
        d_conv = config.d_conv
        exp_factor = config.expansion_factor

        # Expanded inner width used by the conv/gate paths.
        d_inner = d_model * exp_factor

        # Joint projection producing both the conv path and the gate path.
        self.in_proj = nn.Linear(d_model, 2 * d_inner, bias=False)
        # Depthwise (groups == channels) conv; left-padded for causality.
        self.conv1d = nn.Conv1d(
            in_channels=d_inner,
            out_channels=d_inner,
            kernel_size=d_conv,
            bias=True,
            groups=d_inner,
            padding=d_conv - 1
        )
        # Unused in forward — retained for checkpoint compatibility.
        self.x_proj = nn.Linear(d_inner, d_inner + 2 * d_state, bias=False)
        self.dt_proj = nn.Linear(d_inner, d_inner, bias=True)
        self.A_log = nn.Parameter(torch.randn(d_inner, d_state))
        # Per-channel skip scale applied to the conv activations.
        self.D = nn.Parameter(torch.ones(d_inner))
        self.out_proj = nn.Linear(d_inner, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """x: (batch, seq, d_model) -> (batch, seq, d_model)."""
        seq_len = x.shape[1]

        # Project once, then split into the conv path and the gate path.
        projected = self.in_proj(x)
        conv_path, gate = projected.chunk(2, dim=-1)

        # Causal conv: the left padding adds d_conv-1 extra steps on the
        # right, which we trim back to the original sequence length.
        conv_out = self.conv1d(conv_path.transpose(1, 2))
        conv_out = conv_out[:, :, :seq_len].transpose(1, 2)
        activated = F.silu(conv_out)

        # Scale by D, then gate with silu(z) (SiLU-gated output).
        gated = (activated * self.D.unsqueeze(0)) * F.silu(gate)

        return self.out_proj(gated)
|
|
|
|
|
class MoELayer(nn.Module):
    """Sparse mixture-of-experts layer: each token is routed to its top-k
    MambaBlock experts and the expert outputs are combined with renormalized
    router weights."""

    def __init__(self, config):
        super().__init__()
        self.n_experts = config.n_experts
        self.top_k = config.top_k

        # One routing logit per expert, computed per token.
        self.router = nn.Linear(config.d_model, self.n_experts)
        self.experts = nn.ModuleList([MambaBlock(config) for _ in range(self.n_experts)])

    def forward(self, x: torch.Tensor):
        batch, seq_len, dim = x.shape
        tokens = x.view(-1, dim)

        gate_probs = F.softmax(self.router(tokens), dim=1)

        # Never ask topk for more experts than actually exist.
        available = gate_probs.size(-1)
        k = min(self.top_k, available)

        slot_weights, slot_indices = torch.topk(gate_probs, k, dim=-1)
        # Renormalize so the selected weights sum to one per token.
        slot_weights = slot_weights / (slot_weights.sum(dim=-1, keepdim=True) + 1e-9)

        combined = torch.zeros_like(tokens)

        # Dense dispatch: for each top-k slot, run each expert on the tokens
        # that selected it and accumulate the weighted outputs.
        for slot in range(k):
            chosen_expert = slot_indices[:, slot]
            weight_col = slot_weights[:, slot].unsqueeze(-1)

            for expert_id in range(self.n_experts):
                selected = chosen_expert == expert_id
                if selected.any():
                    # Each routed token is treated as a length-1 sequence.
                    expert_in = tokens[selected].unsqueeze(1)
                    expert_out = self.experts[expert_id](expert_in).squeeze(1)
                    combined[selected] += expert_out * weight_col[selected]

        return combined.view(batch, seq_len, dim)
|
|
|
|
|
|
|
|
@dataclass
class KiyEngineOutput(ModelOutput):
    """Structured output of KiyEngineModel.forward (HF ModelOutput subclass)."""
    # Optional training loss; never populated by KiyEngineModel.forward in this file.
    loss: Optional[torch.Tensor] = None
    # Per-move logits for the final sequence position, shape (batch, vocab_size).
    policy_logits: Optional[torch.Tensor] = None
    # tanh-squashed position evaluation in [-1, 1], shape (batch, 1).
    value: Optional[torch.Tensor] = None
    # Hidden states after the final RMSNorm, shape (batch, seq, d_model).
    last_hidden_state: Optional[torch.Tensor] = None
|
|
|
|
|
|
|
|
|
|
|
class KiyEngineModel(PreTrainedModel):
    """
    KiyEngine V3: Matches exactly the structure of 'standalone_train.py'

    Pipeline: embedding -> (optional noise) -> pre-norm residual MoE/Mamba
    stack -> final RMSNorm -> policy + value heads on the last token.
    Attribute names and registration order must stay in sync with
    standalone_train.py so existing checkpoints remain loadable.
    """
    config_class = KiyEngineConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Token embedding over the move vocabulary.
        self.embedding = nn.Embedding(config.vocab_size, config.d_model)
        # sigma=0.0 makes this a no-op; presumably kept for structural parity
        # with the training script — confirm before removing.
        self.noise = GaussianNoise(sigma=0.0)

        # Stack of sparse MoE layers; each expert is a MambaBlock.
        self.layers = nn.ModuleList([MoELayer(config) for _ in range(config.n_layers)])

        # NOTE(review): a single RMSNorm instance is shared as the pre-layer
        # norm of every block AND as the final norm — unusual (typically one
        # norm per layer), but it must match the checkpoint layout; do not
        # split without retraining/converting weights.
        self.norm = RMSNorm(config.d_model)

        # Policy head: logits over the move vocabulary.
        # Value head: small MLP, tanh-squashed to [-1, 1] in forward().
        self.policy_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.value_head = nn.Sequential(
            nn.Linear(config.d_model, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        return_dict: Optional[bool] = None,
        **kwargs
    ):
        """Run the trunk and score the final sequence position.

        Args:
            input_ids: (batch, seq) token ids.
            return_dict: if False, returns the tuple
                (policy_logits, value, hidden_states). NOTE(review): the
                tuple has no loss slot — loss is never computed here.

        Returns:
            KiyEngineOutput with policy_logits (batch, vocab_size),
            value (batch, 1) in [-1, 1], and last_hidden_state
            (batch, seq, d_model).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        x = self.noise(self.embedding(input_ids))

        # Pre-norm residual blocks (shared norm instance — see __init__).
        for layer in self.layers:
            x = x + layer(self.norm(x))

        x = self.norm(x)

        # Both heads read only the last position of the sequence.
        last_token_state = x[:, -1, :]

        policy_logits = self.policy_head(last_token_state)
        value = torch.tanh(self.value_head(last_token_state))

        if not return_dict:
            return (policy_logits, value, x)

        return KiyEngineOutput(
            policy_logits=policy_logits,
            value=value,
            last_hidden_state=x
        )