"""
MiniMind Max2 Main Model

Complete implementation of the Max2 language model.
"""

import sys
from pathlib import Path
from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

# Make the project root importable so `configs` resolves when this file is
# executed directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from configs.model_config import Max2Config, get_config

from .components import Max2DecoderLayer, Max2RMSNorm


class Max2Model(nn.Module):
    """Max2 transformer model; outputs raw hidden states (no LM head)."""

    def __init__(self, config: Max2Config):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx)
        self.layers = nn.ModuleList([Max2DecoderLayer(config, i) for i in range(config.num_hidden_layers)])
        self.norm = Max2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        self._init_weights()

    def _init_weights(self):
        """Initialize linear and embedding weights from N(0, initializer_range)."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.padding_idx is not None:
                    # Keep the padding embedding at zero after re-initialization.
                    module.weight.data[module.padding_idx].zero_()

    def _make_causal_mask(self, seq_len: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        """Build an additive causal mask of shape (1, 1, seq_len, seq_len)."""
        # Use the dtype's minimum instead of -inf so that fully masked rows
        # cannot produce NaNs in the softmax.
        min_value = torch.finfo(dtype).min
        mask = torch.full((seq_len, seq_len), min_value, dtype=dtype, device=device)
        mask = torch.triu(mask, diagonal=1)
        return mask.unsqueeze(0).unsqueeze(0)

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]], torch.Tensor]:
        batch_size, seq_len = input_ids.shape
        hidden_states = self.embed_tokens(input_ids)

        # The mask only covers the current tokens. During incremental decoding
        # with a cache, one token is passed at a time; the resulting 1x1 zero
        # mask is correct because all past positions may be attended to.
        causal_mask = self._make_causal_mask(seq_len, hidden_states.dtype, hidden_states.device)
        if attention_mask is not None:
            # Mark padded key positions with the dtype minimum. masked_fill
            # avoids the 0 * -inf = NaN that an additive formulation such as
            # (1 - mask) * -inf produces for the unmasked entries.
            padding_mask = attention_mask[:, None, None, :].eq(0)
            causal_mask = causal_mask.expand(batch_size, 1, seq_len, seq_len)
            causal_mask = causal_mask.masked_fill(padding_mask, torch.finfo(hidden_states.dtype).min)

        next_cache = [] if use_cache else None
        total_aux_loss = torch.tensor(0.0, device=hidden_states.device)

        for idx, layer in enumerate(self.layers):
            past_kv = past_key_values[idx] if past_key_values else None
            hidden_states, present_kv, aux_loss = layer(hidden_states, causal_mask, past_kv, use_cache)

            if use_cache:
                next_cache.append(present_kv)
            # Accumulate the per-layer auxiliary loss (e.g. MoE load balancing).
            total_aux_loss = total_aux_loss + aux_loss

        hidden_states = self.norm(hidden_states)
        return hidden_states, next_cache, total_aux_loss

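
# Mask semantics, as a small illustration (0 = attend, m = the dtype minimum
# standing in for "blocked"): for seq_len = 3 the causal mask built above is
#
#     [[0, m, m],
#      [0, 0, m],
#      [0, 0, 0]]
#
# and an attention_mask of [1, 1, 0] additionally fills the last key column
# with m, so the padded position is never attended to by any query.
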
class Max2ForCausalLM(nn.Module):
    """Max2 model with a language-modeling head for text generation."""

    def __init__(self, config: Max2Config):
        super().__init__()
        self.config = config
        self.model = Max2Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Tie the output projection to the input embedding matrix.
        self.lm_head.weight = self.model.embed_tokens.weight

    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        use_cache: bool = False,
    ) -> Tuple[Optional[torch.Tensor], torch.Tensor, Optional[List[Tuple[torch.Tensor, torch.Tensor]]], torch.Tensor]:
        hidden_states, next_cache, aux_loss = self.model(input_ids, attention_mask, past_key_values, use_cache)
        # Compute logits in float32 for a numerically stable loss.
        logits = self.lm_head(hidden_states).float()

        loss = None
        if labels is not None:
            # Shift so that each position predicts the next token.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = CrossEntropyLoss()(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
            loss = loss + aux_loss

        return loss, logits, next_cache, aux_loss

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.95,
        do_sample: bool = True,
    ) -> torch.LongTensor:
        """Simple generation with top-k/top-p sampling."""
        generated = input_ids
        past_key_values = None

        for _ in range(max_new_tokens):
            if past_key_values is None:
                # Prefill: run the full prompt once to populate the KV cache.
                _, logits, past_key_values, _ = self(generated, use_cache=True)
            else:
                # Decode: only the newest token needs to be processed.
                _, logits, past_key_values, _ = self(generated[:, -1:], past_key_values=past_key_values, use_cache=True)

            # Guard against division by zero for temperature == 0.
            next_token_logits = logits[:, -1, :] / max(temperature, 1e-6)

            if do_sample:
                if top_k > 0:
                    # Keep only the top_k highest-scoring tokens.
                    top_k = min(top_k, next_token_logits.size(-1))
                    indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                    next_token_logits[indices_to_remove] = float("-inf")

                if top_p < 1.0:
                    # Nucleus sampling: drop the tail once cumulative probability exceeds top_p.
                    sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                    sorted_indices_to_remove = cumulative_probs > top_p
                    # Shift right so the first token above the threshold is kept.
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                    sorted_indices_to_remove[..., 0] = 0
                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                    next_token_logits[indices_to_remove] = float("-inf")

                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            generated = torch.cat([generated, next_token], dim=1)

            # Stop early once every sequence in the batch has emitted EOS.
            if (next_token == self.config.eos_token_id).all():
                break

        return generated

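
# A minimal usage sketch (the tokenizer step is omitted; raw token ids are
# used directly, and "max2-nano" must exist in configs.model_config):
#
#     config = get_config("max2-nano")
#     model = Max2ForCausalLM(config).eval()
#     prompt_ids = torch.randint(0, config.vocab_size, (1, 16))
#     output_ids = model.generate(prompt_ids, max_new_tokens=32, temperature=0.8)
#     # output_ids has shape (1, 16 + n) with n <= 32 (early EOS may stop it).
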
# Aliases (alternate Mind2 naming for the same classes).
Mind2Model = Max2Model
Mind2ForCausalLM = Max2ForCausalLM

def create_model(model_name: str = "max2-lite", device: str = "cuda", dtype: torch.dtype = torch.float16) -> Max2ForCausalLM:
    """Factory function to create a Max2 model."""
    config = get_config(model_name)
    model = Max2ForCausalLM(config)
    if device == "cuda" and not torch.cuda.is_available():
        # No GPU available: keep the model on CPU in its default float32.
        return model
    return model.to(device=device, dtype=dtype)
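
# Hypothetical usage sketch (the model names must exist in configs.model_config):
#
#     model = create_model("max2-lite")  # CUDA + float16 when a GPU is available
#     cpu_model = create_model("max2-nano", device="cpu", dtype=torch.float32)
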
if __name__ == "__main__":
    for model_name in ["max2-nano", "max2-lite", "max2-pro"]:
        print(f"\n{'=' * 50}\nTesting {model_name}\n{'=' * 50}")
        config = get_config(model_name)
        model = Max2ForCausalLM(config)

        total_params = sum(p.numel() for p in model.parameters())
        print(f"Total parameters: {total_params / 1e9:.3f}B")

        input_ids = torch.randint(0, config.vocab_size, (2, 128))
        model.eval()
        with torch.no_grad():
            loss, logits, _, aux_loss = model(input_ids, labels=input_ids)
        print(f"Logits shape: {logits.shape}")
        print(f"Loss: {loss.item():.4f}, Aux loss: {aux_loss.item():.6f}")
        print("Forward pass successful!")