referans_kod Colab için

ad9632f verified 29 days ago

9.07 kB

	"""Mamba-3 LM assembly that wires the official mamba-og Mamba3 mixer into a
	prenorm Llama-style stack. The reason we don't reuse `mixer_seq_simple.py`
	verbatim: its `create_block` only knows about Mamba1/Mamba2. We replicate the
	same Block layout and weight init so the model is structurally identical to
	what the Mamba-3 authors trained, just with Mamba3 as the mixer."""

	from __future__ import annotations

	import math
	from collections import namedtuple
	from functools import partial
	from pathlib import Path
	import sys

	import torch
	import torch.nn as nn

	# Prepend ``<repo root>/mamba-og`` to ``sys.path`` (at most once) so that the
	# ``mamba_ssm`` imports below resolve without installing the package.
	# ``_REPO_ROOT`` is the parent of the directory that contains this file.
	_REPO_ROOT = Path(__file__).resolve().parents[1]
	_MAMBA_OG = _REPO_ROOT.joinpath("mamba-og")
	if sys.path.count(str(_MAMBA_OG)) == 0:
	    sys.path.insert(0, str(_MAMBA_OG))

	from mamba_ssm.modules.block import Block
	from mamba_ssm.modules.mamba3 import Mamba3
	from mamba_ssm.modules.mlp import GatedMLP
	from mamba_ssm.ops.triton.layer_norm import RMSNorm, layer_norm_fn

	# Return type of ``Mamba3LM.forward``: the logits plus the loss, where the
	# loss is ``None`` when no labels were supplied.
	CausalLMOutput = namedtuple("CausalLMOutput", ("logits", "loss"))


	def _init_weights(
	module: nn.Module,
	n_layer: int,
	initializer_range: float = 0.02,
	rescale_prenorm_residual: bool = True,
	n_residuals_per_layer: int = 1,
	):
	if isinstance(module, nn.Linear):
	if module.bias is not None and not getattr(module.bias, "_no_reinit", False):
	nn.init.zeros_(module.bias)
	elif isinstance(module, nn.Embedding):
	nn.init.normal_(module.weight, std=initializer_range)

	if rescale_prenorm_residual:
	for name, p in module.named_parameters():
	if name in ("out_proj.weight", "fc2.weight"):
	nn.init.kaiming_uniform_(p, a=math.sqrt(5))
	with torch.no_grad():
	p /= math.sqrt(n_residuals_per_layer * n_layer)


	def _build_block(
	    layer_idx: int,
	    d_model: int,
	    d_intermediate: int,
	    ssm_cfg: dict,
	    norm_epsilon: float,
	    rms_norm: bool,
	    residual_in_fp32: bool,
	    fused_add_norm: bool,
	    device,
	    dtype,
	):
	    """Build one ``Block`` that uses ``Mamba3`` as its mixer.

	    ``Block`` is handed factories rather than instances: a ``Mamba3`` partial,
	    an MLP factory and a norm factory. It is presumably ``Block`` that calls
	    them with ``d_model`` (confirm against ``mamba_ssm.modules.block``).

	    Args:
	        layer_idx: Index of this block; passed to ``Mamba3`` and also stored
	            on the returned block as ``layer_idx``.
	        d_model: Model width handed to ``Block`` and used as the MLP output size.
	        d_intermediate: Hidden width of the ``GatedMLP``; ``0`` selects
	            ``nn.Identity`` as the MLP factory instead.
	        ssm_cfg: Extra keyword arguments for ``Mamba3``. ``None`` is treated
	            as an empty mapping.
	        norm_epsilon: ``eps`` for the norm layers.
	        rms_norm: Use ``RMSNorm`` when true, ``nn.LayerNorm`` otherwise.
	        residual_in_fp32: Forwarded unchanged to ``Block``.
	        fused_add_norm: Forwarded unchanged to ``Block``.
	        device: Device for newly created parameters.
	        dtype: Dtype for newly created parameters.

	    Returns:
	        The constructed ``Block``.
	    """
	    factory_kwargs = {"device": device, "dtype": dtype}

	    # Both mappings have to be unpacked as keyword arguments: bare positional
	    # arguments are not allowed after ``layer_idx=...``.
	    mixer_cls = partial(
	        Mamba3,
	        layer_idx=layer_idx,
	        **(ssm_cfg or {}),
	        **factory_kwargs,
	    )

	    norm_cls = partial(
	        RMSNorm if rms_norm else nn.LayerNorm, eps=norm_epsilon, **factory_kwargs
	    )

	    if d_intermediate == 0:
	        # No feed-forward sub-layer requested for this block.
	        mlp_cls = nn.Identity
	    else:
	        mlp_cls = partial(
	            GatedMLP,
	            hidden_features=d_intermediate,
	            out_features=d_model,
	            **factory_kwargs,
	        )

	    block = Block(
	        d_model,
	        mixer_cls,
	        mlp_cls,
	        norm_cls=norm_cls,
	        fused_add_norm=fused_add_norm,
	        residual_in_fp32=residual_in_fp32,
	    )
	    block.layer_idx = layer_idx
	    return block


	class Mamba3LM(nn.Module):
	    """Causal language model built from Mamba-3 blocks.

	    Layout: token embedding -> ``n_layer`` blocks from ``_build_block`` ->
	    final norm -> linear head, where the head can share the embedding weight.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_layer: int,
	        d_intermediate: int,
	        vocab_size: int,
	        ssm_cfg: dict,
	        rms_norm: bool = True,
	        residual_in_fp32: bool = True,
	        fused_add_norm: bool = True,
	        norm_epsilon: float = 1e-5,
	        pad_vocab_multiple: int = 8,
	        tie_embeddings: bool = True,
	        initializer_range: float = 0.02,
	        device=None,
	        dtype=None,
	    ):
	        """Create the sub-modules, initialise them, then optionally tie weights.

	        Args:
	            d_model: Embedding / hidden width.
	            n_layer: Number of blocks.
	            d_intermediate: MLP hidden width per block; ``0`` means the
	                initialiser counts one residual branch per block instead of two.
	            vocab_size: Requested vocabulary size; rounded up to a multiple
	                of ``pad_vocab_multiple`` before use.
	            ssm_cfg: Keyword arguments forwarded to every block's mixer.
	            rms_norm: Use ``RMSNorm`` (true) or ``nn.LayerNorm`` (false).
	            residual_in_fp32: Forwarded to the blocks and the fused final norm.
	            fused_add_norm: Use ``layer_norm_fn`` for the final add + norm.
	            norm_epsilon: ``eps`` for all norm layers.
	            pad_vocab_multiple: Granularity of the vocabulary padding.
	            tie_embeddings: Share ``embedding.weight`` with ``lm_head``.
	            initializer_range: Std-dev for the embedding initialisation.
	            device: Device for newly created parameters.
	            dtype: Dtype for newly created parameters.
	        """
	        super().__init__()

	        # Round the vocabulary up to the next multiple of the padding unit.
	        remainder = vocab_size % pad_vocab_multiple
	        if remainder != 0:
	            vocab_size += pad_vocab_multiple - remainder

	        self.vocab_size = vocab_size
	        self.tie_embeddings = tie_embeddings
	        self.residual_in_fp32 = residual_in_fp32
	        self.fused_add_norm = fused_add_norm

	        factory_kwargs = {"device": device, "dtype": dtype}
	        final_norm_cls = RMSNorm if rms_norm else nn.LayerNorm
	        shared_block_kwargs = {
	            "d_model": d_model,
	            "d_intermediate": d_intermediate,
	            "ssm_cfg": ssm_cfg,
	            "norm_epsilon": norm_epsilon,
	            "rms_norm": rms_norm,
	            "residual_in_fp32": residual_in_fp32,
	            "fused_add_norm": fused_add_norm,
	            "device": device,
	            "dtype": dtype,
	        }

	        # Construction order (embedding, blocks, norm, head) is kept fixed so
	        # that random-number consumption during init does not change.
	        self.embedding = nn.Embedding(vocab_size, d_model, **factory_kwargs)
	        blocks = nn.ModuleList()
	        for idx in range(n_layer):
	            blocks.append(_build_block(layer_idx=idx, **shared_block_kwargs))
	        self.layers = blocks
	        self.norm_f = final_norm_cls(d_model, eps=norm_epsilon, **factory_kwargs)
	        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

	        # One residual branch per block without an MLP, two with one.
	        residuals_per_layer = 2 if d_intermediate != 0 else 1
	        self.apply(
	            lambda module: _init_weights(
	                module,
	                n_layer=n_layer,
	                initializer_range=initializer_range,
	                n_residuals_per_layer=residuals_per_layer,
	            )
	        )

	        # Tying happens after initialisation, so the shared tensor is the
	        # already-initialised embedding weight.
	        if tie_embeddings:
	            self.lm_head.weight = self.embedding.weight

	    def forward(self, input_ids, labels=None):
	        """Run the model and optionally compute the next-token loss.

	        Args:
	            input_ids: Token ids fed to the embedding; assumed to be shaped
	                (batch, seq), since the logits are sliced with three indices.
	            labels: Optional targets aligned with ``input_ids``. They are
	                shifted by one position internally, and entries equal to
	                ``-100`` are ignored by the loss.

	        Returns:
	            ``CausalLMOutput(logits, loss)``; ``loss`` is ``None`` when
	            ``labels`` is ``None``.
	        """
	        hidden_states, residual = self.embedding(input_ids), None
	        for block in self.layers:
	            hidden_states, residual = block(hidden_states, residual)

	        if self.fused_add_norm:
	            # Residual add and final norm in one ``layer_norm_fn`` call.
	            hidden_states = layer_norm_fn(
	                hidden_states,
	                self.norm_f.weight,
	                self.norm_f.bias,
	                eps=self.norm_f.eps,
	                residual=residual,
	                prenorm=False,
	                residual_in_fp32=self.residual_in_fp32,
	                is_rms_norm=isinstance(self.norm_f, RMSNorm),
	            )
	        else:
	            if residual is None:
	                residual = hidden_states
	            else:
	                residual = hidden_states + residual
	            norm_dtype = self.norm_f.weight.dtype
	            hidden_states = self.norm_f(residual.to(norm_dtype))

	        logits = self.lm_head(hidden_states)
	        if labels is None:
	            return CausalLMOutput(logits=logits, loss=None)

	        # Position t predicts token t + 1: drop the last logit and first label.
	        shift_logits = logits[:, :-1, :].contiguous()
	        shift_labels = labels[:, 1:].contiguous()
	        flat_logits = shift_logits.view(-1, shift_logits.size(-1)).float()
	        flat_labels = shift_labels.view(-1)
	        loss = nn.functional.cross_entropy(
	            flat_logits,
	            flat_labels,
	            ignore_index=-100,
	        )
	        return CausalLMOutput(logits=logits, loss=loss)

	    def num_params(self, trainable_only: bool = True) -> int:
	        """Count parameter elements, counting each parameter object once.

	        Args:
	            trainable_only: When true, only parameters with
	                ``requires_grad`` set are counted.

	        Returns:
	            Total number of elements over the selected parameters.
	        """
	        # Keyed by object identity so a shared (tied) tensor is counted once.
	        sizes_by_identity = {
	            id(param): param.numel()
	            for param in self.parameters()
	            if param.requires_grad or not trainable_only
	        }
	        return sum(sizes_by_identity.values())


	def build_model_from_config(cfg: dict, device=None, dtype=None) -> Mamba3LM:
	    """Build a ``Mamba3LM`` from a parsed configuration mapping.

	    ``cfg`` must contain the sections ``"model"``, ``"architecture"`` and
	    ``"kernels"``. Mixer settings are collected from all three into the
	    ``ssm_cfg`` passed to ``Mamba3LM``.

	    If ``architecture.bc_stabilizer`` is present and, compared
	    case-insensitively, is not ``"bcnorm"``, ``install_bc_stabilizer`` from
	    ``nfmamba.adapters.bc_stabilizer`` is run on the finished model and a
	    one-line report is printed. According to the original notes, this
	    replaces the stock B/C normalizers with the named element-wise stabilizer
	    and leaves the rest of the mixer untouched; the adapter is not visible
	    here, so confirm against it.

	    Args:
	        cfg: Parsed configuration (for example loaded from YAML).
	        device: Optional device; used at construction and for a final ``.to``.
	        dtype: Optional dtype; used at construction and for a final ``.to``.

	    Returns:
	        The constructed model.
	    """
	    model_cfg, arch_cfg, kernel_cfg = cfg["model"], cfg["architecture"], cfg["kernels"]

	    ssm_cfg = {
	        "d_state": model_cfg["d_state"],
	        "expand": model_cfg["expand"],
	        "headdim": model_cfg["head_dim"],
	        "ngroups": model_cfg["ngroups"],
	        "rope_fraction": arch_cfg["rope_fraction"],
	        "is_outproj_norm": arch_cfg["is_outproj_norm"],
	        "is_mimo": arch_cfg["is_mimo"],
	        "mimo_rank": arch_cfg["mimo_rank"],
	        "chunk_size": kernel_cfg["chunk_size"],
	    }

	    model = Mamba3LM(
	        d_model=model_cfg["d_model"],
	        n_layer=model_cfg["n_layers"],
	        d_intermediate=model_cfg["d_intermediate"],
	        vocab_size=model_cfg["vocab_size"],
	        ssm_cfg=ssm_cfg,
	        rms_norm=model_cfg["rms_norm"],
	        residual_in_fp32=model_cfg["residual_in_fp32"],
	        fused_add_norm=model_cfg["fused_add_norm"],
	        norm_epsilon=model_cfg["norm_epsilon"],
	        pad_vocab_multiple=model_cfg["pad_vocab_multiple"],
	        tie_embeddings=model_cfg["tie_embeddings"],
	        initializer_range=model_cfg["initializer_range"],
	        device=device,
	        dtype=dtype,
	    )

	    stabilizer = str(arch_cfg.get("bc_stabilizer", "bcnorm")).lower()
	    if stabilizer != "bcnorm":
	        # The adapter package lives under ``<repo root>/src``; make it importable.
	        src_dir = str(_REPO_ROOT / "src")
	        if src_dir not in sys.path:
	            sys.path.insert(0, src_dir)
	        from nfmamba.adapters.bc_stabilizer import install_bc_stabilizer

	        report = install_bc_stabilizer(
	            model,
	            name=stabilizer,
	            stabilize_b=bool(arch_cfg.get("stabilize_b", True)),
	            stabilize_c=bool(arch_cfg.get("stabilize_c", True)),
	            squash_before_bias=bool(arch_cfg.get("squash_before_bias", False)),
	        )
	        summary = (
	            f"[model] BC stabilizer = {report.name!r} "
	            f"(replaced={report.replaced}, B={report.stabilize_b}, C={report.stabilize_c}, "
	            f"squash_before_bias={report.squash_before_bias})"
	        )
	        print(summary, flush=True)

	    # Runs after the optional swap, so modules added by it are converted too.
	    if dtype is not None:
	        model.to(dtype=dtype)
	    if device is not None:
	        model.to(device=device)

	    return model