SubspaceDecoder_mla192-96-0 / configuration_shared_subspace_decoder.py

Up one level

86fbb38 verified 3 months ago

12 kB

	"""# `shared_space_config.py`

	#### `*Config`
	"""

	from typing import Optional

	import torch
	from torch import nn

	from transformers.configuration_utils import PretrainedConfig
	from transformers.modeling_utils import PreTrainedModel

	"""`def make_shorthand`"""

	def make_shorthand(model_cfg):
	    """
	    Build a compact, human-readable name describing a model configuration.

	    The result has four parts joined by " - ": the attention layout, the
	    MLP layout, the hidden size (`h<N>`) and the layer count (`l<N>`), e.g.
	    `2mha + mla.96.64 - mlp.2048 - h512 - l6`.

	    Args:
	        model_cfg: Any object exposing `num_dense_layers`,
	            `num_hidden_layers`, `hidden_size`, `intermediate_size`,
	            `q_shared_dim`, `kv_shared_dim`, `o_shared_dim`,
	            `ffn_decompose` and `ffn_rank` (the last one is only read when
	            `ffn_decompose` is truthy).

	    Returns:
	        The shorthand string.
	    """
	    dense_count = str(model_cfg.num_dense_layers)

	    # Attention: "<dense>mha + mla.<q>.<kv>[.<o>]".
	    # When no output subspace is configured (`o_shared_dim is None`) the
	    # trailing ".<o>" component is left out altogether.
	    attn_fields = [
	        dense_count + "mha + mla",
	        str(model_cfg.q_shared_dim),
	        str(model_cfg.kv_shared_dim),
	    ]
	    if model_cfg.o_shared_dim is not None:
	        attn_fields.append(str(model_cfg.o_shared_dim))
	    attn_str = ".".join(attn_fields)

	    # MLP: a plain "mlp.<width>", or — when the FFN is decomposed —
	    # "<dense>mlp.<width> + <remaining>dcmp.x<width>.<rank>".
	    ffn_width = str(model_cfg.intermediate_size)
	    if not model_cfg.ffn_decompose:
	        mlp_str = "mlp." + ffn_width
	    else:
	        decomposed_count = str(
	            model_cfg.num_hidden_layers - model_cfg.num_dense_layers
	        )
	        ffn_rank = str(model_cfg.ffn_rank)
	        mlp_str = (
	            f"{dense_count}mlp.{ffn_width} + "
	            f"{decomposed_count}dcmp.x{ffn_width}.{ffn_rank}"
	        )

	    # A full *run* name would additionally carry training settings (total
	    # element count, batch size, learning rate, sequence length); only the
	    # architecture shorthand is assembled here.
	    return (
	        f"{attn_str} - {mlp_str} - "
	        f"h{model_cfg.hidden_size} - l{model_cfg.num_hidden_layers}"
	    )


	class SharedSpaceDecoderConfig(PretrainedConfig):
	    r"""
	    Configuration for the shared-subspace decoder
	    (`model_type = "shared_subspace_decoder"`).

	    Extends the HuggingFace `PretrainedConfig` to support architectural
	    variations including:
	    - Multi-Head Latent Attention (MLA)
	    - Decomposed MLPs (low-rank FFNs)
	    - Flexible attention backends (eager, flash, sdpa)
	    - Explicit shared subspaces for Q, K, V, and O projections

	    This config does not infer any defaults based on `hidden_size`; every
	    dimension and rank is stored exactly as passed. Any additional keyword
	    arguments are forwarded to `PretrainedConfig.__init__`.

	    NOTE: `__init__` currently performs **no** validation (the call to
	    `_validate()` is disabled), so inconsistent settings are accepted
	    silently. Call `_validate()` explicitly on a fully populated config to
	    check it.

	    ----------------------
	    Core Model Parameters:
	    ----------------------
	    - vocab_size (`int`) — Vocabulary size.
	    - hidden_size (`int`) — Model hidden dimension.
	    - num_hidden_layers (`int`) — Number of transformer blocks.
	    - intermediate_size (`int`) — Feed-forward hidden dimension.
	    - hidden_dropout_prob (`float`) — Dropout after projections and FFNs.
	    - attention_dropout_prob (`float`) — Dropout applied to attention scores.
	    - max_position_embeddings (`int`) — Max sequence length.
	    - initializer_range (`float`) — Stddev of weight init.

	    - layer_norm_eps (`float`) — Epsilon for LayerNorm.
	    - rms_norm_eps (`float`) — Epsilon for RMSNorm.
	    - norm_type (`str`) — `"layernorm"` or `"rmsnorm"`.

	    - classifier_dropout (`float` or None) — Dropout for final classifier.

	    - vocab_subspace (`bool`) — Whether the vocabulary subspace is enabled.
	    - vocab_rank (`int` or None) — Rank of the vocabulary subspace
	      (required by `_validate()` if `vocab_subspace=True`).
	    - tie_word_embeddings (`bool`) — Stored on the config as given.

	    ----------------------------------
	    Multi-Head Latent Attention (MLA):
	    ----------------------------------
	    - num_attention_heads (`int`) — Number of attention heads.

	    - q_shared_dim (`int` or None) — Rank of the shared query subspace.
	    - kv_shared_dim (`int` or None) — Rank of the shared key/value subspace.
	    - o_shared_dim (`int` or None) — Rank of the shared output subspace.
	      If None, no output subspace is used.

	    - qk_private_dim (`int`) — Query/key private dimension per head.
	    - vo_private_dim (`int`) — Value/output private dimension per head.

	    - rope_dims (`int`) — Number of head dimensions carrying RoPE.
	    - nope_dims (`int`) — Non-positional encoding dimensions.
	    - rope_theta (`float`) — Base frequency used for RoPE.
	    - rope_scaling (`dict` or None) — HF-style scaling dict for RoPE.
	    - attention_bias (`bool`) — Whether to include bias terms in Q/K/V projections.
	    - num_dense_layers (`int`) — Number of leading layers that do not use
	      subspaces for attention or FFNs.
	    - attention_backend (`str`) — Must be one of `"eager"`, `"flash_attention_2"`, or `"sdpa"`.

	    ------------------------------
	    Decomposed MLP (Low-Rank FFN):
	    ------------------------------
	    - ffn_decompose (`bool`) — Whether to enable low-rank FFNs.
	    - ffn_rank (`int` or None) — Rank of the shared FFN latent space
	      (required by `_validate()` if `ffn_decompose=True`).

	    ----------------------
	    Validation Behavior:
	    ----------------------
	    `_validate()` raises `ValueError` if:
	    - `num_dense_layers` exceeds `num_hidden_layers`.
	    - `vocab_subspace` is enabled without specifying `vocab_rank`.
	    - There are subspace layers but neither `q_shared_dim` nor
	      `kv_shared_dim` is set.
	    - `qk_private_dim`, `vo_private_dim` or `nope_dims` is None.
	    - FFN decomposition is enabled without specifying `ffn_rank`, or with
	      no non-dense layers left to decompose.
	    - An unknown `attention_backend` or `norm_type` is provided.
	    """

	    model_type = "shared_subspace_decoder"

	    def __init__(
	        self,

	        # === Core Model ===
	        vocab_size: int = 30522,
	        hidden_size: int = 512,
	        num_hidden_layers: int = 12,

	        intermediate_size: int = 3072,

	        hidden_dropout_prob: float = 0.1,
	        attention_dropout_prob: float = 0.1,
	        max_position_embeddings: int = 2048,
	        initializer_range: float = 0.02,
	        layer_norm_eps: float = 1e-12,
	        rms_norm_eps: float = 1e-6,  # Their default, but confirm in config.
	        norm_type: str = "layernorm",  # Choice between "layernorm" and "rmsnorm"
	        classifier_dropout: Optional[float] = None,

	        vocab_subspace: bool = False,
	        vocab_rank: Optional[int] = None,
	        tie_word_embeddings: bool = True,

	        # === Multi-Head Latent Attention ===
	        num_attention_heads: int = 16,
	        rope_dims: int = 16,

	        q_shared_dim: Optional[int] = None,
	        kv_shared_dim: Optional[int] = None,

	        o_shared_dim: Optional[int] = None,  # If None, no output subspace is used

	        # Private head dimensions
	        qk_private_dim: Optional[int] = None,  # Query/key private dimension per head
	        vo_private_dim: Optional[int] = None,  # Value/output private dimension per head
	        nope_dims: Optional[int] = None,  # Non-positional encoding dimensions

	        attention_backend: str = "eager",
	        rope_theta: float = 10000.0,
	        rope_scaling: Optional[dict] = None,
	        attention_bias: bool = False,

	        # === MLA Composition ===
	        num_dense_layers: int = 12,  # dense MHA layers before MLA starts

	        # === Decomposed MLP ===
	        ffn_decompose: bool = False,
	        ffn_rank: Optional[int] = None,
	        **kwargs
	    ) -> None:
	        """
	        Store every setting on the instance without validating it.

	        Keyword arguments not named in the signature are passed through to
	        `PretrainedConfig.__init__`. See the class docstring for the meaning
	        of each parameter.
	        """
	        super().__init__(**kwargs)

	        # === Core Model ===
	        self.vocab_size = vocab_size
	        self.hidden_size = hidden_size
	        self.num_hidden_layers = num_hidden_layers
	        self.intermediate_size = intermediate_size
	        self.hidden_dropout_prob = hidden_dropout_prob
	        self.attention_dropout_prob = attention_dropout_prob
	        self.max_position_embeddings = max_position_embeddings
	        self.initializer_range = initializer_range
	        self.layer_norm_eps = layer_norm_eps
	        self.rms_norm_eps = rms_norm_eps
	        self.norm_type = norm_type
	        self.classifier_dropout = classifier_dropout

	        self.vocab_subspace = vocab_subspace
	        self.vocab_rank = vocab_rank
	        # Assigned after `super().__init__()` so the explicit argument is the
	        # value that ends up stored on the instance.
	        self.tie_word_embeddings = tie_word_embeddings

	        # === MLA ===
	        self.num_attention_heads = num_attention_heads
	        self.rope_dims = rope_dims

	        self.q_shared_dim = q_shared_dim
	        self.kv_shared_dim = kv_shared_dim
	        self.o_shared_dim = o_shared_dim

	        # Private head dimensions
	        self.qk_private_dim = qk_private_dim
	        self.vo_private_dim = vo_private_dim
	        self.nope_dims = nope_dims
	        self.rope_theta = rope_theta
	        self.rope_scaling = rope_scaling
	        self.attention_bias = attention_bias
	        self.num_dense_layers = num_dense_layers

	        # === Decomposed FFN ===
	        self.ffn_decompose = ffn_decompose
	        self.ffn_rank = ffn_rank

	        # === Attention backend ===
	        self.attention_backend = attention_backend

	        # === Validation ===
	        # TODO - Somewhere during training these get instantiated with bad
	        # values...
	        # NOTE(review): a config built purely from the defaults above fails
	        # `_validate()` (the private head dims and `nope_dims` default to
	        # None). Presumably some code path re-creates the config with default
	        # arguments — confirm before re-enabling the call below.
	        # self._validate()

	    def _validate(self) -> None:
	        """
	        Check the stored settings for consistency.

	        Not called automatically; see the note in `__init__`.

	        Raises:
	            ValueError: On the first inconsistent or missing setting found
	                (layer counts, vocab subspace, MLA dimensions, decomposed
	                FFN, attention backend, norm type — checked in that order).
	        """
	        # === Model ===
	        if self.num_dense_layers > self.num_hidden_layers:
	            raise ValueError("`num_dense_layers` must be <= `num_hidden_layers`")
	        if self.vocab_subspace and self.vocab_rank is None:
	            raise ValueError("`vocab_rank` must be set when `vocab_subspace=True`")

	        # === MLA Validation ===
	        # At least one of q_shared_dim or kv_shared_dim must be set if we have subspace layers
	        if self.num_dense_layers < self.num_hidden_layers and self.q_shared_dim is None and self.kv_shared_dim is None:
	            raise ValueError("At least one of q_shared_dim or kv_shared_dim must be set when there are subspace layers")

	        # Validate that private dimensions are set
	        if self.qk_private_dim is None or self.vo_private_dim is None:
	            raise ValueError("Must set qk_private_dim and vo_private_dim")
	        if self.nope_dims is None:
	            raise ValueError("Must set nope_dims")

	        # === Decomposed FFN ===
	        if self.ffn_decompose and self.ffn_rank is None:
	            raise ValueError("`ffn_rank` must be set when `ffn_decompose=True`")
	        # Decomposition only applies to non-dense layers, so at least one
	        # such layer must exist.
	        if self.ffn_decompose and self.num_dense_layers >= self.num_hidden_layers:
	            raise ValueError("`ffn_decompose` was set but `num_dense_layers` is >= number of layers")

	        # === Attention Backend ===
	        valid_backends = ["eager", "flash_attention_2", "sdpa"]
	        if self.attention_backend not in valid_backends:
	            raise ValueError(f"Unknown attention backend: {self.attention_backend}, options are {valid_backends}")

	        # === Norm Type ===
	        valid_norm_types = ["layernorm", "rmsnorm"]
	        if self.norm_type not in valid_norm_types:
	            raise ValueError(f"Unknown norm type: {self.norm_type}, options are {valid_norm_types}")

	#### `get_config`

	import json

	def get_config(filename):
	    """
	    Load a JSON config file and build the model configuration from it.

	    The file must contain a top-level "model" object whose keys match the
	    named parameters of `SharedSpaceDecoderConfig.__init__` exactly: every
	    parameter has to be present (defaults are deliberately not relied on)
	    and no unknown keys are allowed.

	    Args:
	        filename: Path to the JSON config file (read as UTF-8).

	    Returns:
	        A tuple `(full_cfg, model_cfg)`: the parsed JSON document and the
	        `SharedSpaceDecoderConfig` built from `full_cfg["model"]`.

	    Raises:
	        ValueError: If the "model" section contains unknown keys or lacks
	            any of the expected ones.
	        KeyError: If the file has no "model" section.
	    """
	    # Local import keeps this function self-contained.
	    import inspect

	    # Load the config file. JSON is UTF-8 by specification, so do not depend
	    # on the platform's default text encoding.
	    with open(filename, encoding="utf-8") as f:
	        full_cfg = json.load(f)

	    # Strict key check on the model configuration.

	    # Collect the named parameters accepted by `*Config.__init__`, leaving
	    # out `self` and the catch-all `*args` / `**kwargs`. Reading the
	    # signature (rather than `__code__.co_varnames`) means local variables
	    # inside `__init__` can never be mistaken for config keys.
	    init_params = inspect.signature(SharedSpaceDecoderConfig.__init__).parameters
	    catch_all_kinds = (
	        inspect.Parameter.VAR_POSITIONAL,
	        inspect.Parameter.VAR_KEYWORD,
	    )
	    valid_keys = {
	        name
	        for name, param in init_params.items()
	        if name != "self" and param.kind not in catch_all_kinds
	    }

	    # Compare the set of keys in the json file vs `*Config`
	    model_keys = set(full_cfg["model"])
	    extra_keys = model_keys - valid_keys
	    missing_keys = valid_keys - model_keys

	    # If there are any keys in the json that `*Config` does not know,
	    # list them for the user.
	    if extra_keys:
	        raise ValueError(f"Unknown keys in config: {sorted(extra_keys)}")

	    # If the json config is missing expected keys, list them for the user.
	    if missing_keys:
	        raise ValueError(f"config json is missing: {sorted(missing_keys)}")

	    # The asterisks unpack the dictionary into keyword arguments, as though
	    # all of the settings were written out individually.
	    model_cfg = SharedSpaceDecoderConfig(**full_cfg["model"])

	    return full_cfg, model_cfg