HypeNet-2B / modeling_hybrid.py

Upload folder using huggingface_hub

d9612e3 verified 2 months ago

26.8 kB

	from typing import Optional, Tuple, Union, List, Dict, Any
	from functools import partial

	import torch
	from torch import nn, Tensor
	from torch.utils.checkpoint import checkpoint

	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
	from transformers.generation import GenerationMixin
	from transformers.modeling_attn_mask_utils import AttentionMaskConverter
	from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	CausalLMOutputWithPast,
	)
	from cut_cross_entropy import linear_cross_entropy
	from transformers.modeling_utils import PreTrainedModel
	from transformers.processing_utils import Unpack
	from transformers.utils import auto_docstring, can_return_tuple, logging, is_torch_flex_attn_available
	from .configuration_hybrid import HybridConfig
	from .modeling_qwen3 import Qwen3RMSNorm, Qwen3Attention, Qwen3MLP, Qwen3RotaryEmbedding
	# from .gdn import GatedDeltaNet
	# from .mamba2 import Mamba2Mixer
	from .lightning_attn import LightningAttention
	from .cache import HybridCache
	# from .kda import KimiDeltaAttention

	if is_torch_flex_attn_available():
	from torch.nn.attention.flex_attention import BlockMask

	from transformers.integrations.flex_attention import make_flex_block_causal_mask


	# Module-level logger, obtained from the `logging` helper imported from `transformers.utils`.
	logger = logging.get_logger(__name__)


	class HybridDecoderLayer(nn.Module):
	    """Pre-norm decoder block: a per-layer token mixer followed by an MLP.

	    The mixer kind is read from ``config.mixer_types[layer_idx]`` and the
	    resulting module is always stored as ``self.self_attn``, whatever its kind.
	    Both sub-blocks are wrapped in a residual connection and preceded by an
	    RMSNorm.
	    """

	    # Both spellings are accepted for the lightning-attention mixer; `__init__`
	    # and `forward` must agree on this set.
	    _LIGHTNING_MIXER_TYPES = ("lightning-attn", "lightning_attn")

	    @staticmethod
	    def _resolve_mixer_class(class_name: str, mixer_type: str):
	        """Look up an optional mixer class in this module's globals.

	        Some mixer implementations are not imported by this file. Resolving
	        them by name turns what would be a bare ``NameError`` into an explicit
	        ``NotImplementedError`` and still works if the import is enabled.

	        Raises:
	            NotImplementedError: if ``class_name`` is not defined in this module.
	        """
	        mixer_cls = globals().get(class_name)
	        if mixer_cls is None:
	            raise NotImplementedError(
	                f"Mixer type '{mixer_type}' requires `{class_name}`, which is not imported in this module."
	            )
	        return mixer_cls

	    def __init__(self, config: HybridConfig, layer_idx: int):
	        """Build the mixer selected for ``layer_idx`` plus the MLP and the two norms.

	        Args:
	            config: Model configuration; ``config.mixer_types[layer_idx]`` selects the mixer.
	            layer_idx: Index of this layer in the decoder stack.

	        Raises:
	            NotImplementedError: for mixer types that are recognised but unavailable.
	            ValueError: for an unrecognised mixer type.
	        """
	        super().__init__()
	        self.config = config
	        self.hidden_size = config.hidden_size
	        self.layer_idx = layer_idx
	        mixer_type = config.mixer_types[layer_idx]
	        self.mixer_type = mixer_type

	        if mixer_type == 'attn':
	            self.self_attn = Qwen3Attention(
	                config=config,
	                layer_idx=layer_idx,
	            )
	        elif mixer_type == 'mamba2':
	            mamba2_cls = self._resolve_mixer_class("Mamba2Mixer", mixer_type)
	            self.self_attn = mamba2_cls(
	                layer_idx=layer_idx,
	                hidden_size=config.hidden_size,
	                num_heads=config.num_attention_heads,
	                n_groups=config.mamba2_n_groups,
	                expand_ratio=config.mamba2_expand_ratio,
	                conv_kernel=config.mamba2_conv_kernel,
	                state_size=config.head_dim,
	                head_dim=config.head_dim,
	                use_bias=config.mamba2_bias,
	                hidden_act=config.mamba2_hidden_act,
	            )
	        elif mixer_type == 'gdn':
	            gdn_cls = self._resolve_mixer_class("GatedDeltaNet", mixer_type)
	            self.self_attn = gdn_cls(
	                layer_idx=layer_idx,
	                hidden_size=config.hidden_size,
	                expand_v=config.gdn_expand_v,
	                num_heads=config.gdn_nh,
	                num_kv_heads=config.gdn_nkv,
	                key_dim=config.head_dim,
	                val_dim=config.head_dim,
	                use_gate=config.gdn_use_gate,
	                use_short_conv=config.gdn_use_short_conv,
	                activation=config.gdn_activation,
	                qk_norm=config.gdn_use_qk_norm,
	                use_rope=config.gdn_use_rope,
	            )
	        elif mixer_type == 'gla':
	            raise NotImplementedError("GatedLinearAttention is not implemented")
	        elif mixer_type in self._LIGHTNING_MIXER_TYPES:
	            self.self_attn = LightningAttention(
	                layer_idx=layer_idx,
	                hidden_size=config.hidden_size,
	                num_attention_heads=config.lightning_nh,
	                num_key_value_heads=config.lightning_nkv,
	                head_dim=config.lightning_head_dim,
	                attention_dropout=config.attention_dropout,
	                use_output_gate=config.lightning_use_output_gate,
	                attention_bias=config.attention_bias,
	                rms_norm_eps=config.rms_norm_eps,
	                use_rope=config.lightning_use_rope,
	                use_output_norm=config.lightning_use_output_norm,
	                qk_norm=config.lightning_use_qk_norm,
	                scale=config.lightning_scale,
	                use_short_conv=config.lightning_use_short_conv,
	                conv_size=config.lightning_conv_size,
	            )
	        elif mixer_type == 'kda':
	            kda_cls = self._resolve_mixer_class("KimiDeltaAttention", mixer_type)
	            self.self_attn = kda_cls(config=config, layer_idx=layer_idx)
	        elif mixer_type == 'rwkv7':
	            raise NotImplementedError("RWKV7Attention is not implemented")
	        else:
	            raise ValueError(f"Invalid mixer type: {mixer_type}")

	        self.mlp = Qwen3MLP(config)
	        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	        if (
	            config.sliding_window and config._attn_implementation != "flash_attention_2"
	        ):  # diff with Llama is this warning
	            logger.warning_once(
	                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
	                "unexpected results may be encountered."
	            )

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        output_attentions: Optional[bool] = False,
	        use_cache: Optional[bool] = False,
	        cache_position: Optional[torch.LongTensor] = None,
	        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
	        **kwargs: Unpack[FlashAttentionKwargs],
	    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor, Cache]]]:
	        """Run the token mixer and the MLP, each inside a residual connection.

	        Args:
	            hidden_states: Input activations for this layer.
	            attention_mask: Mask forwarded unchanged to the mixer.
	            position_ids: Position indices forwarded unchanged to the mixer.
	            past_key_values: Cache object forwarded to the mixer.
	            output_attentions: Forwarded to the mixer.
	            use_cache: Forwarded to the mixer.
	            cache_position: Forwarded to the mixer.
	            position_embeddings: Rotary embeddings; replaced by ``None`` when the
	                config disables RoPE for this layer's mixer type.
	            **kwargs: Extra keyword arguments forwarded to the mixer.

	        Returns:
	            ``(hidden_states, self_attn_weights, past_key_values)``, where the last
	            two items are whatever the mixer returned.
	        """
	        # ==== Time mixing ====
	        residual = hidden_states
	        hidden_states = self.input_layernorm(hidden_states)

	        # Drop the rotary embeddings when RoPE is disabled for this mixer type.
	        # Each config flag is only read for its own mixer type.
	        if self.mixer_type == "attn" and not self.config.attn_use_rope:
	            position_embeddings = None
	        elif self.mixer_type in self._LIGHTNING_MIXER_TYPES and not self.config.lightning_use_rope:
	            position_embeddings = None
	        elif self.mixer_type == "kda" and not self.config.kda_use_rope:
	            position_embeddings = None
	        elif self.mixer_type == "gdn" and not self.config.gdn_use_rope:
	            position_embeddings = None

	        # TODO: Also handle other kinds of token mixers
	        hidden_states, self_attn_weights, past_key_values = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            output_attentions=output_attentions,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            position_embeddings=position_embeddings,
	            **kwargs,
	        )
	        hidden_states = residual + hidden_states

	        # ==== Channel mixing ====
	        residual = hidden_states
	        hidden_states = self.post_attention_layernorm(hidden_states)
	        hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states

	        return (hidden_states, self_attn_weights, past_key_values)


	# @auto_docstring
	class HybridPreTrainedModel(PreTrainedModel):
	    """Common base for the hybrid models: capability flags and weight initialisation."""

	    config_class = HybridConfig
	    base_model_prefix = "model"
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["HybridDecoderLayer"]
	    _skip_keys_device_placement = ["past_key_values"]
	    _supports_flash_attn_2 = True
	    _supports_sdpa = True
	    _supports_flex_attn = True
	    _supports_cache_class = True
	    _supports_quantized_cache = True
	    _supports_static_cache = True
	    _supports_attention_backend = True

	    def _init_weights(self, module: nn.Module):
	        """Initialise one submodule in place.

	        Linear and embedding weights are drawn from a normal distribution with
	        standard deviation ``config.initializer_range``; linear biases and the
	        embedding padding row are zeroed; RMSNorm weights are set to one.
	        Any other module type is left untouched.
	        """
	        init_std = self.config.initializer_range

	        if isinstance(module, nn.Linear):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            bias = module.bias
	            if bias is not None:
	                bias.data.zero_()
	            return

	        if isinstance(module, nn.Embedding):
	            module.weight.data.normal_(mean=0.0, std=init_std)
	            pad_row = module.padding_idx
	            if pad_row is not None:
	                module.weight.data[pad_row].zero_()
	            return

	        if isinstance(module, Qwen3RMSNorm):
	            module.weight.data.fill_(1.0)


	# @auto_docstring
	class HybridModel(HybridPreTrainedModel):
	    """Decoder stack: token embedding, ``config.num_hidden_layers`` hybrid layers and a final RMSNorm."""

	    def __init__(self, config: HybridConfig):
	        super().__init__(config)
	        self.padding_idx = config.pad_token_id
	        self.vocab_size = config.vocab_size

	        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
	        decoder_layers = [HybridDecoderLayer(config, idx) for idx in range(config.num_hidden_layers)]
	        self.layers = nn.ModuleList(decoder_layers)
	        self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
	        self.rotary_emb = Qwen3RotaryEmbedding(config=config)
	        self.gradient_checkpointing = False

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        return self.embed_tokens

	    def set_input_embeddings(self, value):
	        self.embed_tokens = value

	    @can_return_tuple
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
	    ) -> BaseModelOutputWithPast:
	        # Unset flags fall back to the values stored on the config.
	        if output_attentions is None:
	            output_attentions = self.config.output_attentions
	        if output_hidden_states is None:
	            output_hidden_states = self.config.output_hidden_states
	        if use_cache is None:
	            use_cache = self.config.use_cache

	        # Exactly one of the two input forms must be supplied.
	        has_ids = input_ids is not None
	        has_embeds = inputs_embeds is not None
	        if has_ids == has_embeds:
	            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

	        checkpointing = self.gradient_checkpointing and self.training
	        if checkpointing and use_cache:
	            logger.warning_once(
	                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
	            )
	            use_cache = False

	        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
	        if past_key_values is not None and not isinstance(past_key_values, Cache):
	            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

	        if inputs_embeds is None:
	            inputs_embeds = self.embed_tokens(input_ids)

	        # A missing cache, or a plain DynamicCache, is swapped for a fresh HybridCache.
	        # NOTE(review): nothing is copied out of a caller-supplied DynamicCache here — confirm this is intended.
	        if use_cache and (past_key_values is None or isinstance(past_key_values, DynamicCache)):
	            past_key_values = HybridCache()

	        if cache_position is None:
	            seen_tokens = 0 if past_key_values is None else past_key_values.get_seq_length()
	            cache_position = torch.arange(
	                seen_tokens, seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
	            )

	        if position_ids is None:
	            position_ids = cache_position.unsqueeze(0)

	        causal_mask = self._update_causal_mask(
	            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
	        )

	        hidden_states = inputs_embeds

	        # Rotary embeddings are computed once and shared by every decoder layer.
	        position_embeddings = self.rotary_emb(hidden_states, position_ids)

	        collected_hidden_states = () if output_hidden_states else None
	        collected_attentions = () if output_attentions else None

	        for layer in self.layers:
	            if output_hidden_states:
	                collected_hidden_states = collected_hidden_states + (hidden_states,)

	            # Under gradient checkpointing the layer call goes through the non-reentrant checkpoint.
	            run_layer = partial(checkpoint, layer, use_reentrant=False) if checkpointing else layer

	            layer_result = run_layer(
	                hidden_states,
	                attention_mask=causal_mask,
	                position_ids=position_ids,
	                past_key_values=past_key_values,
	                output_attentions=output_attentions,
	                use_cache=use_cache,
	                cache_position=cache_position,
	                position_embeddings=position_embeddings,
	                **flash_attn_kwargs,
	            )

	            hidden_states = layer_result[0]

	            if output_attentions:
	                collected_attentions = collected_attentions + (layer_result[1],)

	        hidden_states = self.norm(hidden_states)

	        # The normalised output of the last layer closes the hidden-state list.
	        if output_hidden_states:
	            collected_hidden_states = collected_hidden_states + (hidden_states,)

	        return BaseModelOutputWithPast(
	            last_hidden_state=hidden_states,
	            past_key_values=past_key_values if use_cache else None,
	            hidden_states=collected_hidden_states,
	            attentions=collected_attentions,
	        )

	    def _update_causal_mask(
	        self,
	        attention_mask: Union[torch.Tensor, "BlockMask"],
	        input_tensor: torch.Tensor,
	        cache_position: torch.Tensor,
	        past_key_values: Cache,
	        output_attentions: bool = False,
	    ):
	        """Return the mask to hand to the decoder layers for the configured attention backend.

	        The result is ``None``, the incoming mask, a flex-attention block mask, or a 4D
	        additive mask, depending on ``config._attn_implementation`` and the cache type.

	        Raises:
	            ValueError: with ``flash_attention_2``, a cache and a mask whose last column
	                does not sum to the batch size (treated as right padding).
	        """
	        attn_impl = self.config._attn_implementation

	        if attn_impl == "flash_attention_2":
	            if attention_mask is not None and past_key_values is not None:
	                right_padded = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
	                if right_padded:
	                    raise ValueError(
	                        "You are attempting to perform batched generation with padding_side='right'"
	                        " this may lead to unexpected behaviour for Flash Attention version of Hybrid. Make sure to "
	                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
	                    )
	            # The 2D mask is only passed on when it actually masks something.
	            if attention_mask is not None and 0.0 in attention_mask:
	                return attention_mask
	            return None

	        if attn_impl == "flex_attention":
	            if isinstance(attention_mask, torch.Tensor):
	                attention_mask = make_flex_block_causal_mask(attention_mask)
	            return attention_mask

	        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
	        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
	        # to infer the attention mask.
	        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
	        using_static_cache = isinstance(past_key_values, StaticCache)
	        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
	        fixed_size_cache = using_static_cache or using_sliding_window_cache

	        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
	        if (
	            attn_impl == "sdpa"
	            and not fixed_size_cache
	            and not output_attentions
	            and AttentionMaskConverter._ignore_causal_mask_sdpa(
	                attention_mask,
	                inputs_embeds=input_tensor,
	                past_key_values_length=past_seen_tokens,
	                sliding_window=self.config.sliding_window,
	                is_training=self.training,
	            )
	        ):
	            return None

	        dtype = input_tensor.dtype
	        min_dtype = torch.finfo(dtype).min
	        sequence_length = input_tensor.shape[1]

	        if fixed_size_cache:
	            # SlidingWindowCache or StaticCache
	            target_length = past_key_values.get_max_cache_shape()
	        elif isinstance(attention_mask, torch.Tensor):
	            # DynamicCache or no cache, with an explicit mask
	            target_length = attention_mask.shape[-1]
	        else:
	            # DynamicCache or no cache, without a mask
	            target_length = past_seen_tokens + sequence_length + 1

	        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
	        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
	            attention_mask,
	            sequence_length=sequence_length,
	            target_length=target_length,
	            dtype=dtype,
	            cache_position=cache_position,
	            batch_size=input_tensor.shape[0],
	            config=self.config,
	            past_key_values=past_key_values,
	        )

	        needs_unmasking = (
	            attn_impl == "sdpa"
	            and attention_mask is not None
	            and attention_mask.device.type in ["cuda", "xpu", "npu"]
	            and not output_attentions
	        )
	        if needs_unmasking:
	            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
	            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
	            # Details: https://github.com/pytorch/pytorch/issues/110213
	            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

	        return causal_mask

	    @staticmethod
	    def _prepare_4d_causal_attention_mask_with_cache_position(
	        attention_mask: torch.Tensor,
	        sequence_length: int,
	        target_length: int,
	        dtype: torch.dtype,
	        cache_position: torch.Tensor,
	        batch_size: int,
	        config: HybridConfig,
	        past_key_values: Cache,
	    ):
	        """
	        Build a 4D causal mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
	        `(batch_size, key_value_length)`. A mask that is already 4D is returned unchanged.

	        Args:
	            attention_mask (`torch.Tensor`):
	                A 2D mask of shape `(batch_size, key_value_length)`, a 4D mask of shape
	                `(batch_size, 1, query_length, key_value_length)`, or `None`.
	            sequence_length (`int`):
	                Number of query positions being processed.
	            target_length (`int`):
	                Number of key/value positions the mask must cover; with a static cache this is the cache
	                length, so the not-yet-filled part of the cache is covered too.
	            dtype (`torch.dtype`):
	                The dtype of the 4D mask.
	            cache_position (`torch.Tensor`):
	                Indices of the input tokens within the full sequence.
	            batch_size (`int`):
	                Batch size.
	            config (`HybridConfig`):
	                The model configuration; its text config supplies the sliding-window settings.
	            past_key_values (`Cache`):
	                The cache object currently in use.
	        """
	        if attention_mask is not None and attention_mask.dim() == 4:
	            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
	            return attention_mask

	        min_dtype = torch.finfo(dtype).min
	        device = cache_position.device
	        query_positions = cache_position.reshape(-1, 1)
	        key_positions = torch.arange(target_length, device=device)

	        causal_mask = torch.full(
	            (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
	        )
	        # True where a key lies strictly after the query position, i.e. must be masked.
	        diagonal_attend_mask = key_positions > query_positions

	        text_config = config.get_text_config()
	        if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
	            # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
	            # the check is needed to verify is current checkpoint was trained with sliding window or not
	            if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
	                sliding_attend_mask = key_positions <= (query_positions - text_config.sliding_window)
	                diagonal_attend_mask.bitwise_or_(sliding_attend_mask)

	        causal_mask *= diagonal_attend_mask
	        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)

	        if attention_mask is not None:
	            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
	            if attention_mask.shape[-1] > target_length:
	                attention_mask = attention_mask[:, :target_length]
	            mask_length = attention_mask.shape[-1]
	            # Positions that are causally visible (0) and padded (0) sum to 0 and get masked out.
	            combined = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
	                causal_mask.device
	            )
	            padding_mask = combined == 0
	            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
	                padding_mask, min_dtype
	            )

	        return causal_mask


	class KwargsForCausalLM(FlashAttentionKwargs):
	    """Keyword arguments accepted by the causal-LM forward pass; adds no fields to `FlashAttentionKwargs`."""


	# @auto_docstring
	class HybridForCausalLM(HybridPreTrainedModel, GenerationMixin):
	    """`HybridModel` with a bias-free linear language-modeling head on top."""

	    _tied_weights_keys = ["lm_head.weight"]
	    _tp_plan = {"lm_head": "colwise_rep"}
	    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

	    def __init__(self, config: HybridConfig):
	        super().__init__(config)
	        self.model = HybridModel(config)
	        self.vocab_size = config.vocab_size
	        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

	        # When set, the loss is computed with `linear_cross_entropy` from the hidden states and the head weight.
	        self.use_cce = True
	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        return self.model.embed_tokens

	    def set_input_embeddings(self, value):
	        self.model.embed_tokens = value

	    def get_output_embeddings(self):
	        return self.lm_head

	    def set_output_embeddings(self, new_embeddings):
	        self.lm_head = new_embeddings

	    def set_decoder(self, decoder):
	        self.model = decoder

	    def get_decoder(self):
	        return self.model

	    # @can_return_tuple
	    # @auto_docstring
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        logits_to_keep: Union[int, torch.Tensor] = 0,
	        return_logits: bool = False,
	        **kwargs: Unpack[KwargsForCausalLM],
	    ) -> CausalLMOutputWithPast:
	        """Run the decoder and, as requested, compute logits and/or the language-modeling loss.

	        Args:
	            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	                Targets for the language-modeling loss. Indices should either be in
	                `[0, ..., config.vocab_size]` or -100; positions set to `-100` are ignored.
	            logits_to_keep (`int` or `torch.Tensor`):
	                An `int` keeps the logits of the last `logits_to_keep` positions (`0` keeps all);
	                any other value is used directly as the index along the sequence dimension.
	            return_logits (`bool`):
	                Logits are computed when this is set or when the module is not in training mode.

	        The remaining arguments are forwarded to the underlying `HybridModel`.

	        Returns:
	            `CausalLMOutputWithPast` whose `loss` is `None` without `labels` and whose `logits`
	            is `None` when no logits were computed.
	        """
	        # Unset flags fall back to the values stored on the config.
	        if output_attentions is None:
	            output_attentions = self.config.output_attentions
	        if output_hidden_states is None:
	            output_hidden_states = self.config.output_hidden_states

	        outputs: BaseModelOutputWithPast = self.model(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            cache_position=cache_position,
	            **kwargs,
	        )

	        hidden_states: Tensor = outputs.last_hidden_state
	        loss = None
	        logits = None

	        if return_logits or not self.training:
	            # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
	            if isinstance(logits_to_keep, int):
	                kept_positions = slice(-logits_to_keep, None)
	            else:
	                kept_positions = logits_to_keep
	            logits = self.lm_head(hidden_states[:, kept_positions, :])

	        if labels is not None:
	            labels = labels.to(hidden_states.device)
	            if self.use_cce:
	                # Loss straight from hidden states and head weight; `shift=True` is passed to the library call.
	                loss = linear_cross_entropy(
	                    hidden_states,
	                    self.lm_head.weight,
	                    labels,
	                    shift=True,
	                )
	            else:
	                # This path recomputes logits over every position and upcasts them to float32.
	                logits = self.lm_head(hidden_states).to(torch.float32)
	                loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

	        return CausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=outputs.past_key_values,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )