Qwen3-Reranker-4B / model_executor /models /falcon_h1.py

update

4679932 8 months ago

26.9 kB

	# SPDX-License-Identifier: Apache-2.0
	# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
	"""Inference-only FalconH1 model."""
	from collections.abc import Iterable
	from typing import Optional

	import torch
	from torch import nn
	from transformers import FalconH1Config

	from vllm.attention.layer import Attention
	from vllm.config import CacheConfig, VllmConfig
	from vllm.distributed import divide, get_tensor_model_parallel_world_size
	from vllm.distributed.parallel_state import get_pp_group
	from vllm.forward_context import get_forward_context
	from vllm.model_executor.layers.activation import SiluAndMul
	from vllm.model_executor.layers.layernorm import RMSNorm
	from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
	QKVParallelLinear,
	RowParallelLinear)
	from vllm.model_executor.layers.logits_processor import LogitsProcessor
	from vllm.model_executor.layers.mamba.mamba2_metadata import (
	Mamba2Metadata, prepare_mamba2_metadata)
	from vllm.model_executor.layers.mamba.mamba_mixer2 import (
	MambaMixer2, extra_groups_for_head_shards)
	from vllm.model_executor.layers.quantization import QuantizationConfig
	from vllm.model_executor.layers.rotary_embedding import get_rope
	from vllm.model_executor.layers.vocab_parallel_embedding import (
	DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
	from vllm.model_executor.model_loader.weight_utils import default_weight_loader
	from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
	MambaCacheParams)
	from vllm.model_executor.sampling_metadata import SamplingMetadata
	from vllm.sequence import IntermediateTensors

	from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
	SupportsV0Only)
	from .utils import (PPMissingLayer, is_pp_missing_parameter,
	make_empty_intermediate_tensors_factory, make_layers,
	maybe_prefix)


	class FalconH1MLP(nn.Module):
	    """Gated feed-forward block of a FalconH1 layer.

	    A fused gate/up projection feeds ``SiluAndMul`` and a row-parallel down
	    projection. The two entries of ``config.mlp_multipliers`` scale the
	    gate columns (before the activation) and the final output respectively.
	    """

	    def __init__(
	        self,
	        config: FalconH1Config,
	        quant_config: Optional[QuantizationConfig] = None,
	        bias: bool = False,
	    ) -> None:
	        """Create the projections and read the scaling factors.

	        Args:
	            config: FalconH1 configuration; supplies ``hidden_size``,
	                ``intermediate_size``, ``mlp_multipliers`` and ``hidden_act``.
	            quant_config: Forwarded to both linear layers.
	            bias: Whether the two linear layers carry a bias term.

	        Raises:
	            ValueError: If ``config.hidden_act`` is anything other than
	                ``"silu"``.
	        """
	        super().__init__()
	        model_width = config.hidden_size
	        ffn_width = config.intermediate_size

	        # Gate and up projections share one merged layer with two outputs
	        # of equal width.
	        self.gate_up_proj = MergedColumnParallelLinear(
	            input_size=model_width,
	            output_sizes=[ffn_width, ffn_width],
	            bias=bias,
	            quant_config=quant_config,
	        )
	        self.down_proj = RowParallelLinear(
	            input_size=ffn_width,
	            output_size=model_width,
	            bias=bias,
	            quant_config=quant_config,
	        )

	        self.tp_size = get_tensor_model_parallel_world_size()
	        self.intermediate_size = ffn_width

	        gate_scale, down_scale = config.mlp_multipliers
	        self.gate_multiplier = gate_scale
	        self.down_multiplier = down_scale

	        if config.hidden_act != "silu":
	            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
	                             "Only silu is supported for now.")
	        self.act_fn = SiluAndMul()

	    def forward(self, x):
	        """Apply the scaled gated MLP.

	        The projection output is indexed as a 2-D tensor here; its leading
	        ``intermediate_size // tp_size`` columns are scaled in place by
	        ``gate_multiplier`` before the activation.
	        """
	        fused, _ = self.gate_up_proj(x)
	        gate_cols = self.intermediate_size // self.tp_size
	        fused[:, :gate_cols] *= self.gate_multiplier
	        activated = self.act_fn(fused)
	        projected, _ = self.down_proj(activated)
	        return projected * self.down_multiplier


	class FalconH1SSMDecoderLayer(nn.Module):
	    """State-space branch of a FalconH1 layer.

	    Wraps a ``MambaMixer2`` and supplies it with ``mup_vector``, a fixed
	    (non-learnable) scaling vector built from ``config.ssm_multipliers``.
	    """

	    def __init__(
	        self,
	        config: FalconH1Config,
	        cache_config: Optional[CacheConfig] = None,
	        quant_config: Optional[QuantizationConfig] = None,
	    ) -> None:
	        """Build the mixer and its scaling vector.

	        Args:
	            config: FalconH1 configuration.
	            cache_config: Accepted but not used by this class.
	            quant_config: Forwarded to ``MambaMixer2``.
	        """
	        super().__init__()
	        self.config = config
	        self.tp_size = get_tensor_model_parallel_world_size()

	        # An explicit ``mamba_d_ssm`` wins; otherwise derive the width from
	        # the expansion factor.
	        if config.mamba_d_ssm is None:
	            self.d_ssm = int(config.mamba_expand * config.hidden_size)
	        else:
	            self.d_ssm = config.mamba_d_ssm

	        mixer_kwargs = dict(
	            hidden_size=config.hidden_size,
	            ssm_state_size=config.mamba_d_state,
	            conv_kernel_size=config.mamba_d_conv,
	            intermediate_size=self.d_ssm,
	            use_conv_bias=config.mamba_conv_bias,
	            use_bias=config.mamba_proj_bias,
	            n_groups=config.mamba_n_groups,
	            num_heads=config.mamba_n_heads,
	            head_dim=config.mamba_d_head,
	            rms_norm_eps=config.rms_norm_eps,
	            activation=config.hidden_act,
	            quant_config=quant_config,
	            use_rms_norm=config.mamba_rms_norm,
	        )
	        self.mamba = MambaMixer2(**mixer_kwargs)

	        # Read n_groups back from the mixer rather than from the config:
	        # `MambaMixer2` may override the value it was given.
	        self.groups_time_state_size = (self.mamba.n_groups *
	                                       config.mamba_d_state)
	        self.zxbcdt_multipliers = config.ssm_multipliers
	        self._init_mup_vector()

	    def _init_mup_vector(self):
	        """
	        Build the non-learnable per-block scaling vector.

	        The vector holds element-wise multipliers applied to each separate
	        contiguous block of the output of the linear projection (in_proj)
	        before further processing (gating, convolution, SSM):

	        - Z block:  [0 : d_ssm]                      -> zxbcdt_multipliers[0]
	        - X block:  [d_ssm : 2 * d_ssm]              -> zxbcdt_multipliers[1]
	        - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  -> zxbcdt_multipliers[2]
	        - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
	                                                     -> zxbcdt_multipliers[3]
	        - dt block: [2 * d_ssm + 2 * G * S : end]    -> zxbcdt_multipliers[4]

	        where:
	        - d_ssm: Dimension of state-space model latent
	        - G: Number of groups (n_groups)
	        - S: SSM state size per group
	        - All indices are divided by tp_size to support tensor parallelism

	        The result has shape ``(1, width)`` and is registered as the
	        non-persistent buffer ``mup_vector``.
	        """
	        tp = self.tp_size
	        d_ssm = self.d_ssm
	        group_state = self.groups_time_state_size

	        width = (2 * d_ssm + 2 * group_state +
	                 self.config.mamba_n_heads) // tp

	        # Start offset of each block on this rank, in Z, X, B, C, dt order.
	        starts = (
	            0,
	            d_ssm // tp,
	            (2 * d_ssm) // tp,
	            (2 * d_ssm + group_state) // tp,
	            (2 * d_ssm + 2 * group_state) // tp,
	        )
	        # Each block stops where the next one starts; dt runs to the end.
	        stops = starts[1:] + (None, )

	        mup_vector = torch.ones(1, width)
	        for block_idx, (lo, hi) in enumerate(zip(starts, stops)):
	            mup_vector[:, lo:hi] *= self.zxbcdt_multipliers[block_idx]

	        self.register_buffer("mup_vector", mup_vector, persistent=False)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        residual: Optional[torch.Tensor],
	        mamba_cache_params: MambaCacheParams,
	        mamba2_metadata: Mamba2Metadata,
	        **kwargs,
	    ):
	        """Run the mixer and return ``(mixer_output, residual)``.

	        ``residual`` is handed back untouched and any extra keyword
	        arguments are ignored.
	        """
	        mixed = self.mamba(
	            hidden_states,
	            mamba_cache_params,
	            mamba2_metadata=mamba2_metadata,
	            mup_vector=self.mup_vector,
	        )
	        return mixed, residual


	class FalconH1AttentionDecoderLayer(nn.Module):
	    """Self-attention branch of a FalconH1 layer.

	    Uses a fused QKV projection, scales the keys by
	    ``config.key_multiplier``, applies rotary position embeddings to the
	    queries and keys, and projects the attention output back to
	    ``hidden_size``.
	    """

	    def __init__(
	        self,
	        config: FalconH1Config,
	        cache_config: Optional[CacheConfig] = None,
	        quant_config: Optional[QuantizationConfig] = None,
	        prefix: str = "",
	    ) -> None:
	        """Build the projections, rotary embedding and attention op.

	        Args:
	            config: FalconH1 configuration.
	            cache_config: Forwarded to ``Attention``.
	            quant_config: Forwarded to the QKV and output projections.
	            prefix: Module-name prefix used to name the sub-layers.
	        """
	        super().__init__()
	        rope_theta = getattr(config, "rope_theta", 1e11)
	        rope_scaling = getattr(config, "rope_scaling", None)
	        max_position_embeddings = getattr(config, "max_position_embeddings",
	                                          8192)
	        self.hidden_size = config.hidden_size
	        tp_size = get_tensor_model_parallel_world_size()
	        self.total_num_heads = config.num_attention_heads
	        assert self.total_num_heads % tp_size == 0
	        self.num_heads = self.total_num_heads // tp_size
	        self.total_num_kv_heads = config.num_key_value_heads
	        if self.total_num_kv_heads >= tp_size:
	            # Number of KV heads is greater than or equal to TP size, so we
	            # partition the KV heads across multiple tensor parallel GPUs.
	            assert self.total_num_kv_heads % tp_size == 0
	        else:
	            # Number of KV heads is less than TP size, so we replicate
	            # the KV heads across multiple tensor parallel GPUs.
	            assert tp_size % self.total_num_kv_heads == 0
	        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
	        # An explicit ``config.head_dim`` wins over the derived value.
	        self.head_dim = (config.hidden_size // self.total_num_heads if getattr(
	            config, "head_dim", None) is None else config.head_dim)
	        self.q_size = self.num_heads * self.head_dim
	        self.kv_size = self.num_kv_heads * self.head_dim
	        self.scaling = self.head_dim**-0.5
	        self.rope_theta = rope_theta
	        self.max_position_embeddings = max_position_embeddings

	        rotary_dim = self._resolve_rotary_dim(config, self.head_dim)

	        self.rotary_emb = get_rope(
	            head_size=self.head_dim,
	            rotary_dim=rotary_dim,
	            max_position=max_position_embeddings,
	            rope_scaling=rope_scaling,
	            base=rope_theta,
	            is_neox_style=True,
	            dtype=None,  # see impl of get_rope
	        )

	        self.qkv_proj = QKVParallelLinear(
	            config.hidden_size,
	            self.head_dim,
	            self.total_num_heads,
	            self.total_num_kv_heads,
	            bias=False,
	            quant_config=quant_config,
	            prefix=f"{prefix}.qkv_proj",
	        )
	        self.o_proj = RowParallelLinear(
	            self.total_num_heads * self.head_dim,
	            config.hidden_size,
	            bias=False,
	            quant_config=quant_config,
	            prefix=f"{prefix}.o_proj",
	        )

	        self.attn = Attention(
	            self.num_heads,
	            self.head_dim,
	            self.scaling,
	            num_kv_heads=self.num_kv_heads,
	            cache_config=cache_config,
	            prefix=f"{prefix}.attn",
	        )
	        self.key_multiplier = config.key_multiplier

	    @staticmethod
	    def _resolve_rotary_dim(config, head_dim: int) -> int:
	        """Pick the number of head dimensions that receive rotary embedding.

	        Precedence: ``config.partial_rotary_factor`` (a fraction of
	        ``head_dim``), then the legacy ``config.attn_rotary_emb``, then the
	        full ``head_dim``.
	        """
	        if hasattr(config, "partial_rotary_factor"):
	            # The factor may be a float (e.g. 1.0 or 0.5); a rotary
	            # dimension must be an integer, so truncate explicitly.
	            return int(head_dim * config.partial_rotary_factor)
	        if hasattr(config, "attn_rotary_emb"):
	            return config.attn_rotary_emb  # for backward compatibility
	        return head_dim  # default

	    def self_attention(
	        self,
	        positions: torch.Tensor,
	        hidden_states: torch.Tensor,
	        **kwargs,
	    ) -> torch.Tensor:
	        """Run attention on ``hidden_states`` and return the projected output.

	        Extra keyword arguments are accepted and ignored.
	        """
	        qkv, _ = self.qkv_proj(hidden_states)
	        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
	        # Keys are scaled before the rotary embedding is applied.
	        k = k * self.key_multiplier

	        q, k = self.rotary_emb(positions, q, k)
	        attn_output = self.attn(q, k, v)
	        output, _ = self.o_proj(attn_output)
	        return output

	    def forward(
	        self,
	        positions: torch.Tensor,
	        hidden_states: torch.Tensor,
	        residual: Optional[torch.Tensor],
	        **kwargs,
	    ):
	        """Return ``(attention_output, residual)``.

	        ``residual`` is handed back untouched and extra keyword arguments
	        are ignored.
	        """
	        hidden_states = self.self_attention(
	            positions=positions,
	            hidden_states=hidden_states,
	        )
	        return hidden_states, residual


	class FalconH1ParallelHybrid(nn.Module):
	    """
	    A hybrid decoder layer for FalconH1 where the input is processed
	    in parallel through both the self-attention branch and the SSM (Mamba)
	    branch. Their outputs are scaled, summed and added to the residual,
	    after which a feed-forward block with its own residual is applied.

	    This layer uses:
	    - FalconH1AttentionDecoderLayer for the multi-head self-attention branch.
	    - FalconH1SSMDecoderLayer for the state-space (Mamba) branch.
	    - FalconH1MLP for the feed-forward block.
	    """

	    def __init__(
	        self,
	        config: FalconH1Config,
	        layer_idx: int,
	        cache_config: Optional[CacheConfig] = None,
	        quant_config: Optional[QuantizationConfig] = None,
	        prefix: str = "",
	    ) -> None:
	        """Build both branches, the MLP and the two norms.

	        Args:
	            config: FalconH1 configuration.
	            layer_idx: Index of this layer; accepted but not used here.
	            cache_config: Forwarded to both branches.
	            quant_config: Forwarded to both branches and to the MLP.
	            prefix: Module-name prefix, forwarded to the attention branch.
	        """
	        super().__init__()
	        # Instantiate the attention branch
	        self.self_attn = FalconH1AttentionDecoderLayer(
	            config=config,
	            cache_config=cache_config,
	            quant_config=quant_config,
	            prefix=prefix,
	        )
	        # Instantiate the SSM branch
	        self.mamba = FalconH1SSMDecoderLayer(
	            config=config,
	            cache_config=cache_config,
	            quant_config=quant_config,
	        )
	        self.ssm_out_multiplier = config.ssm_out_multiplier
	        self.ssm_in_multiplier = config.ssm_in_multiplier

	        self.attention_in_multiplier = config.attention_in_multiplier
	        self.attn_out_multiplier = config.attention_out_multiplier

	        # The MLP must receive the same quantization config as the two
	        # branches; otherwise it is always built unquantized.
	        self.feed_forward = FalconH1MLP(config, quant_config=quant_config)

	        self.input_layernorm = RMSNorm(config.hidden_size,
	                                       eps=config.rms_norm_eps)
	        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
	                                        eps=config.rms_norm_eps)

	    def forward(
	        self,
	        positions: torch.Tensor,
	        hidden_states: torch.Tensor,
	        mamba_cache_params: MambaCacheParams,
	        mamba2_metadata: Mamba2Metadata,
	        **kwargs,
	    ):
	        """Apply the parallel attention/SSM mixer followed by the MLP.

	        Extra keyword arguments are forwarded to both branches. Returns the
	        updated hidden states only.
	        """
	        residual = hidden_states
	        hidden_states = self.input_layernorm(hidden_states)
	        # Process input through the attention branch. Both branches read
	        # the same normalized input, each scaled by its own input
	        # multiplier.
	        attn_hidden, _ = self.self_attn(
	            positions=positions,
	            hidden_states=hidden_states * self.attention_in_multiplier,
	            residual=residual,
	            **kwargs,
	        )

	        # Process input through the SSM branch.
	        ssm_hidden, _ = self.mamba(
	            hidden_states=hidden_states * self.ssm_in_multiplier,
	            residual=residual,
	            mamba_cache_params=mamba_cache_params,
	            mamba2_metadata=mamba2_metadata,
	            **kwargs,
	        )
	        # Sum the outputs from both branches.
	        # We assume both branches produce outputs of the same
	        # dimensionality (config.hidden_size).
	        hidden_states = (attn_hidden * self.attn_out_multiplier) + (
	            ssm_hidden * self.ssm_out_multiplier)
	        hidden_states = hidden_states + residual

	        # feed-forward
	        residual = hidden_states
	        hidden_states = self.pre_ff_layernorm(hidden_states)
	        hidden_states = self.feed_forward(hidden_states)
	        hidden_states = residual + hidden_states

	        return hidden_states


	class FalconH1Model(nn.Module):
	    """FalconH1 backbone: token embedding, hybrid layers and final norm.

	    The embedding exists only on the first pipeline rank and the final
	    norm only on the last; other ranks hold ``PPMissingLayer`` placeholders.
	    """

	    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
	        """Build the modules owned by this pipeline rank.

	        Args:
	            vllm_config: Supplies the HF model config plus the cache,
	                quantization and LoRA configs.
	            prefix: Module-name prefix for the layer stack.
	        """
	        super().__init__()
	        config: FalconH1Config = vllm_config.model_config.hf_config
	        cache_config = vllm_config.cache_config
	        quant_config = vllm_config.quant_config
	        lora_config = vllm_config.lora_config

	        self.config = config

	        # Extra vocabulary rows reserved for LoRA adapters, if any.
	        if lora_config:
	            lora_vocab = (lora_config.lora_extra_vocab_size *
	                          (lora_config.max_loras or 1))
	        else:
	            lora_vocab = 0
	        self.vocab_size = config.vocab_size + lora_vocab
	        self.org_vocab_size = config.vocab_size

	        if get_pp_group().is_first_rank:
	            self.embed_tokens = VocabParallelEmbedding(
	                self.vocab_size,
	                config.hidden_size,
	                org_num_embeddings=config.vocab_size,
	            )
	            self.embedding_multiplier = config.embedding_multiplier
	        else:
	            self.embed_tokens = PPMissingLayer()
	            self.embedding_multiplier = 1.0

	        def get_layer(prefix: str):
	            # The layer index is the last dotted component of the prefix.
	            layer_idx = int(prefix.rsplit(".", 1)[1])
	            return FalconH1ParallelHybrid(
	                config,
	                layer_idx,
	                cache_config,
	                quant_config=quant_config,
	                prefix=prefix,
	            )

	        layer_span = make_layers(config.num_hidden_layers,
	                                 get_layer,
	                                 prefix=f"{prefix}.layers")
	        self.start_layer, self.end_layer, self.layers = layer_span

	        self.make_empty_intermediate_tensors = (
	            make_empty_intermediate_tensors_factory(
	                ["hidden_states", "residual"], config.hidden_size))

	        if get_pp_group().is_last_rank:
	            self.final_layernorm = RMSNorm(config.hidden_size,
	                                           eps=config.rms_norm_eps)
	        else:
	            self.final_layernorm = PPMissingLayer()

	    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Look up the (unscaled) token embeddings for ``input_ids``."""
	        return self.embed_tokens(input_ids)

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        positions: torch.Tensor,
	        mamba_cache_params: MambaCacheParams,
	        intermediate_tensors: Optional[IntermediateTensors] = None,
	        inputs_embeds: Optional[torch.Tensor] = None,
	    ) -> torch.Tensor:
	        """Run the layers owned by this pipeline rank.

	        On the first rank the input is ``inputs_embeds`` when given,
	        otherwise the embedding of ``input_ids``; either way it is scaled
	        by ``embedding_multiplier``. Later ranks read ``hidden_states``
	        from ``intermediate_tensors``.

	        Returns:
	            The normalized hidden states on the last rank, otherwise an
	            ``IntermediateTensors`` carrying ``hidden_states``.
	        """
	        # Mamba2 metadata (built from the current attention metadata) is
	        # required for proper continuous batching computation including
	        # chunked prefill.
	        forward_ctx = get_forward_context()
	        mamba2_metadata = prepare_mamba2_metadata(
	            chunk_size=self.config.mamba_chunk_size,
	            attn_metadata=forward_ctx.attn_metadata,
	        )

	        if get_pp_group().is_first_rank:
	            if inputs_embeds is not None:
	                embeds = inputs_embeds
	            else:
	                embeds = self.get_input_embeddings(input_ids)
	            hidden_states = embeds * self.embedding_multiplier
	        else:
	            assert intermediate_tensors is not None
	            hidden_states = intermediate_tensors["hidden_states"]

	        for layer_no in range(self.start_layer, self.end_layer):
	            hidden_states = self.layers[layer_no](
	                positions=positions,
	                hidden_states=hidden_states,
	                mamba_cache_params=mamba_cache_params.at_layer_idx(layer_no),
	                mamba2_metadata=mamba2_metadata,
	            )

	        if get_pp_group().is_last_rank:
	            return self.final_layernorm(hidden_states)
	        return IntermediateTensors({
	            "hidden_states": hidden_states,
	        })


	class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
	                          IsHybrid, SupportsV0Only):
	    """FalconH1 backbone plus LM head, logits processing and Mamba cache.

	    The Mamba state cache is created lazily on the first ``forward`` call.
	    """

	    # Checkpoint sub-module names that are fused into one vLLM parameter.
	    packed_modules_mapping = {
	        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
	        "gate_up_proj": ["gate_proj", "up_proj"],
	    }

	    embedding_modules = {
	        "embed_tokens": "input_embeddings",
	        "lm_head": "output_embeddings",
	    }
	    embedding_padding_modules = ["lm_head"]

	    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
	        """Build the backbone and, on the last pipeline rank, the LM head.

	        Args:
	            vllm_config: Supplies the model, cache, LoRA, scheduler and
	                quantization configs.
	            prefix: Module-name prefix; the backbone lives under
	                ``<prefix>.model``.

	        Raises:
	            AssertionError: If prefix caching is enabled.
	        """
	        # Initialise nn.Module before any attribute is assigned.
	        super().__init__()

	        config = vllm_config.model_config.hf_config
	        self.vllm_config = vllm_config
	        self.model_config = vllm_config.model_config
	        cache_config = vllm_config.cache_config
	        lora_config = vllm_config.lora_config
	        scheduler_config = vllm_config.scheduler_config
	        assert (not cache_config.enable_prefix_caching
	                ), "FalconH1 currently does not support prefix caching"

	        self.quant_config = vllm_config.quant_config

	        self.config = config
	        self.scheduler_config = scheduler_config
	        self.model = FalconH1Model(vllm_config=vllm_config,
	                                   prefix=maybe_prefix(prefix, "model"))
	        self.tie_word_embeddings = config.tie_word_embeddings
	        self.unpadded_vocab_size = config.vocab_size
	        # Used to track and store the Mamba cache between steps; created
	        # on the first forward call.
	        self.mamba_cache: Optional[MambaCacheManager] = None
	        if lora_config:
	            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
	        if get_pp_group().is_last_rank:
	            self.lm_head = ParallelLMHead(
	                self.unpadded_vocab_size,
	                config.hidden_size,
	                org_num_embeddings=config.vocab_size,
	                padding_size=(
	                    DEFAULT_VOCAB_PADDING_SIZE
	                    # We need bigger padding if using lora for kernel
	                    # compatibility
	                    if not lora_config else
	                    lora_config.lora_vocab_padding_size),
	            )
	            self.lm_head_multiplier = config.lm_head_multiplier
	            if self.tie_word_embeddings:
	                self.lm_head = self.lm_head.tie_weights(
	                    self.model.embed_tokens)

	            # Logits are scaled by the LM-head multiplier.
	            self.logits_processor = LogitsProcessor(
	                self.unpadded_vocab_size,
	                config.vocab_size,
	                scale=config.lm_head_multiplier,
	            )
	        else:
	            self.lm_head = PPMissingLayer()

	        self.make_empty_intermediate_tensors = (
	            self.model.make_empty_intermediate_tensors)

	    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Delegate the embedding lookup to the backbone."""
	        return self.model.get_input_embeddings(input_ids)

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        positions: torch.Tensor,
	        intermediate_tensors: Optional[IntermediateTensors] = None,
	        inputs_embeds: Optional[torch.Tensor] = None,
	        **kwargs,
	    ):
	        """Run the backbone, creating the Mamba cache on first use.

	        Extra keyword arguments are passed to
	        ``MambaCacheManager.current_run_tensors``. Returns whatever the
	        backbone returns for this pipeline rank.
	        """
	        if self.mamba_cache is None:
	            if hasattr(self.lm_head, 'weight'):
	                cache_dtype = self.lm_head.weight.dtype
	            else:
	                # No LM-head weight is available (e.g. the placeholder used
	                # on non-last pipeline ranks): follow the configured model
	                # dtype instead of assuming bfloat16, so every rank builds
	                # its cache in the dtype the model actually runs in.
	                cache_dtype = self.model_config.dtype
	            self.mamba_cache = MambaCacheManager(
	                self.vllm_config,
	                cache_dtype,
	                self.config.num_hidden_layers,
	                *self._get_mamba_cache_shape(),
	            )
	        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
	        hidden_states = self.model(
	            input_ids,
	            positions,
	            mamba_cache_params,
	            intermediate_tensors,
	            inputs_embeds,
	        )

	        return hidden_states

	    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
	        """Delegate to the Mamba cache manager (must already exist)."""
	        return self.mamba_cache.copy_inputs_before_cuda_graphs(
	            input_buffers, **kwargs)

	    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
	        """Delegate to the Mamba cache manager (must already exist)."""
	        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)

	    def _get_mamba_cache_shape(
	            self) -> tuple[tuple[int, int], tuple[int, int, int]]:
	        """Return the per-rank ``(conv_state_shape, temporal_state_shape)``.

	        ``conv_state_shape`` is ``(conv_dim / tp, d_conv - 1)`` and
	        ``temporal_state_shape`` is ``(n_heads / tp, d_head, d_state)``.
	        """
	        world_size = get_tensor_model_parallel_world_size()
	        hidden_size = self.config.hidden_size

	        intermediate_size = (int(self.config.mamba_expand *
	                                 hidden_size) if self.config.mamba_d_ssm
	                             is None else self.config.mamba_d_ssm)

	        # if n_groups is not divisible by world_size, need to extend the shards
	        # to ensure all groups needed by a head is sharded along with it
	        n_groups = self.config.mamba_n_groups + extra_groups_for_head_shards(
	            self.config.mamba_n_groups, world_size)

	        # - heads and n_groups are TP-ed
	        conv_dim = intermediate_size + 2 * n_groups * self.config.mamba_d_state
	        conv_state_shape = (
	            divide(conv_dim, world_size),
	            self.config.mamba_d_conv - 1,
	        )

	        # The head dimension is split across TP ranks; d_head and d_state
	        # are kept whole.
	        # e.g., (h_heads, d_head, d_state) = (128, 64, 128)
	        temporal_state_shape = (
	            divide(self.config.mamba_n_heads, world_size),
	            self.config.mamba_d_head,
	            self.config.mamba_d_state,
	        )
	        return conv_state_shape, temporal_state_shape

	    def compute_logits(
	        self,
	        hidden_states: torch.Tensor,
	        sampling_metadata: SamplingMetadata,
	    ) -> Optional[torch.Tensor]:
	        """Project hidden states to logits through the LM head."""
	        logits = self.logits_processor(self.lm_head, hidden_states,
	                                       sampling_metadata)

	        return logits

	    def load_weights(self, weights: Iterable[tuple[str,
	                                                   torch.Tensor]]) -> set[str]:
	        """Load checkpoint tensors into this model's parameters.

	        Args:
	            weights: ``(checkpoint_name, tensor)`` pairs.

	        Returns:
	            The (remapped) names visited by the loader, plus
	            ``lm_head.weight`` when word embeddings are tied.
	        """
	        stacked_params_mapping = [
	            # (param_name, shard_name, shard_id)
	            ("qkv_proj", "q_proj", "q"),
	            ("qkv_proj", "k_proj", "k"),
	            ("qkv_proj", "v_proj", "v"),
	            ("gate_up_proj", "gate_proj", 0),
	            ("gate_up_proj", "up_proj", 1),
	        ]

	        params_dict = dict(self.named_parameters())
	        loaded_params: set[str] = set()
	        for name, loaded_weight in weights:
	            if "rotary_emb.inv_freq" in name:
	                continue

	            # Checkpoint key `A_log` is stored under the parameter name `A`.
	            if "A_log" in name:
	                name = name.replace("A_log", "A")

	            # The SSM branch module (`mamba`) wraps a mixer that is also
	            # named `mamba`, hence the doubled path component.
	            if "mamba" in name:
	                name = name.replace("mamba", "mamba.mamba")

	            for param_name, weight_name, shard_id in stacked_params_mapping:
	                if weight_name not in name:
	                    continue

	                name = name.replace(weight_name, param_name)
	                # Skip loading extra bias for GPTQ models.
	                if name.endswith(".bias") and name not in params_dict:
	                    continue
	                # Skip layers on other devices.
	                if is_pp_missing_parameter(name, self):
	                    continue
	                param = params_dict[name]
	                weight_loader = param.weight_loader
	                weight_loader(param, loaded_weight, shard_id)
	                break
	            else:
	                # Reached when no stacked shard was loaded for this name.
	                # Skip loading extra bias for GPTQ models.
	                if name.endswith(".bias") and name not in params_dict:
	                    continue
	                if is_pp_missing_parameter(name, self):
	                    continue
	                # With tied embeddings the LM head shares the embedding
	                # weight, so its checkpoint entry is not loaded.
	                if self.tie_word_embeddings and "lm_head" in name:
	                    continue

	                param = params_dict[name]
	                weight_loader = getattr(param, "weight_loader",
	                                        default_weight_loader)
	                weight_loader(param, loaded_weight)
	            loaded_params.add(name)

	        if self.tie_word_embeddings:
	            loaded_params.add("lm_head.weight")
	        return loaded_params