|
|
|
|
|
from typing import Optional |
|
|
from functools import partial |
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
from torch import Tensor |
|
|
from torch.distributed import ProcessGroup |
|
|
import torch.distributed as dist |
|
|
|
|
|
|
|
|
|
|
|
import fused_dense_lib as fused_dense_cuda |
|
|
|
|
|
from flash_attn.utils.distributed import reduce_scatter, all_reduce |
|
|
from einops import rearrange |
|
|
|
|
|
|
|
|
from HybridTensor.modules.references.fused_dense import ColumnParallelLinear, RowParallelLinear, fused_mlp_func |
|
|
from HybridTensor.modules.references.MLP import SelectiveMLPTriton |
|
|
from HybridTensor.utils.utils import arg_parser, sparse_index |
|
|
from HybridTensor.utils.profiling import cuda_profiler |
|
|
|
|
|
|
|
|
from HybridTensor.triton.gather_gemm_col import gather_matmul_col |
|
|
from HybridTensor.triton.gather_gemm_row import gather_matmul_row |
|
|
|
|
|
|
|
|
from HybridTensor.triton.heuristics.gather_gemm_col_h import gather_matmul_col as gather_matmul_col_h |
|
|
from HybridTensor.triton.heuristics.gather_gemm_row_h import gather_matmul_row as gather_matmul_row_h |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def SelectiveMLPFunc(x, fc1_w, fc2_w, index_vec, bias1 = None, bias2 = None, activation='relu', use_heuristic=True):
    """Run the two-stage selective MLP using the gather GEMM kernels.

    Only the hidden neurons named in ``index_vec`` are computed: the column
    kernel evaluates fc1 (plus activation) for those rows of ``fc1_w``, and the
    row kernel projects back through the matching rows of ``fc2_w``.

    Args:
        x: 2D input batch (assumes shape (tokens, in_features) — TODO confirm).
        fc1_w: fc1 weight consumed by the column-gather kernel.
        fc2_w: transposed fc2 weight consumed by the row-gather kernel.
        index_vec: indices of the active hidden neurons.
        bias1, bias2: optional biases for the two projections.
        activation: activation name forwarded to the column kernel.
        use_heuristic: pick the autotuned-heuristic kernel variants when True.

    Returns:
        The selective MLP output produced by the row-gather kernel.
    """
    # Choose both kernels up front so the data path below is written once.
    col_fn, row_fn = (
        (gather_matmul_col_h, gather_matmul_row_h)
        if use_heuristic
        else (gather_matmul_col, gather_matmul_row)
    )
    hidden = col_fn(x, fc1_w, index_vec, bias=bias1, activations=activation)
    return row_fn(hidden, fc2_w, index_vec, bias=bias2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MLPRouter(nn.Module):
    """Low-rank two-layer router that scores MLP hidden neurons so the main
    MLP only has to compute the neurons predicted to activate."""

    def __init__(self, embed_dim, low_rank_dim, out_dim, act_th, device=None, dtype=None):
        """
        Initializes the MLPRouter class.

        Args:
            embed_dim (int): Dimensionality of the input embeddings.
            low_rank_dim (int): Dimensionality of the intermediate layer.
            out_dim (int): Number of neurons.
            act_th (float): Threshold used by the threshold-based selection helpers.
            device (torch.device, optional): Device for the parameters.
            dtype (torch.dtype, optional): Dtype for the parameters.
        """
        super().__init__()  # zero-arg super (was legacy two-arg form)
        factory_kwargs = {"device": device, "dtype": dtype}
        self.fc1 = nn.Linear(embed_dim, low_rank_dim, bias=False, **factory_kwargs)
        self.fc2 = nn.Linear(low_rank_dim, out_dim, bias=False, **factory_kwargs)
        self.act_th = act_th
        self.num_neurons = out_dim
        # Sentinel index that sorts after every valid neuron index; used by
        # _select_neurons_cuda_safe to mark "not selected" slots.
        self.largest = self.num_neurons + 1

    def forward(self, x):
        """
        Forward pass of the MLPRouter.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, embed_dim).

        Returns:
            torch.Tensor: Neuron scores of shape (batch_size, num_neurons).
        """
        x = self.fc1(x)
        x = self.fc2(x)
        return x

    def _select_neurons_topk(self, x, topk=None):
        """Return indices of the `topk` neurons with the largest summed
        positive score across the batch (fixed-size output)."""
        neurons = self.forward(x)
        # F.relu instead of allocating a throwaway nn.ReLU module per call.
        neurons_nonzero = F.relu(neurons)
        _, index_vec = neurons_nonzero.sum(dim=0).topk(topk, dim=0, sorted=False)
        return index_vec

    def _select_neurons(self, x, th=None):
        '''
        Threshold based selection of neurons, not CG safe
        '''
        if th is None:
            th = self.act_th

        neurons = self.forward(x)
        # Count, per neuron, how many batch rows exceed the score threshold;
        # any neuron hit at least once is selected (variable-size output).
        activated = (neurons > th).sum(dim=0)
        index_vec = activated.nonzero().flatten()
        return index_vec

    def _select_neurons_cuda_safe(self, x, th=None):
        '''
        This function is used with threshold and is used for CG safe version of the code
        '''
        if th is None:
            th = self.act_th
        neurons = self.forward(x)
        activated = (neurons > th).sum(dim=0)

        indices = torch.arange(self.num_neurons, device=activated.device)
        # NOTE(review): `activated` holds per-neuron *counts* of rows above the
        # score threshold, so reusing `th` here compares a count against a score
        # threshold. For th in (0, 1) this equals `activated > 0` (matching
        # _select_neurons); confirm intent before using th >= 1.
        selected = torch.where(activated > th, indices, torch.full_like(indices, self.largest))

        # Valid indices sort to the front; the sentinel fills the tail so the
        # output shape is static (CUDA-graph capture friendly).
        index_vec, _ = torch.sort(selected)
        index_size = ((index_vec < self.largest).sum()).to(torch.int32)

        return index_size, index_vec
|
|
|
|
|
|
|
|
|
|
|
class ParallelMLPRouter(nn.Module):
    """Tensor-parallel low-rank router that predicts which MLP neurons will
    activate. fc1 is replicated; fc2 is column-sharded across the process
    group, so each rank scores its own slice of the neurons."""

    def __init__(
        self,
        embed_dim,
        low_rank_dim,
        out_dim,
        act_th,
        process_group,
        sequence_parallel=False,
        device=None,
        dtype=None,
    ):
        """Build the two-layer router.

        Args:
            embed_dim (int): Dimensionality of the input embeddings.
            low_rank_dim (int): Dimensionality of the intermediate layer.
            out_dim (int): Output dimensionality (typically number of neurons).
            act_th (float): Activation threshold for `_select_neurons`.
            process_group (torch.distributed.ProcessGroup): Group for tensor parallelism.
            sequence_parallel (bool, optional): Use sequence parallelism. Defaults to False.
            device (torch.device, optional): Device for parameters. Defaults to None.
            dtype (torch.dtype, optional): Parameter dtype. Defaults to None.
        """
        super().__init__()
        assert process_group is not None, "ParallelMHARouter requires a process group."

        factory_kwargs = {"device": device, "dtype": dtype}
        self.embed_dim = embed_dim
        self.act_th = act_th
        self.process_group = process_group

        # Dense low-rank down-projection, replicated on every rank.
        self.fc1 = nn.Linear(embed_dim, low_rank_dim, bias=False, **factory_kwargs)
        # Up-projection sharded over neurons (columns) across the group.
        self.fc2 = ColumnParallelLinear(
            low_rank_dim,
            out_dim,
            process_group,
            bias=False,
            sequence_parallel=sequence_parallel,
            **factory_kwargs,
        )

    def forward(self, x):
        """Score the neurons owned by this rank.

        Args:
            x (torch.Tensor): Input of shape (batch_size, seq_len, embed_dim).

        Returns:
            torch.Tensor: Scores of shape (batch_size, seq_len, out_dim / world_size).
        """
        return self.fc2(self.fc1(x))

    def _select_neurons(self, x, th=None):
        """Threshold-based selection: indices of neurons whose score exceeds
        `th` for at least one batch row (variable-size output)."""
        threshold = self.act_th if th is None else th
        hit_counts = (self.forward(x) > threshold).sum(dim=0)
        return hit_counts.nonzero().flatten()

    def _select_neurons_topk(self, x, topk=None):
        """Top-k selection: indices of the `topk` neurons with the largest
        summed positive score across the batch (fixed-size output)."""
        scores = torch.relu(self.forward(x)).sum(dim=0)
        _, selected = scores.topk(topk, dim=0, sorted=False)
        return selected
|
|
|
|
|
class SelectiveMLP(nn.Module):
    """Two-layer MLP that can run densely or on a caller-selected subset of
    hidden neurons via the gather GEMM kernels.

    On the first selective call, fc2's weight is replaced by a transposed
    contiguous copy (`fc2_weight_t`) expected by the row-gather kernel, and the
    original parameter is released so only one copy stays resident. After that,
    the dense path multiplies by the cached transpose instead of calling fc2.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        activation='relu',
        layer_idx=None,
        bias1=True,
        bias2=True,
        return_residual=False,
        checkpoint_lvl=0,
        use_heuristic=True,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        # Transformer convention: hidden = 4 * d_model, out = d_model.
        out_features = out_features if out_features is not None else in_features
        hidden_features = hidden_features if hidden_features is not None else in_features * 4
        self.return_residual = return_residual
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
        self.activation = activation
        self.activation_fn = nn.ReLU()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)

        # Lazily-built transposed copy of fc2.weight for the gather kernels.
        self.fc2_weight_t = None
        self.use_heuristic = use_heuristic

    def _init_weights(self):
        """Eagerly build the transposed fc2 weight used by the kernels."""
        self.fc2_weight_t = self.fc2.weight.t().contiguous()

    def forward(self, x, index_vec=None, index_size=None):
        """Run the MLP.

        Args:
            x: input tensor; flattened to 2D on the selective path.
            index_vec: indices of active hidden neurons; None = dense path.
            index_size: unused here (kept for interface compatibility).

        Returns:
            Output tensor, or (output, x) when `return_residual` is set.
        """
        if index_vec is not None:
            # First selective call: cache fc2^T and release the original
            # parameter so only one copy of the weight stays in memory.
            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()
                self.fc2.weight = None
                del self.fc2._parameters['weight']

            x = x.view(-1, x.size(-1))
            y = SelectiveMLPFunc(x=x, fc1_w=self.fc1.weight,
                                 fc2_w=self.fc2_weight_t, index_vec=index_vec,
                                 bias1=self.fc1.bias, bias2=self.fc2.bias,
                                 activation=self.activation, use_heuristic=self.use_heuristic)
        else:
            y = self.fc1(x)
            y = self.activation_fn(y)

            if self.fc2_weight_t is not None:
                y = torch.matmul(y, self.fc2_weight_t)
                # Bug fix: this path used to drop fc2's bias, so dense outputs
                # silently changed after _init_weights() or a selective call.
                if self.fc2.bias is not None:
                    y = y + self.fc2.bias
            else:
                y = self.fc2(y)

        return y if not self.return_residual else (y, x)
|
|
|
|
|
class ParallelSelectiveMLP(nn.Module):
    # Tensor-parallel MLP with an optional "selective" path: when an index_vec
    # of active hidden neurons is supplied, the gather GEMM kernels compute
    # only those neurons; otherwise the fused dense MLP kernel is used.
    def __init__(
        self,
        in_features,
        hidden_features,
        out_features=None,
        activation="relu",
        layer_idx=None,
        process_group: ProcessGroup = None,
        bias1=True,
        bias2=True,
        return_residual=False,
        sequence_parallel=False,
        use_heuristic=True,
        checkpoint_lvl=0,
        heuristic="auto",
        device=None,
        dtype=None,
    ):
        """
        process_group is required. We're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
        """
        assert checkpoint_lvl in [0, 1, 2]
        assert activation in ["gelu_approx", "relu"]
        assert process_group is not None

        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        if out_features is None:
            out_features = in_features
        self.activation = activation
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel
        self.checkpoint_lvl = checkpoint_lvl
        self.heuristic = heuristic
        # fc1 sharded over hidden neurons (columns); fc2 sharded over rows,
        # so partial outputs must be reduced across the group after fc2.
        self.fc1 = ColumnParallelLinear(
            in_features, hidden_features, process_group, bias=bias1, **factory_kwargs
        )
        self.fc2 = RowParallelLinear(
            hidden_features, out_features, process_group, bias=bias2, **factory_kwargs
        )
        self.layer_idx = layer_idx

        # NOTE(review): register_buffer returns None, and the buffer name
        # "fc2_weigth_t" is misspelled, so this line registers an unused None
        # buffer and assigns None to the attribute — it is a no-op beyond the
        # stray buffer. The attribute is re-assigned None just below anyway.
        self.fc2_weight_t = self.register_buffer("fc2_weigth_t", None)
        self.return_residual = return_residual
        # Lazily-built transposed copy of fc2.weight for the gather kernels.
        self.fc2_weight_t = None
        self.use_heuristic = use_heuristic
        # NOTE(review): self.reduce_fn is stored but forward() below calls
        # dist.all_reduce directly; only sp_forward recomputes this choice.
        self.reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce

    def _init_weights(self):
        # Eagerly build the transposed fc2 weight used by the gather kernels.
        self.fc2_weight_t = self.fc2.weight.t().contiguous()

    def forward(self, x, residual = None, index_vec = None):
        # Selective path: only the neurons in index_vec are computed.
        if index_vec is not None:

            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()

            # Flatten (batch, seq, d) -> (tokens, d) for the 2D gather kernels.
            x = x.view(-1, x.size(-1))

            out = SelectiveMLPFunc(x = x, fc1_w = self.fc1.weight,
                            fc2_w = self.fc2_weight_t, index_vec = index_vec,
                            bias1 = self.fc1.bias, bias2 = self.fc2.bias,
                            activation=self.activation, use_heuristic=self.use_heuristic)

        else:
            # Dense path: resolve the fused-kernel heuristic, then call the
            # fused MLP from flash-attn's fused_dense.
            if self.heuristic == "auto":
                dtype = (
                    x.dtype
                    if not torch.is_autocast_enabled()
                    else torch.get_autocast_gpu_dtype()
                )
                if self.activation == "gelu_approx":
                    cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
                    heuristic = (
                        0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1)
                    )
                else:
                    heuristic = 0
            else:
                heuristic = self.heuristic
            out = fused_mlp_func(
                x,
                self.fc1.weight,
                self.fc2.weight,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
                save_pre_act=self.training,
                checkpoint_lvl=self.checkpoint_lvl,
                heuristic=heuristic,
                process_group=self.process_group,
                sequence_parallel=self.sequence_parallel,
            )

        # Sum the partial fc2 outputs across ranks.
        # NOTE(review): fused_mlp_func already receives process_group /
        # sequence_parallel — confirm it does not reduce internally, otherwise
        # the dense path is reduced twice.
        if self.process_group.size() > 1:

            dist.all_reduce(out, op=dist.ReduceOp.SUM, group=self.process_group)

        return out if not self.return_residual else (out, x)

    def sp_forward(self, x, residual = None, index_vec = None):
        # Variant of forward that overlaps the next layer's sparse-predictor
        # (router) computation with the MLP reduction on a side stream.
        # NOTE(review): this method reads self.sp_router, self.sp_stream,
        # self.sp, self.event_mlp and self.event_mlp_sp, none of which are set
        # in __init__ — they are presumably attached externally; verify before
        # calling, otherwise this raises AttributeError.
        if self.heuristic == "auto":
            dtype = (
                x.dtype
                if not torch.is_autocast_enabled()
                else torch.get_autocast_gpu_dtype()
            )
            if self.activation == "gelu_approx":
                cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
                heuristic = (
                    0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1)
                )
            else:
                heuristic = 0
        else:
            heuristic = self.heuristic
        curr_stream = torch.cuda.current_stream()
        # Decoding (single-token) step is detected by seq_len == 1.
        do_token_generation = x.size(1) == 1

        # NOTE(review): `!= None` should idiomatically be `is not None`.
        if index_vec != None:
            # Selective path is only supported during single-token decoding.
            assert x.size(1) == 1

            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()

            out = SelectiveMLPFunc(
                rearrange(x, "b 1 d -> b d"),
                self.fc1.weight,
                self.fc2_weight_t,
                index_vec,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
            )
            out = rearrange(out, "b d -> b 1 d")
        else:
            out = fused_mlp_func(
                x,
                self.fc1.weight,
                self.fc2.weight,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
                save_pre_act=self.training,
                checkpoint_lvl=self.checkpoint_lvl,
                heuristic=heuristic,
                process_group=self.process_group,
                sequence_parallel=self.sequence_parallel,
            )

        reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
        # Mark the point where the MLP compute is done so the router stream
        # can start without waiting for the reduction below.
        if self.sp_router:
            curr_stream.record_event(self.event_mlp)

        out = reduce_fn(out, self.process_group)

        if self.sp_router:
            # Run the next layer's router on a side stream, overlapped with
            # the collective above; mlp_logit is presumably consumed via
            # state stored on self.sp — TODO confirm.
            with torch.cuda.stream(self.sp_stream):
                self.sp_stream.wait_event(self.event_mlp)
                if do_token_generation:
                    mlp_logit = self.sp(rearrange(residual, "b 1 d -> b d"))
                self.sp_stream.record_event(self.event_mlp_sp)

            # Re-join: main stream waits for the router before returning.
            curr_stream.wait_event(self.event_mlp_sp)

        return out
|
|
|
|
|
class SimpleMLP(nn.Module):
    """Plain two-layer MLP used as a dense reference implementation.

    Args:
        in_features (int): Input dimensionality.
        hidden_features (int): Hidden-layer width.
        out_features (int): Output dimensionality.
        bias (bool): Whether both linear layers carry a bias. Defaults to False.
        activation (str): "relu" (default) or "gelu_approx".
    """

    def __init__(self, in_features, hidden_features, out_features, bias=False, activation="relu"):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.activation = activation

    def forward(self, x):
        """Apply fc1 -> activation -> fc2."""
        x = self.fc1(x)
        # Bug fix: `activation` used to be stored but ignored — ReLU was
        # always applied regardless of the constructor argument.
        if self.activation == "relu":
            x = F.relu(x)
        elif self.activation == "gelu_approx":
            # tanh approximation, matching the "gelu_approx" naming used by
            # the other MLP classes in this file.
            x = F.gelu(x, approximate="tanh")
        else:
            raise ValueError(f"unsupported activation: {self.activation}")
        return self.fc2(x)
|
|
|
|
|
if __name__ == "__main__":
    # Benchmark script: compares the selective MLP (router + gather kernels)
    # against the dense forward, then sanity-checks CUDA-graph capture of the
    # selective path. Requires a CUDA device and fp16 inputs.
    args = arg_parser()

    bias = True if args.bias > 0 else False
    x = torch.randn(args.batch_size, args.in_features, device="cuda", dtype=torch.float16)
    # Random active-neuron indices out of the 4x hidden dimension.
    index_vec, _ = sparse_index(args.index_size, args.in_features*4)

    '''
    selective_mlp = SelectiveMLPTriton(args.in_features, args.hidden_features, bias=bias, device="cuda", dtype=torch.float16, activation="relu")

    out, mlp_time = cuda_profiler(selective_mlp, x, index_vec)

    out_col, col_time = cuda_profiler(gather_matmul_col, x, selective_mlp.fc1_w, index_vec, activations=selective_mlp.activation)
    out_row, row_time = cuda_profiler(gather_matmul_row, out_col, selective_mlp.fc2_w, index_vec)
    sum_time = col_time + row_time

    print(f"Index size {args.index_size}, Activated {args.index_size/(args.in_features * 4)*100}% neurons")

    print(f"Gather Col Time: {col_time} ms")
    print(f"Gather Row Time: {row_time} ms")
    # print(f"Sum Time: {sum_time} ms")

    print(f"SelectiveMLP Time: {mlp_time} ms")
    '''

    in_features = args.in_features
    hidden_features = in_features * 4
    out_features = in_features
    device = torch.device("cuda")

    model = SelectiveMLP(
        in_features, hidden_features, out_features, device=device, dtype=torch.float16, activation="relu", use_heuristic=True
    ).to(device)

    # Router predicts which of the hidden neurons to keep.
    router = MLPRouter(in_features, 1024, hidden_features, act_th = 0.5, device=device, dtype=torch.float16).to(device)

    def warmup():
        # Trigger lazy weight transposition and kernel autotuning/compilation
        # before timing anything.
        for _ in range(10):
            _ = model(x, index_vec)
            _ = model(x, None)
            _ = router._select_neurons_topk(x, args.index_size)

    warmup()

    # Timings (cuda_profiler returns (output, elapsed_ms)).
    _, router_time = cuda_profiler(router._select_neurons_topk, x, args.index_size)
    _, selective_time = cuda_profiler(model, x, index_vec)

    _, dense_time = cuda_profiler(model, x, None)

    print(f"Router time per run: {router_time:.6f} ms")
    print(f"SelectiveMLPFunc time per run: {selective_time:.6f} ms")
    print(f"Dense forward time per run: {dense_time:.6f} ms")
    print(f"Speedup: {dense_time / selective_time:.2f}x")
    router_selective_time = router_time + selective_time
    print(f"Router + SelectiveMLPFunc time per run: {router_selective_time:.6f} ms")
    print(f"Speedup: {dense_time / router_selective_time:.2f}x")

    print("\n=== CUDA Graph Tests ===")

    print("Testing CUDA Graph for Selective forward (with index_vec)...")
    # Static tensors: CUDA graphs replay against fixed memory addresses.
    static_x = x.clone()
    static_index_vec = index_vec.clone()

    # One eager call before capture so all lazy allocations happen outside
    # the graph.
    static_out_sel = model(static_x, index_vec=static_index_vec)
    torch.cuda.synchronize()

    # Capture must happen on a non-default stream.
    capture_stream = torch.cuda.Stream()
    with torch.cuda.stream(capture_stream):
        g_sel = torch.cuda.CUDAGraph()
        g_sel.capture_begin()
        static_out_sel = model(static_x, index_vec=static_index_vec)
        g_sel.capture_end()
    torch.cuda.synchronize()

    # Replay once and compare against an eager run on the same inputs.
    g_sel.replay()
    torch.cuda.synchronize()
    cuda_sel_out = static_out_sel.clone()
    regular_sel_out = model(x, index_vec=index_vec)
    if torch.allclose(cuda_sel_out, regular_sel_out, atol=1e-3):
        print("Selective forward CUDA Graph output matches regular output")
    else:
        print("Selective forward CUDA Graph output does NOT match regular output")

    def replay_sel():
        # Thin wrapper so cuda_profiler can time graph replay alone.
        g_sel.replay()

    _, selective_time_cuda = cuda_profiler(replay_sel)
    print(f"Selective forward CUDA Graph time per run: {selective_time_cuda:.6f} ms")