| """ |
| |
| Based on the tinyllama implementation: https://github.com/jzhang38/TinyLlama |
| |
| """ |
|
|
|
|
| import math, random |
| import numpy as np |
| from typing import Any, List, Optional, Tuple |
| from typing_extensions import Self |
|
|
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
|
|
| from lightning_utilities.core.imports import RequirementCache |
| FlashAttention2Available = RequirementCache("flash-attn>=2.0.0.post1") |
|
|
| from flash_attn import flash_attn_func |
| from xformers.ops import SwiGLU |
| from einops import rearrange |
|
|
|
|
| from transformers import PreTrainedModel |
| from .model_config import YingLongConfig |
|
|
| |
| |
| |
| class Tokenizer(torch.nn.Module): |
| def __init__(self, config: YingLongConfig, *args,**kwargs) -> None: |
| super().__init__() |
| |
| self.config = config |
| self.tokenizer = nn.Linear(config.patch_size,self.config.n_embd) |
| |
| self.patch_size = config.patch_size |
| self.mask0 = nn.Linear(1,config.n_embd) |
| |
| self.register_buffer('mask_token', torch.zeros(1000)) |
| if self.config.haar_trans: |
| |
| self.register_buffer('haar_transform',torch.Tensor(haarMatrix(self.config.patch_size,normalized = self.config.haar_trans_norm))) |
|
|
| |
| |
| def forward(self,x, |
| future_token = 0, |
| prev_token = 0, |
| factor = 0.2, |
| sequential = False, |
| *args, **kwargs): |
| |
| |
| b = x.shape[0] |
| |
| x_raw = rearrange(x, "b (l c) -> b l c", c = self.patch_size) |
| x_raw_0 = x_raw.detach().clone() |
| |
| if future_token == 0: |
| if not sequential: |
| masks = torch.randperm(x_raw.shape[1]) |
| unmasks,masks = masks[:int(x_raw.shape[1]*factor)],masks[int(x_raw.shape[1]*factor):] |
| else: |
| masks = [_ for _ in range(x_raw.shape[1])] |
| factor = np.random.rand()*0.6 + 0.2 |
| unmasks,masks = masks[:int(x_raw.shape[1]*factor)],masks[int(x_raw.shape[1]*factor):] |
| |
| |
| |
| x_raw_remains = x_raw[:,unmasks,:] |
| |
| mean = x_raw_remains.mean(dim = (-2,-1),keepdims = True) |
| std = x_raw_remains.std(dim = (-2,-1),keepdims = True) |
| x_raw = (x_raw - mean)/ (std + 1e-4) |
| |
| |
| if self.config.haar_trans: |
| x_featured = torch.einsum('blc,ac->bla',x_raw,self.haar_transform) |
| x_featured = self.tokenizer(x_featured) |
| else: |
| x_featured = self.tokenizer(x_raw) |
| |
| |
| x_featured[:,masks,:] = self.mask0(self.mask_token[0].unsqueeze(0)) |
| |
| |
| |
| else: |
| |
| factor = 1 |
| more_rows = future_token // self.patch_size + 1 |
| prev_more_rows = prev_token // self.patch_size + 1 |
| |
| mean = x_raw[:,prev_more_rows:-more_rows,:].mean(dim = (-2,-1),keepdims = True) |
| std = x_raw[:,prev_more_rows:-more_rows,:].std(dim = (-2,-1),keepdims = True) |
| x_raw = (x_raw - mean)/ (std + 1e-4) |
| |
| |
| if self.config.haar_trans: |
| x_featured = torch.einsum('blc,ac->bla',x_raw,self.haar_transform) |
| x_featured = self.tokenizer(x_featured) |
| else: |
| x_featured = self.tokenizer(x_raw) |
| |
| |
| masks = [jj for jj in range(x_featured.shape[1])] |
| masks = masks[-more_rows:] |
| |
| x_featured[:,-more_rows:] = self.mask0(self.mask_token[:len(masks)].unsqueeze(-1)).repeat(x_featured.shape[0],1,1) |
| x_featured[:,:prev_more_rows] = self.mask0(self.mask_token[:prev_more_rows].unsqueeze(-1)).repeat(x_featured.shape[0],1,1) |
|
|
|
|
| return x_featured, x_raw_0, masks, mean, std, x_raw |
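| |
| # --- Hedged usage sketch (illustration only, not from the original repo) --- |
| # Assuming a YingLongConfig with patch_size=32 and n_embd=1024, the tokenizer |
| # splits a (batch, length) series into length // patch_size patches, normalizes |
| # them with the mean/std of the *unmasked* patches, and swaps masked patches |
| # for a learned mask embedding: |
| # |
| # tok = Tokenizer(config) |
| # x = torch.randn(4, 2048) # 4 series of length 2048 |
| # feats, raw, masks, mean, std, _ = tok(x) |
| # feats.shape -> (4, 2048 // config.patch_size, config.n_embd) |
| # raw.shape -> (4, 2048 // config.patch_size, config.patch_size) |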
| |
| |
| |
| class model_tmp(PreTrainedModel): |
| config_class = YingLongConfig |
| base_model_prefix = "model" |
| |
| |
| |
| def _init_weights(self, module: nn.Module) -> None: |
| if isinstance(module, nn.Embedding): |
| torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd)) |
| elif isinstance(module, nn.Linear): |
| torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd)) |
| if module.bias is not None: |
| torch.nn.init.zeros_(module.bias) |
| for name, p in module.named_parameters(): |
| if (name == "proj.weight" and isinstance(module, LLaMAMLP)) or (name == "w3.weight" and isinstance(module, SwiGLU) or (name=="proj.weight" and isinstance(module, BidirectedlSelfAttention))): |
| nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / self.config.n_layer) |
| |
| |
| |
| |
| |
| |
| |
| class GPT(model_tmp): |
| def __init__(self, config: YingLongConfig, *args,**kwargs) -> None: |
| |
| |
| super().__init__(config) |
| |
| self.config = config |
| self.patch_size = config.patch_size |
| self.unet = config.unet |
| |
| |
| if self.config._norm_class == "RMSNorm": |
|
|
| self.config.norm_class = RMSNorm |
| elif self.config._norm_class == "FusedRMSNorm": |
| self.config.norm_class = FusedRMSNorm |
| elif self.config._norm_class == 'BatchNorm': |
| self.config.norm_class = iBatchNorm |
|
|
| |
| if self.config._mlp_class == "GptNeoxMLP": |
| self.config.mlp_class = GptNeoxMLP |
| elif self.config._mlp_class == "LLaMAMLP": |
| self.config.mlp_class = LLaMAMLP |
| |
| |
|
|
| |
| self.tokenizer = Tokenizer(config) |
|
|
| |
| self.lm_head = nn.Linear(config.n_embd, 99*self.patch_size) |
| |
|
|
| self.quantitleLoss = quantitleLoss(99,patch_size = self.patch_size) |
| |
| |
| |
| if self.unet: |
| assert config.n_layer%2 == 0 |
| self.unet_projection = nn.ModuleList(nn.Sequential(nn.Linear(config.n_embd*2,config.n_embd), |
| config.norm_class(config.n_embd, eps=config.norm_eps), |
| ) |
| for _ in range(config.n_layer//2) |
| ) |
| self.unet_merge = nn.ModuleList(nn.Sequential(nn.Linear(config.n_embd*2,config.n_embd), |
| config.norm_class(config.n_embd, eps=config.norm_eps), |
| ) |
| for _ in range(config.n_layer//2) |
| ) |
| |
| |
| |
| self.transformer = nn.ModuleDict(dict(h = nn.ModuleList(Block(config) |
| for _ in range(config.n_layer)) |
| ) |
| ) |
| |
| |
| |
| self.rope_cache = None |
|
|
|
|
|
|
| def forward( |
| self, idx: torch.Tensor, |
| future_token: int = 0, |
| prev_token: int = 0, |
| *args,**kwargs, |
| ) -> torch.Tensor: |
| |
| if future_token > 0: |
| more_rows = future_token // self.patch_size + 1 |
| idx = torch.cat((idx,torch.zeros(idx.shape[0],more_rows*self.patch_size).to(idx.device)),dim = -1).bfloat16() |
| if prev_token > 0: |
| more_rows = prev_token // self.patch_size + 1 |
| idx = torch.cat((torch.zeros(idx.shape[0],more_rows*self.patch_size).to(idx.device),idx),dim = -1).bfloat16() |
| |
| B, T = idx.size() |
| |
|
|
|
|
| block_size = self.config.block_size |
| max_seq_length = T |
| |
| assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}" |
| |
|
|
| self.rope_cache = self.build_rope_cache(idx) |
| cos, sin = self.rope_cache |
|
|
| cos = cos[:max(T,1024)] |
| sin = sin[:max(T,1024)] |
| |
|
|
| |
| |
| x,x_raw,masks,mean,std,_ = self.tokenizer(idx, future_token =future_token,prev_token = prev_token) |
| |
| |
|
|
| if self.unet: |
| skips = [] |
| |
| |
| |
|
|
| for block_idx in range(len( self.transformer.h)): |
|
|
|
|
| block = self.transformer.h[block_idx] |
|
|
| if self.unet and block_idx >=len(self.transformer.h) //2: |
| x = self.unet_projection[block_idx - len(self.transformer.h) //2](torch.cat((skips.pop(),x),dim = -1)) |
|
|
| x = block(x, (cos, sin), max_seq_length) |
|
|
| if self.unet and block_idx <len(self.transformer.h) //2: |
| skips.append(x) |
| x_delay = torch.cat((x[:,0,:].unsqueeze(1),x[:,:-1,:]),dim = 1) |
| x = self.unet_merge[block_idx](torch.cat((x_delay,x),dim = -1)) |
|
|
| |
| |
| |
| res = self.lm_head(x) |
|
|
| |
| |
| res = rearrange(res,'b c (l1 l2) -> b c l1 l2', l2 = 99) |
| |
|
|
| |
| if self.config.haar_trans_inv: |
| res = torch.einsum('bcal,ad->bcdl',res,self.tokenizer.haar_transform) |
| if self.config.haar_trans_norm == "backward": |
| res = res / np.sqrt(res.shape[-2]) |
| elif self.config.haar_trans_norm == "forward": |
| res = res * np.sqrt(res.shape[-2]) |
|
|
| |
| |
| |
| |
| res = res * (std.unsqueeze(-1) + 1e-4) + mean.unsqueeze(-1) |
| |
| |
| |
| |
| if future_token == 0: |
| return res[:,masks,:,:], x_raw[:,masks,:] |
| else: |
| return res[:,masks,:,:] |
| |
| def generate(self,*args,**kwargs): |
| res = self.forward(*args,**kwargs) |
| res = rearrange(res, 'b l c d -> b (l c) d') |
| return res[:,:kwargs['future_token'],:] |
|
|
|
|
| |
| @classmethod |
| def from_name(cls, name: str, **kwargs: Any) -> Self: |
| return cls(YingLongConfig.from_name(name, **kwargs)) |
|
|
| def build_rope_cache(self, idx: torch.Tensor) : |
| return build_rope_cache( |
| seq_len=self.config.block_size, |
| n_elem=int(self.config.rotary_percentage * self.config.head_size), |
| dtype=torch.bfloat16, |
| device=idx.device, |
| base = self.config.rope_base, |
| condense_ratio=self.config.condense_ratio, |
| ) |
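| |
| # --- Hedged usage sketch (illustration only, not from the original repo) --- |
| # `generate` pads the input with future_token placeholder values, runs the |
| # masked-patch forward pass, and returns per-quantile predictions for the |
| # padded horizon. Assuming a loaded model `gpt` and a batch `x` of shape |
| # (batch, length): |
| # |
| # preds = gpt.generate(x, future_token=96) # (batch, 96, 99) quantiles |
| # median = preds[..., preds.shape[-1] // 2] # 50th-percentile forecast |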
|
|
|
|
| class Block(nn.Module): |
| def __init__(self, config:YingLongConfig) -> None: |
| super().__init__() |
| self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps) |
| self.attn = BidirectedlSelfAttention(config) |
| if not config.shared_attention_norm: |
| self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps) |
| self.mlp = config.mlp_class(config) |
| self.config = config |
| def forward( |
| self, |
| x: torch.Tensor, |
| rope: Optional[Tuple[torch.Tensor, torch.Tensor]], |
| max_seq_length: int, |
| mask: Optional[torch.Tensor] = None, |
| input_pos: Optional[torch.Tensor] = None, |
| ) -> torch.Tensor: |
|
|
| n_1 = self.norm_1(x) |
| h = self.attn(n_1, rope, max_seq_length, mask, input_pos) |
| if self.config.parallel_residual: |
| n_2 = n_1 if self.config.shared_attention_norm else self.norm_2(x) |
| x = x + h + self.mlp(n_2) |
| else: |
| if self.config.shared_attention_norm: |
| raise NotImplementedError( |
| "No checkpoint amongst the ones we support uses this configuration" |
| " (non-parallel residual and shared attention norm)." |
| ) |
| |
| x = x + h |
| x = x + self.mlp(self.norm_2(x)) |
| return x |
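| |
| # Hedged note: with parallel_residual the attention and MLP branches are both |
| # computed from the normalized input and added to the residual stream in one |
| # step (GPT-NeoX style); otherwise the block uses the standard pre-norm order |
| # x = x + attn(norm_1(x)) followed by x = x + mlp(norm_2(x)). |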
|
|
|
|
| class BidirectedlSelfAttention(nn.Module): |
| def __init__(self, config:YingLongConfig) -> None: |
| super().__init__() |
| shape = (config.n_head + 2 * config.n_query_groups) * config.head_size |
| self.attn = nn.Linear(config.n_embd, shape, bias=config.bias) |
| self.proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias) |
| self.config = config |
|
|
| def forward( |
| self, |
| x: torch.Tensor, |
| rope: Tuple[torch.Tensor, torch.Tensor], |
| max_seq_length: int, |
| mask: Optional[torch.Tensor] = None, |
| input_pos: Optional[torch.Tensor] = None, |
| ) -> torch.Tensor: |
| |
| |
| B, T, C = x.size() |
|
|
| qkv = self.attn(x) |
|
|
| |
| q_per_kv = self.config.n_head // self.config.n_query_groups |
| total_qkv = q_per_kv + 2 |
| qkv = qkv.view(B, T, self.config.n_query_groups, total_qkv, self.config.head_size) |
|
|
|
|
| |
| q, k, v = qkv.split((q_per_kv, 1, 1), dim=-2) |
|
|
| q = q.reshape(B, T, -1, self.config.head_size) |
| k = k.reshape(B, T, -1, self.config.head_size) |
| v = v.reshape(B, T, -1, self.config.head_size) |
|
|
| cos, sin = rope |
|
|
| q = apply_rotary_emb_func(q, cos, sin, False, True) |
| k = apply_rotary_emb_func(k, cos, sin, False, True) |
|
|
|
|
| y = self.scaled_dot_product_attention(q, k, v, mask=mask) |
|
|
| y = y.reshape(B, T, C) |
|
|
| |
| y = self.proj(y) |
|
|
| return y |
| |
| |
|
|
| def scaled_dot_product_attention( |
| self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask: Optional[torch.Tensor] = None |
| ): |
| scale = 1.0 / math.sqrt(self.config.head_size) |
| |
| if ( |
| FlashAttention2Available |
| and mask is None |
| and q.device.type == "cuda" |
| and q.dtype in (torch.float16, torch.bfloat16) |
| ): |
|
|
| return flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=scale, causal=False) |
| q = q.transpose(1, 2) |
| k = k.transpose(1, 2) |
| v = v.transpose(1, 2) |
| if q.size() != k.size(): |
| k = k.repeat_interleave(q.shape[1]//k.shape[1], dim=1) |
| v = v.repeat_interleave(q.shape[1]//v.shape[1], dim=1) |
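| # Grouped-query attention: when there are fewer key/value heads than query |
| # heads (n_query_groups < n_head), the k/v heads are repeated to match the |
| # query head count before falling back to PyTorch's SDPA. Attention is |
| # non-causal here, since the model attends bidirectionally over patches. |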
| y = torch.nn.functional.scaled_dot_product_attention( |
| q, k, v, attn_mask=mask, dropout_p=0.0, scale=scale, is_causal=False |
| ) |
| return y.transpose(1, 2) |
|
|
|
|
| |
| |
| |
| |
| class quantitleLoss(torch.nn.Module): |
| def __init__(self, |
| qSize = 99, |
| patch_size = 16, |
| *args,**kwargs): |
| |
| super().__init__() |
| self.qSize = qSize |
| self.patch_size = patch_size |
| |
| |
| q = np.array([i+1 for i in range(self.qSize)]) |
| q = q / (self.qSize + 1) |
| q = q.reshape((1,1,-1)) |
| |
| q_variance = q*(1-q) |
| |
| self.register_buffer('q', torch.tensor(q)) |
| self.register_buffer('q_variance', torch.tensor(q_variance)) |
|
|
| |
| def forward(self, input: torch.Tensor, target: torch.Tensor,rel_loss = False): |
| |
| |
| |
| target = target.unsqueeze(-1) |
| input = input[:,:target.shape[1],:,:] |
| |
| |
| posPart = input - target |
| negPart = -posPart |
| |
| raw_loss = torch.maximum(self.q * negPart, (1-self.q) * posPart) |
|
|
| target_absmean = torch.mean(target.abs(),dim = (1,2),keepdims = True) |
| raw_loss = raw_loss / torch.sqrt(self.q_variance) / (target_absmean + 1e-4) |
| |
| return torch.mean(raw_loss) |
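| |
| # --- Hedged worked example (illustration only) --- |
| # For a single quantile q and error e = prediction - target, the pinball loss |
| # above is max(q * (-e), (1 - q) * e). With q = 0.9, over-predicting by 1.0 |
| # costs 0.1 while under-predicting by 1.0 costs 0.9, which pushes the |
| # 0.9-quantile head above most observations. Each quantile's loss is then |
| # rescaled by 1 / sqrt(q * (1 - q)) and by the mean absolute target value. |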
| |
|
|
| def haarMatrix_unnormalized(n): |
| """Recursively build an unnormalized Haar matrix of size n (rounded up to a power of two).""" |
| n = 2**np.ceil(np.log2(n)) |
| if n > 2: |
| h = haarMatrix_unnormalized(int(n) // 2) |
| else: |
| return np.array([[1, 1], [1, -1]]) |
| h_n = np.kron(h, [1, 1]) |
| h_i = np.kron(np.eye(len(h)), [1, -1]) |
| h = np.vstack((h_n, h_i)) |
| return h |
|
|
| def haarMatrix(n,normalized = 'ortho'): |
| h = haarMatrix_unnormalized(n) |
| scaler = np.diag(1/np.sqrt(np.diag(h@h.transpose()))) |
| if normalized == 'ortho': |
| return scaler @ h |
| elif normalized == 'forward': |
| return scaler @ h/ np.sqrt(n) |
| |
| else: |
| return scaler @ h * np.sqrt(n) |
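| |
| # Hedged sanity note (illustration only): haarMatrix(4, normalized='ortho') |
| # has orthonormal rows, so H @ H.T is (numerically) the 4x4 identity; the |
| # inverse transform in GPT.forward is then just a contraction with H.T, plus |
| # the sqrt(n) rescaling used for the 'forward'/'backward' conventions. |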
|
|
|
|
| |
| class GptNeoxMLP(nn.Module): |
| def __init__(self, config:YingLongConfig) -> None: |
| super().__init__() |
| self.fc = nn.Linear(config.n_embd, config.intermediate_size, bias=config.bias) |
| self.proj = nn.Linear(config.intermediate_size, config.n_embd, bias=config.bias) |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| x = self.fc(x) |
| x = torch.nn.functional.gelu(x) |
| return self.proj(x) |
|
|
|
|
| class LLaMAMLP(nn.Module): |
| def __init__(self, config:YingLongConfig) -> None: |
| super().__init__() |
|
|
| self.swiglu = SwiGLU(config.n_embd,config.intermediate_size, bias=False, _pack_weights=False) |
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| return self.swiglu(x) |
|
|
|
|
| def build_rope_cache( |
| seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000, condense_ratio: int = 1 |
| ) -> Tuple[torch.Tensor,torch.Tensor]: |
| """Enhanced Transformer with Rotary Position Embedding. |
| |
| Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ |
| transformers/rope/__init__.py. MIT License: |
| https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. |
| """ |
| |
| theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device) / n_elem)) |
|
|
| |
| seq_idx = torch.arange(seq_len, device=device) / condense_ratio |
|
|
| |
| idx_theta = torch.outer(seq_idx, theta) |
|
|
| cos, sin = torch.cos(idx_theta), torch.sin(idx_theta) |
|
|
| |
| if dtype == torch.bfloat16: |
| return cos.bfloat16(), sin.bfloat16() |
| |
| if dtype in (torch.float16, torch.bfloat16, torch.int8): |
| return cos.half(), sin.half() |
| return cos, sin |
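| |
| # Hedged shape note: for seq_len = S and n_elem = rotary_percentage * head_size, |
| # cos and sin each have shape (S, n_elem // 2); apply_rotary_emb_func later |
| # slices them to the actual sequence length and broadcasts over batch and heads. |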
|
|
|
|
| def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: |
| head_size = x.size(-1) |
| x1 = x[..., : head_size // 2] |
| x2 = x[..., head_size // 2 :] |
| rotated = torch.cat((-x2, x1), dim=-1) |
| roped = (x * cos) + (rotated * sin) |
| return roped.type_as(x) |
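| # Note: apply_rope is an eager rotate-half reference implementation and appears |
| # unused here; the attention layer applies RoPE through the fused |
| # apply_rotary_emb_func defined near the end of this file. |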
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| |
| |
| |
|
|
|
|
| |
| |
|
|
| import dropout_layer_norm |
| from torch.nn import init |
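| |
| # The helpers below wrap the `dropout_layer_norm` CUDA extension (built as part |
| # of the flash-attn project) to provide fused dropout + residual-add + |
| # LayerNorm/RMSNorm. Only the RMSNorm path is exercised by this model, through |
| # FusedRMSNorm further down. |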
|
|
|
|
| def maybe_align(x, alignment_in_bytes=16): |
| """Assume that x already has last dim divisible by alignment_in_bytes""" |
| |
| |
| return x if x.data_ptr() % alignment_in_bytes == 0 else x.clone() |
|
|
|
|
| def _dropout_add_layer_norm_forward( |
| x0, |
| residual, |
| gamma, |
| beta, |
| rowscale, |
| colscale, |
| dropout_p, |
| epsilon, |
| residual_in_fp32=False, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes""" |
| hidden_size = gamma.numel() |
| x0mat = x0.view((-1, hidden_size)) |
| residualmat = residual.view((-1, hidden_size)) if residual is not None else None |
| rowscale = rowscale.view(-1) if rowscale is not None else None |
| zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd( |
| x0mat, |
| residualmat, |
| gamma, |
| beta, |
| rowscale, |
| colscale, |
| None, |
| None, |
| dropout_p, |
| epsilon, |
| 1.0, |
| 0, |
| None, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| |
| |
| return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma |
|
|
|
|
| def _dropout_add_layer_norm_backward( |
| dz, |
| dx, |
| x, |
| x0, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| rowscale, |
| colscale, |
| dropout_p, |
| has_residual, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes |
| dx == None means that it was a post-norm architecture |
| (x = drop(x0) + residual was not returned in the fwd). |
| x0 must not be None if we have colscale. |
| """ |
| hidden_size = gamma.numel() |
| xmat = x.view((-1, hidden_size)) |
| dzmat = dz.view(xmat.shape) |
| dxmat = dx.view(xmat.shape) if dx is not None else None |
| x0mat = x0.view((-1, hidden_size)) if x0 is not None else None |
| rowscale = rowscale.view(-1) if rowscale is not None else None |
| if colscale is not None: |
| assert x0 is not None, "x0 is required to compute the gradient of colscale" |
| dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd( |
| dzmat, |
| dxmat, |
| xmat, |
| x0mat, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| rowscale, |
| colscale, |
| None, |
| None, |
| dropout_p, |
| 1.0, |
| 0, |
| has_residual, |
| is_rms_norm, |
| ) |
| |
| if colscale is None: |
| return dx0mat, dresidualmat, dgamma, dbeta |
| else: |
| dcolscale = rest[0] |
| return dx0mat, dresidualmat, dgamma, dbeta, dcolscale |
|
|
|
|
| def _dropout_add_layer_norm_subset_forward( |
| x0, |
| residual, |
| gamma, |
| beta, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| epsilon, |
| rowscale_const, |
| out_numrows, |
| residual_in_fp32=False, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes""" |
| hidden_size = gamma.numel() |
| x0mat = x0.view((-1, hidden_size)) |
| residualmat = residual.view((-1, hidden_size)) if residual is not None else None |
| x0_subset = x0_subset.view(-1) if x0_subset is not None else None |
| out_subset = out_subset.view(-1) if out_subset is not None else None |
| zmat, xmat, dmask, mu, rsigma = dropout_layer_norm.dropout_add_ln_fwd( |
| x0mat, |
| residualmat, |
| gamma, |
| beta, |
| None, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| epsilon, |
| rowscale_const, |
| out_numrows, |
| None, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| |
| |
| return zmat, xmat if xmat is not None else x0mat, dmask, mu, rsigma |
|
|
|
|
| def _dropout_add_layer_norm_subset_backward( |
| dz, |
| dx, |
| x, |
| x0, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| rowscale_const, |
| x0_numrows, |
| has_residual, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes |
| dx == None means that it was a post-norm architecture |
| (x = drop(x0) + residual was not returned in the fwd). |
| x0 must not be None if we have colscale. |
| """ |
| hidden_size = gamma.numel() |
| xmat = x.view((-1, hidden_size)) |
| dzmat = dz.view(-1, hidden_size) |
| dxmat = dx.view(xmat.shape) if dx is not None else None |
| x0mat = x0.view((-1, hidden_size)) if x0 is not None else None |
| x0_subset = x0_subset.view(-1) if x0_subset is not None else None |
| out_subset = out_subset.view(-1) if out_subset is not None else None |
| if colscale is not None: |
| assert x0 is not None, "x0 is required to compute the gradient of colscale" |
| dx0mat, dresidualmat, dgamma, dbeta, _, _, *rest = dropout_layer_norm.dropout_add_ln_bwd( |
| dzmat, |
| dxmat, |
| xmat, |
| x0mat, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| None, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| rowscale_const, |
| x0_numrows, |
| has_residual, |
| is_rms_norm, |
| ) |
| |
| if colscale is None: |
| return dx0mat, dresidualmat, dgamma, dbeta |
| else: |
| dcolscale = rest[0] |
| return dx0mat, dresidualmat, dgamma, dbeta, dcolscale |
|
|
|
|
| def _dropout_add_layer_norm_parallel_residual_forward( |
| x0, |
| x1, |
| residual, |
| gamma0, |
| beta0, |
| gamma1, |
| beta1, |
| dropout_p, |
| epsilon, |
| residual_in_fp32=False, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes""" |
| hidden_size = gamma0.numel() |
| x0mat = x0.view((-1, hidden_size)) |
| x1mat = x1.view((-1, hidden_size)) if x1 is not None else None |
| residualmat = residual.view((-1, hidden_size)) if residual is not None else None |
| ( |
| z0mat, |
| z1mat, |
| xmat, |
| dmask0, |
| dmask1, |
| mu, |
| rsigma, |
| ) = dropout_layer_norm.dropout_add_ln_parallel_residual_fwd( |
| x0mat, |
| x1mat, |
| residualmat, |
| gamma0, |
| beta0, |
| gamma1, |
| beta1, |
| dropout_p, |
| epsilon, |
| None, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| |
| |
| return z0mat, z1mat, xmat if xmat is not None else x0mat, dmask0, dmask1, mu, rsigma |
|
|
|
|
| def _dropout_add_layer_norm_parallel_residual_backward( |
| dz0, |
| dz1, |
| dx, |
| x, |
| dmask0, |
| dmask1, |
| mu, |
| rsigma, |
| gamma0, |
| gamma1, |
| dropout_p, |
| has_x1, |
| has_residual, |
| is_rms_norm=False, |
| ): |
| """Assume that arguments are contiguous and aligned to 16 bytes |
| dx == None means that it was a post-norm architecture |
| (x = drop(x0) + residual was not returned in the fwd). |
| """ |
| hidden_size = gamma0.numel() |
| xmat = x.view((-1, hidden_size)) |
| dz0mat = dz0.view(xmat.shape) |
| dz1mat = dz1.view(xmat.shape) if dz1 is not None else None |
| dxmat = dx.view(xmat.shape) if dx is not None else None |
| ( |
| dx0mat, |
| dx1mat, |
| dresidualmat, |
| dgamma0, |
| dbeta0, |
| dgamma1, |
| dbeta1, |
| *rest, |
| ) = dropout_layer_norm.dropout_add_ln_parallel_residual_bwd( |
| dz0mat, |
| dz1mat, |
| dxmat, |
| xmat, |
| dmask0, |
| dmask1, |
| mu, |
| rsigma, |
| gamma0, |
| gamma1, |
| dropout_p, |
| has_x1, |
| has_residual, |
| is_rms_norm, |
| ) |
| |
| return dx0mat, dx1mat, dresidualmat, dgamma0, dbeta0, dgamma1, dbeta1 |
|
|
|
|
| class DropoutAddLayerNormFn(torch.autograd.Function): |
| @staticmethod |
| def forward( |
| ctx, |
| x0, |
| residual, |
| gamma, |
| beta, |
| rowscale, |
| colscale, |
| dropout_p, |
| epsilon, |
| residual_in_fp32=False, |
| prenorm=False, |
| is_rms_norm=False, |
| return_dmask=False, |
| ): |
| x0 = maybe_align(x0.contiguous(), 16) |
| residual = maybe_align(residual.contiguous(), 16) if residual is not None else None |
| gamma = maybe_align(gamma.contiguous(), 16) |
| beta = maybe_align(beta.contiguous(), 16) if beta is not None else None |
| rowscale = maybe_align(rowscale.contiguous(), 16) if rowscale is not None else None |
| colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None |
| zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_forward( |
| x0, |
| residual, |
| gamma, |
| beta, |
| rowscale, |
| colscale, |
| dropout_p, |
| epsilon, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| |
| x0_saved = x0 if colscale is not None else None |
| ctx.save_for_backward( |
| xmat.view(x0.shape), x0_saved, dmask, gamma, mu, rsigma, rowscale, colscale |
| ) |
| ctx.prenorm = prenorm |
| ctx.dropout_p = dropout_p |
| ctx.has_residual = residual is not None |
| ctx.is_rms_norm = is_rms_norm |
| ctx.has_beta = beta is not None |
| if not return_dmask: |
| return ( |
| zmat.view(x0.shape) if not prenorm else (zmat.view(x0.shape), xmat.view(x0.shape)) |
| ) |
| else: |
| dmask = ( |
| dmask.view(x0.shape) |
| if dropout_p > 0.0 |
| else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device) |
| ) |
| ctx.mark_non_differentiable(dmask) |
| return ( |
| (zmat.view(x0.shape), dmask) |
| if not prenorm |
| else (zmat.view(x0.shape), xmat.view(x0.shape), dmask) |
| ) |
|
|
| @staticmethod |
| def backward(ctx, dz, *args): |
| |
| dz = maybe_align(dz.contiguous(), 16) |
| dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None |
| x, x0, dmask, gamma, mu, rsigma, rowscale, colscale = ctx.saved_tensors |
| |
| dropout_p = ctx.dropout_p |
| has_residual = ctx.has_residual |
| dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_backward( |
| dz, |
| dx, |
| x, |
| x0, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| rowscale, |
| colscale, |
| dropout_p, |
| has_residual, |
| ctx.is_rms_norm, |
| ) |
| dx0 = dx0mat.view(x.shape) |
| dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None |
| dcolscale = rest[0] if colscale is not None else None |
| return ( |
| dx0, |
| dresidual, |
| dgamma, |
| dbeta if ctx.has_beta else None, |
| None, |
| dcolscale, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| ) |
|
|
|
|
| class DropoutAddLayerNormSubsetFn(torch.autograd.Function): |
| @staticmethod |
| def forward( |
| ctx, |
| x0, |
| residual, |
| gamma, |
| beta, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| epsilon, |
| rowscale_const, |
| out_numrows, |
| residual_in_fp32=False, |
| prenorm=False, |
| is_rms_norm=False, |
| return_dmask=False, |
| ): |
| x0 = maybe_align(x0.contiguous(), 16) |
| residual = maybe_align(residual.contiguous(), 16) if residual is not None else None |
| gamma = maybe_align(gamma.contiguous(), 16) |
| beta = maybe_align(beta.contiguous(), 16) if beta is not None else None |
| colscale = maybe_align(colscale.contiguous(), 16) if colscale is not None else None |
| zmat, xmat, dmask, mu, rsigma = _dropout_add_layer_norm_subset_forward( |
| x0, |
| residual, |
| gamma, |
| beta, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| epsilon, |
| rowscale_const, |
| out_numrows, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| |
| x0_saved = x0 if colscale is not None else None |
| x_shape = (-1, *x0.shape[1:]) |
| ctx.save_for_backward( |
| xmat.view(x_shape), x0_saved, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset |
| ) |
| ctx.prenorm = prenorm |
| ctx.dropout_p = dropout_p |
| ctx.rowscale_const = rowscale_const |
| ctx.x0_numrows = x0.shape[:-1].numel() |
| ctx.has_residual = residual is not None |
| ctx.is_rms_norm = is_rms_norm |
| ctx.has_beta = beta is not None |
| z_shape = (-1, *x0.shape[1:]) |
| if not return_dmask: |
| return zmat.view(z_shape) if not prenorm else (zmat.view(z_shape), xmat.view(x0.shape)) |
| else: |
| z = zmat.view(z_shape) |
| dmask = ( |
| dmask.view(x0.shape) |
| if dropout_p > 0.0 |
| else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device) |
| ) |
| ctx.mark_non_differentiable(dmask) |
| return (z, dmask) if not prenorm else (z, xmat.view(x_shape), dmask) |
|
|
| @staticmethod |
| def backward(ctx, dz, *args): |
| |
| dz = maybe_align(dz.contiguous(), 16) |
| dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None |
| x, x0, dmask, gamma, mu, rsigma, colscale, x0_subset, out_subset = ctx.saved_tensors |
| |
| dropout_p = ctx.dropout_p |
| has_residual = ctx.has_residual |
| dx0mat, dresidualmat, dgamma, dbeta, *rest = _dropout_add_layer_norm_subset_backward( |
| dz, |
| dx, |
| x, |
| x0, |
| dmask, |
| mu, |
| rsigma, |
| gamma, |
| colscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| ctx.rowscale_const, |
| ctx.x0_numrows, |
| has_residual, |
| ctx.is_rms_norm, |
| ) |
| dx0 = dx0mat.view(-1, *x.shape[1:]) |
| dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None |
| dcolscale = rest[0] if colscale is not None else None |
| return ( |
| dx0, |
| dresidual, |
| dgamma, |
| dbeta if ctx.has_beta else None, |
| dcolscale, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| ) |
|
|
|
|
| class DropoutAddLayerNormParallelResidualFn(torch.autograd.Function): |
| @staticmethod |
| def forward( |
| ctx, |
| x0, |
| x1, |
| residual, |
| gamma0, |
| beta0, |
| gamma1, |
| beta1, |
| dropout_p, |
| epsilon, |
| residual_in_fp32=False, |
| prenorm=False, |
| is_rms_norm=False, |
| return_dmask=False, |
| ): |
| x0 = maybe_align(x0.contiguous(), 16) |
| x1 = maybe_align(x1.contiguous(), 16) if x1 is not None else None |
| residual = maybe_align(residual.contiguous(), 16) if residual is not None else None |
| gamma0 = maybe_align(gamma0.contiguous(), 16) |
| beta0 = maybe_align(beta0.contiguous(), 16) if beta0 is not None else None |
| gamma1 = maybe_align(gamma1.contiguous(), 16) if gamma1 is not None else None |
| beta1 = maybe_align(beta1.contiguous(), 16) if beta1 is not None else None |
| ( |
| z0mat, |
| z1mat, |
| xmat, |
| dmask0, |
| dmask1, |
| mu, |
| rsigma, |
| ) = _dropout_add_layer_norm_parallel_residual_forward( |
| x0, |
| x1, |
| residual, |
| gamma0, |
| beta0, |
| gamma1, |
| beta1, |
| dropout_p, |
| epsilon, |
| residual_in_fp32, |
| is_rms_norm, |
| ) |
| ctx.save_for_backward(xmat.view(x0.shape), dmask0, dmask1, gamma0, gamma1, mu, rsigma) |
| ctx.prenorm = prenorm |
| ctx.dropout_p = dropout_p |
| ctx.has_x1 = x1 is not None |
| ctx.has_residual = residual is not None |
| ctx.is_rms_norm = is_rms_norm |
| ctx.has_beta = beta0 is not None |
| z = (z0mat.view(x0.shape), z1mat.view(x0.shape) if z1mat is not None else None) |
| if not return_dmask: |
| return z if not prenorm else (*z, xmat.view(x0.shape)) |
| else: |
| dmask0 = ( |
| dmask0.view(x0.shape) |
| if dropout_p > 0.0 |
| else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device) |
| ) |
| dmask1 = ( |
| dmask1.view(x0.shape) |
| if dropout_p > 0.0 and x1 is not None |
| else torch.ones(x0.shape, dtype=torch.uint8, device=x0.device) |
| ) |
| ctx.mark_non_differentiable(dmask0) |
| ctx.mark_non_differentiable(dmask1) |
| return ( |
| (*z, dmask0, dmask1) if not prenorm else (*z, xmat.view(x0.shape), dmask0, dmask1) |
| ) |
|
|
| @staticmethod |
| def backward(ctx, dz0, dz1, *args): |
| dz0 = maybe_align(dz0.contiguous(), 16) |
| dz1 = maybe_align(dz1.contiguous(), 16) if dz1 is not None else None |
| dx = maybe_align(args[0].contiguous(), 16) if ctx.prenorm else None |
| x, dmask0, dmask1, gamma0, gamma1, mu, rsigma = ctx.saved_tensors |
| dropout_p = ctx.dropout_p |
| has_x1 = ctx.has_x1 |
| has_residual = ctx.has_residual |
| ( |
| dx0mat, |
| dx1mat, |
| dresidualmat, |
| dgamma0, |
| dbeta0, |
| dgamma1, |
| dbeta1, |
| ) = _dropout_add_layer_norm_parallel_residual_backward( |
| dz0, |
| dz1, |
| dx, |
| x, |
| dmask0, |
| dmask1, |
| mu, |
| rsigma, |
| gamma0, |
| gamma1, |
| dropout_p, |
| has_x1, |
| has_residual, |
| ctx.is_rms_norm, |
| ) |
| dx0 = dx0mat.view(x.shape) |
| dx1 = dx1mat.view(x.shape) if dx1mat is not None else None |
| dresidual = dresidualmat.view(x.shape) if dresidualmat is not None else None |
| return ( |
| dx0, |
| dx1, |
| dresidual, |
| dgamma0, |
| dbeta0 if ctx.has_beta else None, |
| dgamma1, |
| dbeta1 if ctx.has_beta else None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| None, |
| ) |
|
|
|
|
| def layer_norm(x, weight, bias, epsilon): |
| return DropoutAddLayerNormFn.apply(x, None, weight, bias, None, None, 0.0, epsilon, False) |
|
|
|
|
| def dropout_add_layer_norm( |
| x0, |
| residual, |
| weight, |
| bias, |
| dropout_p, |
| epsilon, |
| rowscale=None, |
| layerscale=None, |
| prenorm=False, |
| residual_in_fp32=False, |
| return_dropout_mask=False, |
| ): |
| """residual_in_fp32 only has an effect if residual is None. |
| Otherwise residual dtype is residual.dtype. |
| """ |
| return DropoutAddLayerNormFn.apply( |
| x0, |
| residual, |
| weight, |
| bias, |
| rowscale, |
| layerscale, |
| dropout_p, |
| epsilon, |
| residual_in_fp32, |
| prenorm, |
| False, |
| return_dropout_mask, |
| ) |
|
|
|
|
| def dropout_add_layer_norm_subset( |
| x0, |
| residual, |
| weight, |
| bias, |
| dropout_p, |
| epsilon, |
| layerscale=None, |
| x0_subset=None, |
| out_subset=None, |
| rowscale_const=1.0, |
| out_numrows=0, |
| prenorm=False, |
| residual_in_fp32=False, |
| return_dropout_mask=False, |
| ): |
| """residual_in_fp32 only has an effect if residual is None. |
| Otherwise residual dtype is residual.dtype. |
| """ |
| return DropoutAddLayerNormSubsetFn.apply( |
| x0, |
| residual, |
| weight, |
| bias, |
| layerscale, |
| x0_subset, |
| out_subset, |
| dropout_p, |
| epsilon, |
| rowscale_const, |
| out_numrows, |
| residual_in_fp32, |
| prenorm, |
| False, |
| return_dropout_mask, |
| ) |
|
|
|
|
| def dropout_add_layer_norm_parallel_residual( |
| x0, |
| x1, |
| residual, |
| weight0, |
| bias0, |
| weight1, |
| bias1, |
| dropout_p, |
| epsilon, |
| prenorm=False, |
| residual_in_fp32=False, |
| return_dropout_mask=False, |
| ): |
| """residual_in_fp32 only has an effect if residual is None. |
| Otherwise residual dtype is residual.dtype. |
| """ |
| return DropoutAddLayerNormParallelResidualFn.apply( |
| x0, |
| x1, |
| residual, |
| weight0, |
| bias0, |
| weight1, |
| bias1, |
| dropout_p, |
| epsilon, |
| residual_in_fp32, |
| prenorm, |
| False, |
| return_dropout_mask, |
| ) |
|
|
|
|
| class DropoutAddLayerNorm(torch.nn.Module): |
| def __init__( |
| self, |
| hidden_size, |
| prenorm=False, |
| p=0.0, |
| eps=1e-5, |
| residual_in_fp32=False, |
| device=None, |
| dtype=None, |
| ): |
| factory_kwargs = {"device": device, "dtype": dtype} |
| super().__init__() |
| self.prenorm = prenorm |
| self.p = p |
| self.eps = eps |
| self.residual_in_fp32 = residual_in_fp32 |
| self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) |
| self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) |
| self.reset_parameters() |
|
|
| def reset_parameters(self): |
| init.ones_(self.weight) |
| init.zeros_(self.bias) |
|
|
| def forward(self, x0, residual=None): |
| return dropout_add_layer_norm( |
| x0, |
| residual, |
| self.weight, |
| self.bias, |
| self.p if self.training else 0.0, |
| self.eps, |
| prenorm=self.prenorm, |
| residual_in_fp32=self.residual_in_fp32, |
| ) |
| |
| def rms_norm(x, weight, epsilon): |
| return DropoutAddLayerNormFn.apply( |
| x, None, weight, None, None, None, 0.0, epsilon, False, False, True |
| ) |
| class FusedRMSNorm(torch.nn.Module): |
| def __init__(self, size: int, dim: int = -1, eps: float = 1e-5): |
| super().__init__() |
| self.eps = eps |
| self.weight = torch.nn.Parameter(torch.ones(size)) |
| self.dim = dim |
| self.reset_parameters() |
|
|
| def reset_parameters(self): |
| init.ones_(self.weight) |
|
|
| def forward(self, x): |
| return rms_norm(x, self.weight, self.eps) |
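| |
| # Hedged note: FusedRMSNorm and the eager RMSNorm below compute the same |
| # normalization, x * rsqrt(mean(x^2) + eps) * weight (up to kernel numerics); |
| # the fused version simply routes through the dropout_layer_norm kernel with |
| # dropout_p = 0 and no bias. |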
| |
| |
| class RMSNorm(torch.nn.Module): |
| """Root Mean Square Layer Normalization. |
| |
| Derived from https://github.com/bzhangGo/rmsnorm/blob/master/rmsnorm_torch.py. BSD 3-Clause License: |
| https://github.com/bzhangGo/rmsnorm/blob/master/LICENSE. |
| """ |
|
|
| def __init__(self, size: int, dim: int = -1, eps: float = 1e-5) -> None: |
| super().__init__() |
| self.weight = torch.nn.Parameter(torch.ones(size)) |
| self.eps = eps |
| self.dim = dim |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| |
| norm_x = torch.mean(x * x, dim=self.dim, keepdim=True) |
| x_normed = x * torch.rsqrt(norm_x + self.eps) |
| return self.weight * x_normed |
|
|
| def reset_parameters(self): |
| torch.nn.init.ones_(self.weight) |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
|
|
|
|
|
|
| |
| |
| |
| |
|
|
| import rotary_emb |
|
|
| class ApplyRotaryEmb(torch.autograd.Function): |
| @staticmethod |
| def forward(ctx, x, cos, sin, interleaved=False, inplace=False,future_token = 0): |
| """ |
| x: (batch_size, seqlen, nheads, headdim) |
| cos, sin: (seqlen, rotary_dim / 2) |
| interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead |
| of 1st half and 2nd half (GPT-NeoX style). |
| rotary_dim must be <= headdim |
| Apply rotary embedding to the first rotary_dim of x. |
| """ |
| batch, seqlen, nheads, headdim = x.shape |
| rotary_seqlen, rotary_dim = cos.shape |
| rotary_dim *= 2 |
|
|
|
|
| |
| |
| |
| assert rotary_dim <= headdim |
| |
| assert seqlen <= rotary_seqlen |
| assert sin.shape == (rotary_seqlen, rotary_dim // 2) |
| x_ro = x[..., :rotary_dim] |
| x1, x2 = x_ro.chunk(2, dim=-1) if not interleaved else (x_ro[..., ::2], x_ro[..., 1::2]) |
| out = torch.empty_like(x) if not inplace else x |
| out_ro = out[..., :rotary_dim] |
| if inplace: |
| o1, o2 = x1, x2 |
| else: |
| o1, o2 = ( |
| out_ro.chunk(2, dim=-1) |
| if not interleaved |
| else (out_ro[..., ::2], out_ro[..., 1::2]) |
| ) |
| rotary_emb.apply_rotary( |
| x1, |
| x2, |
| rearrange(cos[:seqlen], "s d -> s 1 d"), |
| rearrange(sin[:seqlen], "s d -> s 1 d"), |
| o1, |
| o2, |
| False, |
| ) |
| if not inplace and rotary_dim < headdim: |
| out[..., rotary_dim:].copy_(x[..., rotary_dim:]) |
| ctx.save_for_backward(cos, sin) |
| ctx.interleaved = interleaved |
| ctx.inplace = inplace |
| return out if not inplace else x |
|
|
| @staticmethod |
| def backward(ctx, do): |
| cos, sin = ctx.saved_tensors |
| _, seqlen, _, headdim = do.shape |
| rotary_dim = cos.shape[-1] |
| rotary_dim *= 2 |
| inplace = ctx.inplace |
| do_ro = do[..., :rotary_dim] |
| do1, do2 = ( |
| do_ro.chunk(2, dim=-1) if not ctx.interleaved else (do_ro[..., ::2], do_ro[..., 1::2]) |
| ) |
| dx = torch.empty_like(do) if not inplace else do |
| if inplace: |
| dx1, dx2 = do1, do2 |
| else: |
| dx_ro = dx[..., :rotary_dim] |
| dx1, dx2 = ( |
| dx_ro.chunk(2, dim=-1) |
| if not ctx.interleaved |
| else (dx_ro[..., ::2], dx_ro[..., 1::2]) |
| ) |
| rotary_emb.apply_rotary( |
| do1, |
| do2, |
| rearrange(cos[:seqlen], "s d -> s 1 d"), |
| rearrange(sin[:seqlen], "s d -> s 1 d"), |
| dx1, |
| dx2, |
| True, |
| ) |
| if not inplace and rotary_dim < headdim: |
| dx[..., rotary_dim:].copy_(do[..., rotary_dim:]) |
| return dx, None, None, None, None |
|
|
|
|
| apply_rotary_emb_func = ApplyRotaryEmb.apply |
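| |
| # --- Hedged usage note (illustration only) --- |
| # apply_rotary_emb_func(x, cos, sin, interleaved, inplace) expects |
| # x: (batch, seqlen, nheads, headdim) and cos/sin: (seqlen, rotary_dim / 2). |
| # BidirectedlSelfAttention calls it with interleaved=False and inplace=True, |
| # i.e. the GPT-NeoX-style half rotation applied to q and k in place. |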
|
|
|
|