|
|
import torch |
|
|
from torch import nn |
|
|
import einops |
|
|
from typing import Tuple |
|
|
import random |
|
|
import numpy as np |
|
|
from tqdm import tqdm |
|
|
from .modules import DuoFrameDownEncoder,Upsampler,MapConv,MotionDownEncoder |
|
|
from .loss import l1,l2 |
|
|
from .transformer import (MotionTransformer, |
|
|
AMDDiffusionTransformerModel, |
|
|
MotionEncoderLearnTokenTransformer, |
|
|
AMDReconstructTransformerModel, |
|
|
AMDDiffusionTransformerModelDualStream, |
|
|
AMDDiffusionTransformerModelImgSpatial, |
|
|
AMDDiffusionTransformerModelImgSpatialDoubleRef, |
|
|
AMDReconstructTransformerModelSpatial) |
|
|
from .rectified_flow import RectifiedFlow |
|
|
from diffusers.configuration_utils import ConfigMixin, register_to_config |
|
|
from diffusers.models.modeling_utils import ModelMixin |
|
|
from diffusers.models.resnet import ResnetBlock2D |
|
|
import einops |
|
|
import torch.nn.functional as F |
|
|
|
|
|
from diffusers.utils import export_to_gif |
|
|
|
|
|
class AMDModel(ModelMixin, ConfigMixin):
    """Motion-conditioned rectified-flow video diffusion model.

    Pipeline (as visible in this file):
      1. A ``MotionEncoderLearnTokenTransformer`` compresses a concatenation of
         reference frames and target video frames into per-frame motion tokens.
      2. A diffusion transformer (one of four variants, selected by
         ``diffusion_model_type``) predicts the rectified-flow velocity of the
         video latents, conditioned on source/target motion tokens and the
         reference latents.
      3. ``RectifiedFlow`` supplies the (zt, velocity) training pairs and the
         Euler integration targets used in :meth:`sample`.

    Inputs are assumed to be latent images (e.g. VAE latents with 4 channels at
    32x32 by default) — TODO confirm against the data pipeline.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(self,
                # ---- latent image geometry ----
                image_inchannel :int = 4,
                image_height :int = 32,
                image_width :int = 32,
                video_frames :int = 16,
                scheduler_num_step :int = 1000,

                # ---- motion encoder ----
                motion_token_num:int = 12,
                motion_token_channel: int = 128,
                enc_num_layers:int = 8,
                enc_nhead:int = 8,
                enc_ndim:int = 64,
                enc_dropout:float = 0.0,
                motion_need_norm_out:bool = False,

                # ---- optional motion transformer (post-processes target motion) ----
                need_motion_transformer :bool = False,
                motion_transformer_attn_head_dim:int = 64,
                motion_transformer_attn_num_heads:int = 16,
                motion_transformer_num_layers:int = 4,

                # ---- diffusion transformer ----
                diffusion_model_type : str = 'default',
                diffusion_attn_head_dim : int = 64,
                diffusion_attn_num_heads : int = 16,
                diffusion_out_channels : int = 4,
                diffusion_num_layers : int = 16,
                image_patch_size : int = 2,
                motion_patch_size : int = 1,
                motion_drop_ratio: float = 0.0,   # NOTE(review): accepted but never read in this class
                refimg_drop: bool = False,        # when True, the reference latent zi is zeroed in forward/sample

                extract_motion_with_motion_transformer = False,
                **kwargs,
                ):
        super().__init__()

        self.num_step = scheduler_num_step
        self.scheduler = RectifiedFlow(num_steps=scheduler_num_step)
        self.need_motion_transformer = need_motion_transformer
        self.extract_motion_with_motion_transformer = extract_motion_with_motion_transformer
        self.diffusion_model_type = diffusion_model_type
        self.target_frame = video_frames
        self.refimg_drop = refimg_drop

        # Encoder maps (N, 2T, C, H, W) frame stacks to (N, 2T, L, D) motion tokens.
        self.motion_encoder = MotionEncoderLearnTokenTransformer(img_height = image_height,
                                                                img_width=image_width,
                                                                img_inchannel=image_inchannel,
                                                                img_patch_size = image_patch_size,
                                                                motion_token_num = motion_token_num,
                                                                motion_channel = motion_token_channel,
                                                                need_norm_out = motion_need_norm_out,

                                                                num_attention_heads=enc_nhead,
                                                                attention_head_dim=enc_ndim,
                                                                num_layers=enc_num_layers,
                                                                dropout=enc_dropout,
                                                                attention_bias= True,)

        if need_motion_transformer:
            self.motion_transformer = MotionTransformer(motion_token_num=motion_token_num,
                                                        motion_token_channel=motion_token_channel,
                                                        attention_head_dim=motion_transformer_attn_head_dim,
                                                        num_attention_heads=motion_transformer_attn_num_heads,
                                                        num_layers=motion_transformer_num_layers,)

        # Backbone dispatch. All variants except 'doubleref' receive the
        # ref latent concatenated channel-wise with zt, hence in_channels * 2.
        if diffusion_model_type == 'default':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModel(num_attention_heads= diffusion_attn_num_heads,
                                                                    attention_head_dim= diffusion_attn_head_dim,
                                                                    out_channels = diffusion_out_channels,
                                                                    num_layers= diffusion_num_layers,

                                                                    image_width= image_width,
                                                                    image_height= image_height,
                                                                    image_patch_size= image_patch_size,
                                                                    image_in_channels = dit_image_inchannel,

                                                                    motion_token_num = motion_token_num,
                                                                    motion_in_channels = motion_token_channel,)
        elif diffusion_model_type == 'dual':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModelDualStream(num_attention_heads= diffusion_attn_num_heads,
                                                                    attention_head_dim= diffusion_attn_head_dim,
                                                                    out_channels = diffusion_out_channels,
                                                                    num_layers= diffusion_num_layers,

                                                                    image_width= image_width,
                                                                    image_height= image_height,
                                                                    image_patch_size= image_patch_size,
                                                                    image_in_channels = dit_image_inchannel,

                                                                    motion_token_num = motion_token_num,
                                                                    motion_in_channels = motion_token_channel,
                                                                    motion_target_num_frame = video_frames)
        elif diffusion_model_type == 'spatial':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModelImgSpatial(num_attention_heads= diffusion_attn_num_heads,
                                                                    attention_head_dim= diffusion_attn_head_dim,
                                                                    out_channels = diffusion_out_channels,
                                                                    num_layers= diffusion_num_layers,

                                                                    image_width= image_width,
                                                                    image_height= image_height,
                                                                    image_patch_size= image_patch_size,
                                                                    image_in_channels = dit_image_inchannel,

                                                                    motion_token_num = motion_token_num,
                                                                    motion_in_channels = motion_token_channel,
                                                                    motion_target_num_frame = video_frames)
        elif diffusion_model_type == 'doubleref':
            # doubleref passes the random reference separately, so no channel doubling.
            dit_image_inchannel = image_inchannel
            self.diffusion_transformer = AMDDiffusionTransformerModelImgSpatialDoubleRef(num_attention_heads= diffusion_attn_num_heads,
                                                                    attention_head_dim= diffusion_attn_head_dim,
                                                                    out_channels = diffusion_out_channels,
                                                                    num_layers= diffusion_num_layers,

                                                                    image_width= image_width,
                                                                    image_height= image_height,
                                                                    image_patch_size= image_patch_size,
                                                                    image_in_channels = dit_image_inchannel,

                                                                    motion_token_num = motion_token_num,
                                                                    motion_in_channels = motion_token_channel,
                                                                    motion_target_num_frame = video_frames)
        else:
            # NOTE(review): IndexError is a surprising type for a bad config
            # value — a ValueError with the offending string would be clearer.
            raise IndexError

    def forward(self,
                video:torch.tensor,
                ref_img:torch.Tensor ,
                randomref_img:torch.Tensor = None,
                time_step:torch.tensor = None,
                return_meta_info=False,
                mask_ratio=None,
                **kwargs,):
        """
        Training step: encode motion, form a rectified-flow pair and predict velocity.

        Args:
            video: (N,T,C,H,W)
            ref_img: (N,T,C,H,W)
            randomref_img : (N,T,C,H,W); required (and used) only for 'doubleref'
            time_step: optional explicit timesteps; sampled randomly when None
            return_meta_info: return intermediate tensors instead of the loss tuple
            mask_ratio: upper bound for a uniformly sampled token-mask ratio

        Returns:
            (pre, vel, loss_dict) — prediction, velocity target, and
            {'loss','diff_loss','rec_loss'}; or a meta-info dict when
            return_meta_info is True.
        """

        device = video.device
        n,t,c,h,w = video.shape

        assert video.shape == ref_img.shape ,f'video.shape:{video.shape}should be equal to ref_img.shape:{ref_img.shape}'
        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Randomize the effective mask ratio in [0, mask_ratio) per forward call.
        if mask_ratio is not None:
            mask_ratio = torch.rand(1).item() * mask_ratio

        # Motion encoder sees reference frames followed by target frames along T.
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            if randomref_img.dim()==4:
                randomref_img = randomref_img.unsqueeze(1).repeat(1,t,1,1,1)
            refimg_and_video = torch.cat([randomref_img,video],dim=1)
        else:
            refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video,mask_ratio)

        # Split tokens back into reference-half and video-half; fold N,T into batch.
        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        zi = ref_img.flatten(0,1)   # reference latents, (N*T, C, H, W)
        zj = video.flatten(0,1)     # clean target latents
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.flatten(0,1)

        if time_step is None:
            time_step = self.prepare_timestep(batch_size= zj.shape[0],device= device)
            # Non-default backbones share one timestep across all frames of a video.
            if self.diffusion_model_type != 'default':
                time_step = self.prepare_timestep(batch_size= n,device= device)
                time_step = time_step.repeat_interleave(t)
        zt,vel = self.scheduler.get_train_tuple(z1=zj,time_step=time_step)

        # Optionally drop the reference conditioning entirely.
        # NOTE(review): this zeroes zi unconditionally whenever refimg_drop is
        # set — it is not a stochastic classifier-free-guidance drop.
        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(video.device)
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                        motion_target_hidden_states = target_motion,
                                        image_hidden_states = image_hidden_states,
                                        randomref_image_hidden_states = randomref_img,
                                        timestep = time_step,)

        diff_loss = l2(pre,vel)

        # Reconstruction loss is computed for logging only; it does not
        # contribute to 'loss' below.
        rec_zj = self.scheduler.get_target_with_zt_vel(zt,pre,time_step)
        rec_loss = l2(rec_zj,zj)

        loss = diff_loss

        loss_dict = {'loss':loss,'diff_loss':diff_loss,'rec_loss':rec_loss}

        if return_meta_info:
            return {'motion' : motion,
                    'zi' : zi,
                    'zj' : zj,
                    'zt' : zt,
                    'gt' : vel,
                    'pre': pre,
                    'time_step': time_step,
                    }
        else:
            return pre,vel,loss_dict

    def get_noise_latent_pair(self,
                            video:torch.Tensor,
                            ref_img:torch.Tensor ,
                            randomref_img:torch.Tensor,
                            sample_step:int = 50,
                            ):
        # TODO: not implemented — placeholder kept for interface compatibility.
        pass

    @torch.no_grad()
    def sample(self,video:torch.Tensor,
                ref_img:torch.Tensor ,
                randomref_img:torch.Tensor = None,
                sample_step:int = 50,
                mask_ratio = None,
                start_step:int = None,
                return_meta_info=False,
                **kwargs,):
        """Euler-integrate the learned velocity field from noise to latents.

        Motion is extracted from the provided ``video`` (i.e. this is a
        reconstruction-style sampling path). Returns (zi, zt, zj) reshaped to
        (N,T,C,H,W), or a meta-info dict with per-step caches.
        """

        device = video.device
        n,t,c,h,w = video.shape

        if start_step is None:
            start_step = self.scheduler.num_step
        assert start_step <= self.scheduler.num_step , 'start_step cant be larger than scheduler.num_step'

        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Broadcast a single reference frame across all T target frames.
        if ref_img.dim()==4:
            ref_img = ref_img.unsqueeze(1).repeat(1,t,1,1,1)

        if mask_ratio is not None:
            print(f'* Sampling with Mask_Ratio = {mask_ratio}')
            # NOTE(review): no-op self-assignment — unlike forward(), sampling
            # uses the given mask_ratio directly (no random rescaling).
            mask_ratio = mask_ratio

        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            if randomref_img.dim()==4:
                randomref_img = randomref_img.unsqueeze(1).repeat(1,t,1,1,1)
            refimg_and_video = torch.cat([randomref_img,video],dim=1)
        else:
            refimg_and_video = torch.cat([ref_img,video],dim=1)

        motion = self.motion_encoder(refimg_and_video,mask_ratio)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # Initial timestep = start_step for every flattened frame.
        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.flatten(0,1)
        # get_train_tuple at start_step yields the initial noisy state zt.
        zt,vel = self.scheduler.get_train_tuple(z1=zj,time_step=time_step)
        noise = zj - vel

        pre_cache = []
        sample_cache = []

        # Descending timestep schedule: start_step -> (exclusive) 0.
        step_seq = np.linspace(0, start_step, num=sample_step+1, endpoint=True,dtype=int)
        step_seq = list(reversed(step_seq[1:]))

        dt = 1./sample_step

        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(video.device)

        for i in tqdm(step_seq):
            time_step = torch.ones((zt.shape[0],)).to(zt.device)
            time_step = time_step * i

            zt = zt.to(video.dtype)
            image_hidden_states = torch.cat((zi,zt),dim=1)

            pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                            motion_target_hidden_states = target_motion,
                                            image_hidden_states = image_hidden_states,
                                            randomref_image_hidden_states = randomref_img,
                                            timestep = time_step,)
            # Forward Euler step along the predicted velocity.
            zt = zt + pre * dt
            pre_cache.append(pre)
            sample_cache.append(zt)

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        zt = einops.rearrange(zt,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    'sample' : zt,
                    'pre_cache' : pre_cache,
                    'sample_cache' : sample_cache,
                    'step_seq' : step_seq,
                    'motion' : target_motion,
                    "noise" : noise
                    }
        else:
            return zi,zt,zj

    @torch.no_grad()
    def sample_with_refimg_motion(self,
                                ref_img:torch.Tensor,
                                motion=torch.Tensor,
                                randomref_img:torch.Tensor = None,
                                sample_step:int = 10,
                                mask_ratio = None,
                                return_meta_info=False,
                                **kwargs,):
        """
        Generate video latents from a single reference image plus precomputed
        motion tokens (e.g. from :meth:`extract_motion`), starting from pure noise.

        Args:
            ref_img : (N,C,H,W)
            randomref_img : (N,C,H,W)
            motion : (N,F,L,D)
                NOTE(review): the signature says ``motion=torch.Tensor`` — a
                default of the *class object*, almost certainly meant to be the
                annotation ``motion:torch.Tensor``; callers must always pass it.
        Return:
            video : (N,T,C,H,W)
        """
        device = motion.device
        n,t,l,d = motion.shape

        start_step = self.scheduler.num_step

        refimg = ref_img.unsqueeze(1)
        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Source motion comes from encoding the (single-frame) reference alone.
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            print('* Warnning * diffusion_model_type:doubleref')
            if randomref_img.dim()==4:
                randomref_img = randomref_img.unsqueeze(1)
            source_motion = self.motion_encoder(randomref_img,mask_ratio)
        else:
            source_motion = self.motion_encoder(refimg,mask_ratio)

        # Repeat the single-frame source motion across all F target frames.
        source_motion = source_motion.repeat(1,t,1,1).flatten(0,1)
        target_motion = motion.flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        # Skip the motion transformer when the caller already applied it
        # during extract_motion (extract_motion_with_motion_transformer=True).
        if self.need_motion_transformer and not self.extract_motion_with_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = refimg.repeat(1,t,1,1,1).flatten(0,1)
        zj = zi   # no ground-truth video here; zj mirrors the reference
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.repeat(1,t,1,1,1)
            randomref_img = randomref_img.flatten(0,1)

        # Start integration from pure Gaussian noise.
        zt = torch.randn_like(zj)

        pre_cache = []
        sample_cache = []

        step_seq = np.linspace(0, start_step, num=sample_step+1, endpoint=True,dtype=int)
        step_seq = list(reversed(step_seq[1:]))

        dt = 1./sample_step

        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(ref_img.device)

        for i in tqdm(step_seq):
            time_step = torch.ones((zt.shape[0],)).to(zt.device)
            time_step = time_step * i

            zt = zt.to(ref_img.dtype)
            image_hidden_states = torch.cat((zi,zt),dim=1)

            pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                            motion_target_hidden_states = target_motion,
                                            image_hidden_states = image_hidden_states,
                                            randomref_image_hidden_states = randomref_img,
                                            timestep = time_step,)

            zt = zt + pre * dt

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n,t=t)
        zt = einops.rearrange(zt,'(n t) c h w -> n t c h w',n=n,t=t)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    'sample' : zt,
                    'pre_cache' : pre_cache,   # NOTE(review): never appended to in this path
                    'sample_cache' : sample_cache,
                    'step_seq' : step_seq,
                    'motion' : target_motion,
                    }
        else:
            return zi,zt,zj

    def extract_motion(self,video:torch.tensor,mask_ratio=None):
        """Encode a (N,T,C,H,W) clip into motion tokens; optionally apply the
        motion transformer when configured to bake it into extraction."""

        n,t,c,h,w = video.shape

        motion = self.motion_encoder(video,mask_ratio)

        if self.need_motion_transformer and self.extract_motion_with_motion_transformer:
            motion = self.motion_transformer(motion)

        return motion

    def prepare_timestep(self,batch_size:int,device,time_step = None):
        """Return the given timestep moved to device, or sample uniform random
        integer timesteps in [0, num_step] (upper bound inclusive)."""
        if time_step is not None:
            return time_step.to(device)
        else:
            return torch.randint(0,self.num_step+1,(batch_size,)).to(device)

    def prepare_encoder_input(self,video:torch.tensor):
        """Build consecutive-frame pairs: channel-concat frame t with frame t+1
        and flatten batch/time, yielding ((B*(T-1)), 2C, H, W)."""
        assert len(video.shape) == 5 , f'only support video data : 5D tensor , but got {video.shape}'

        pre = video[:,:-1,:,:,:]
        post= video[:,1:,:,:,:]
        duo_frame_mix = torch.cat([pre,post],dim=2)
        duo_frame_mix = einops.rearrange(duo_frame_mix,'b t c h w -> (b t) c h w')

        return duo_frame_mix

    def unpatchify(self, x ,patch_size):
        """
        Reassemble a patch-token sequence into images (assumes a square grid).

        x: (N, S, patch_size**2 *C)
        imgs: (N, C, H, W)
        """
        p = patch_size
        h = w = int(x.shape[1]**.5)

        c = x.shape[2] // (p**2)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # NOTE(review): width uses h * p rather than w * p — harmless only
        # because h == w is enforced above.
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def reset_infer_num_frame(self, num:int):
        """Override the diffusion transformer's target frame count at inference."""
        old_num = self.diffusion_transformer.target_frame
        self.diffusion_transformer.target_frame = num
        print(f'* Reset infer frame from {old_num} to {self.diffusion_transformer.target_frame} *')
|
|
|
|
|
|
|
|
class AMDModel_Rec(ModelMixin, ConfigMixin):
    """Single-pass reconstruction variant of AMDModel.

    Instead of iterative rectified-flow sampling, the transformer directly
    regresses the target latents in one forward pass. The "noisy" slot of the
    image conditioning is filled by a learned token (``zt_token``) rather than
    a diffused latent, and training minimizes an L2 reconstruction loss.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(self,
                # ---- latent image geometry ----
                image_inchannel :int = 4,
                image_height :int = 32,
                image_width :int = 32,
                video_frames :int = 16,
                scheduler_num_step :int = 1000,

                # ---- motion encoder ----
                motion_token_num:int = 12,
                motion_token_channel: int = 128,
                enc_num_layers:int = 8,
                enc_nhead:int = 8,
                enc_ndim:int = 64,
                enc_dropout:float = 0.0,
                motion_need_norm_out:bool = True,   # differs from AMDModel's default (False)

                # ---- optional motion transformer ----
                need_motion_transformer :bool = False,
                motion_transformer_attn_head_dim:int = 64,
                motion_transformer_attn_num_heads:int = 16,
                motion_transformer_num_layers:int = 4,

                # ---- reconstruction transformer ----
                diffusion_model_type : str = 'default',
                diffusion_attn_head_dim : int = 64,
                diffusion_attn_num_heads : int = 16,
                diffusion_out_channels : int = 4,
                diffusion_num_layers : int = 16,
                image_patch_size : int = 2,
                motion_patch_size : int = 1,
                motion_drop_ratio: float = 0.0,
                **kwargs,
                ):
        super().__init__()

        # Scheduler is kept for num_step bookkeeping in sample(); no diffusion
        # integration is performed in this class.
        self.num_step = scheduler_num_step
        self.scheduler = RectifiedFlow(num_steps=scheduler_num_step)
        self.need_motion_transformer = need_motion_transformer

        # Learned placeholder latent standing in for the noisy input zt.
        INIT_CONST = 0.02
        self.zt_token = nn.Parameter(torch.randn(1, image_inchannel, image_height,image_width) * INIT_CONST)

        self.motion_encoder = MotionEncoderLearnTokenTransformer(img_height = image_height,
                                                                img_width=image_width,
                                                                img_inchannel=image_inchannel,
                                                                img_patch_size = image_patch_size,
                                                                motion_token_num = motion_token_num,
                                                                motion_channel = motion_token_channel,
                                                                need_norm_out = motion_need_norm_out,

                                                                num_attention_heads=enc_nhead,
                                                                attention_head_dim=enc_ndim,
                                                                num_layers=enc_num_layers,
                                                                dropout=enc_dropout,
                                                                attention_bias= True,)

        if need_motion_transformer:
            self.motion_transformer = MotionTransformer(motion_token_num=motion_token_num,
                                                        motion_token_channel=motion_token_channel,
                                                        attention_head_dim=motion_transformer_attn_head_dim,
                                                        num_attention_heads=motion_transformer_attn_num_heads,
                                                        num_layers=motion_transformer_num_layers,)

        # NOTE(review): unlike AMDModel, there is no else-branch here — an
        # unrecognized diffusion_model_type silently leaves self.transformer
        # unset and fails later with AttributeError.
        if diffusion_model_type == 'default':
            dit_image_inchannel = image_inchannel * 2
            self.transformer = AMDReconstructTransformerModel(num_attention_heads= diffusion_attn_num_heads,
                                                            attention_head_dim= diffusion_attn_head_dim,
                                                            out_channels = diffusion_out_channels,
                                                            num_layers= diffusion_num_layers,

                                                            image_width= image_width,
                                                            image_height= image_height,
                                                            image_patch_size= image_patch_size,
                                                            image_in_channels = dit_image_inchannel,

                                                            motion_token_num = motion_token_num,
                                                            motion_in_channels = motion_token_channel,)
        elif diffusion_model_type == 'spatial':
            dit_image_inchannel = image_inchannel * 2
            self.transformer = AMDReconstructTransformerModelSpatial(num_attention_heads= diffusion_attn_num_heads,
                                                            attention_head_dim= diffusion_attn_head_dim,
                                                            out_channels = diffusion_out_channels,
                                                            num_layers= diffusion_num_layers,

                                                            image_width= image_width,
                                                            image_height= image_height,
                                                            image_patch_size= image_patch_size,
                                                            image_in_channels = dit_image_inchannel,

                                                            motion_token_num = motion_token_num,
                                                            motion_in_channels = motion_token_channel,
                                                            motion_target_num_frame = video_frames)

    def forward(self,
                video:torch.tensor,
                ref_img:torch.Tensor ,
                time_step:torch.tensor = None,
                return_meta_info=False,
                **kwargs,):
        """
        Single-pass reconstruction training step.

        Args:
            video: (N,T,C,H,W)
            ref_img: (N,T,C,H,W)
            time_step: unused here; echoed back in meta info only

        Returns:
            (pre, zj, loss_dict) with loss_dict = {'loss','rec_loss'},
            or a meta-info dict when return_meta_info is True.
        """

        device = video.device
        n,t,c,h,w = video.shape

        assert video.shape == ref_img.shape ,f'video.shape:{video.shape}should be equal to ref_img.shape:{ref_img.shape}'

        # Encode reference + video frames jointly into motion tokens.
        refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        # Learned token replaces the diffused latent of the diffusion variant.
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        image_hidden_states = torch.cat((zi,zt),dim=1)
        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        rec_loss = l2(pre,zj)

        loss = rec_loss

        loss_dict = {'loss':loss,'rec_loss':rec_loss}

        if return_meta_info:
            return {'motion' : motion,
                    'zi' : zi,
                    'zj' : zj,
                    'zt' : zt,
                    'pre': pre,
                    'time_step': time_step,
                    }
        else:
            return pre,zj,loss_dict

    @torch.no_grad()
    def sample(self,
                video:torch.tensor,
                ref_img:torch.Tensor ,
                sample_step:int = 50,
                start_step:int = None,
                return_meta_info=False,
                **kwargs,):
        """One-shot reconstruction 'sampling' — a single transformer pass.

        sample_step/start_step are accepted for interface parity with
        AMDModel.sample; start_step is only range-checked, never iterated.
        Returns (zi, zt, zj) where zt holds the prediction.
        """

        device = video.device
        n,t,c,h,w = video.shape

        if start_step is None:
            start_step = self.scheduler.num_step
        assert start_step <= self.scheduler.num_step , 'start_step cant be larger than scheduler.num_step'

        refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        zt = zt.to(video.dtype)
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        # The prediction itself is returned in the zt slot.
        zt = einops.rearrange(pre,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            # NOTE(review): the prediction ('sample') is not included here,
            # unlike AMDModel.sample's meta dict.
            return {'zi' : zi,
                    'zj' : zj,
                    }
        else:
            return zi,zt,zj

    def sample_with_refimg_motion(self,
                                ref_img:torch.Tensor,
                                motion=torch.Tensor,
                                sample_step:int = 10,
                                return_meta_info=False,
                                **kwargs,):
        """
        Reconstruct video latents from one reference image plus precomputed
        motion tokens, in a single transformer pass.

        Args:
            ref_img : (N,C,H,W)
            motion : (N,F,L,D)
                NOTE(review): ``motion=torch.Tensor`` defaults to the class
                object — almost certainly meant as the annotation
                ``motion:torch.Tensor``; callers must always pass it.
        Return:
            video : (N,T,C,H,W)
        """
        device = motion.device
        n,t,l,d = motion.shape

        start_step = self.scheduler.num_step

        refimg = ref_img.unsqueeze(1)
        source_motion = self.motion_encoder(refimg)

        # Broadcast single-frame source motion across the F target frames.
        source_motion = source_motion.repeat(1,t,1,1).flatten(0,1)
        target_motion = motion.flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # NOTE(review): time_step is computed but never used by the
        # reconstruction transformer below.
        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = refimg.repeat(1,t,1,1,1).flatten(0,1)
        zj = zi   # no ground truth available; zj mirrors the reference
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        zt = zt.to(zj.dtype)
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        zt = einops.rearrange(pre,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    }
        else:
            return zi,zt,zj

    def extract_motion(self,video:torch.tensor):
        """Encode a (N,T,C,H,W) clip into motion tokens; always applies the
        motion transformer when one is configured (no opt-out flag here,
        unlike AMDModel.extract_motion)."""

        motion = self.motion_encoder(video)

        if self.need_motion_transformer:
            motion = self.motion_transformer(motion)

        return motion
|
|
|
|
|
|
|
|
def AMD_S(**kwargs) -> AMDModel:
    """Build the small AMD diffusion model (8-layer encoder, 12 DiT layers)."""
    preset = dict(
        enc_num_layers=8,
        enc_nhead=8,
        enc_ndim=64,
        diffusion_attn_head_dim=64,
        diffusion_attn_num_heads=16,
        diffusion_out_channels=4,
        diffusion_num_layers=12,
    )
    return AMDModel(**preset, **kwargs)
|
|
|
|
|
def AMD_L(**kwargs) -> AMDModel:
    """Build the large AMD diffusion model (wider heads, 16 DiT layers)."""
    preset = dict(
        enc_num_layers=8,
        enc_nhead=16,
        enc_ndim=64,
        diffusion_attn_head_dim=96,
        diffusion_attn_num_heads=16,
        diffusion_out_channels=4,
        diffusion_num_layers=16,
    )
    return AMDModel(**preset, **kwargs)
|
|
|
|
|
def AMD_S_Rec(**kwargs) -> AMDModel:
    """Build the small single-pass reconstruction variant (AMDModel_Rec)."""
    preset = dict(
        enc_num_layers=8,
        enc_nhead=8,
        enc_ndim=64,
        diffusion_attn_head_dim=64,
        diffusion_attn_num_heads=16,
        diffusion_out_channels=4,
        diffusion_num_layers=12,
    )
    return AMDModel_Rec(**preset, **kwargs)
|
|
|
|
|
def AMD_S_RecSplit(**kwargs) -> AMDModel:
    """Build the small reconstruction variant with is_split=True.

    NOTE: AMDModel_Rec.__init__ does not declare an ``is_split`` parameter;
    the flag is absorbed by its **kwargs (and by register_to_config).
    """
    preset = dict(
        enc_num_layers=8,
        enc_nhead=8,
        enc_ndim=64,
        diffusion_attn_head_dim=64,
        diffusion_attn_num_heads=16,
        diffusion_out_channels=4,
        diffusion_num_layers=12,
        is_split=True,
    )
    return AMDModel_Rec(**preset, **kwargs)
|
|
|
|
|
|
|
|
# Registry mapping config-string model names to their factory functions.
AMD_models = {
    "AMD_S": AMD_S,
    "AMD_L": AMD_L,
    "AMD_S_Rec": AMD_S_Rec,
    "AMD_S_RecSplit" : AMD_S_RecSplit,
}