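"""
Text-conditioned Motion Diffusion Model (MDM) denoiser.

The model embeds a CLIP-encoded text prompt together with the diffusion
timestep, prepends that token to the projected motion sequence, runs a
transformer encoder over it, and projects the result back to motion space
([bs, num_joints, num_features, num_frames] in, same shape out).
"""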
import clip
import numpy as np
import torch
import torch.nn as nn

from model.config import MDMConfig
from model.rotation2xyz import Rotation2xyz

class MDM(nn.Module):

    @classmethod
    def from_config(cls, config: MDMConfig):
        """
        Instantiate MDM from an MDMConfig object.
        """
        return cls(
            modeltype=config.modeltype,
            njoints=config.njoints,
            nfeats=config.nfeats,
            num_actions=config.num_actions,
            translation=config.translation,
            pose_rep=config.pose_rep,
            glob=config.glob,
            glob_rot=config.glob_rot,
            latent_dim=config.latent_dim,
            ff_size=config.ff_size,
            num_layers=config.layers,
            num_heads=config.num_heads,
            dropout=config.dropout,
            ablation=config.ablation,
            activation=config.activation,
            legacy=config.legacy,
            data_rep=config.data_rep,
            dataset=config.dataset,
            clip_dim=config.clip_dim,
            arch=config.arch,
            emb_trans_dec=config.emb_trans_dec,
            clip_version=config.clip_version,
            action_emb=config.action_emb,
            normalize_encoder_output=config.normalize_encoder_output,
            cond_mask_prob=config.cond_mask_prob,
            mask_frames=config.mask_frames,
            emb_policy=config.emb_policy,
            pos_embed_max_len=config.pos_embed_max_len,
            pred_len=config.pred_len,
            context_len=config.context_len,
            all_goal_joint_names=config.all_goal_joint_names,
            multi_target_cond=config.multi_target_cond,
            multi_encoder_type=config.multi_encoder_type,
            target_enc_layers=config.target_enc_layers,
        )

    def __init__(
        self,
        modeltype,
        njoints,
        nfeats,
        num_actions,
        translation,
        pose_rep,
        glob,
        glob_rot,
        latent_dim=256,
        ff_size=1024,
        num_layers=8,
        num_heads=4,
        dropout=0.1,
        ablation=None,
        activation="gelu",
        legacy=False,
        data_rep="rot6d",
        dataset="amass",
        clip_dim=512,
        arch="trans_enc",
        emb_trans_dec=False,
        clip_version=None,
        **kargs,
    ):
        super().__init__()
        self.legacy = legacy
        self.modeltype = modeltype
        self.njoints = njoints
        self.nfeats = nfeats
        self.num_actions = num_actions
        self.data_rep = data_rep
        self.dataset = dataset
        self.pose_rep = pose_rep
        self.glob = glob
        self.glob_rot = glob_rot
        self.translation = translation
        self.latent_dim = latent_dim
        self.ff_size = ff_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout
        self.ablation = ablation
        self.activation = activation
        self.clip_dim = clip_dim
        self.action_emb = kargs.get("action_emb", None)
        self.input_feats = self.njoints * self.nfeats
        self.normalize_output = kargs.get("normalize_encoder_output", False)
        self.cond_mode = kargs.get("cond_mode", "no_cond")
        self.cond_mask_prob = kargs.get("cond_mask_prob", 0.0)
        self.mask_frames = kargs.get("mask_frames", False)
        self.arch = arch
        self.emb_policy = kargs.get("emb_policy", "add")
        self.pred_len = kargs.get("pred_len", 0)
        self.context_len = kargs.get("context_len", 0)
        self.total_len = self.pred_len + self.context_len
        self.is_prefix_comp = self.total_len > 0
        self.all_goal_joint_names = kargs.get("all_goal_joint_names", [])
        self.multi_target_cond = kargs.get("multi_target_cond", False)
        self.text_encoder_type = kargs.get("text_encoder_type", "clip")
        # Assert the simplifying assumptions this implementation relies on:
        # a transformer-encoder backbone, CLIP text conditioning, additive
        # embedding, and the HumanML3D vector representation.
        assert self.arch == "trans_enc"
        assert self.cond_mode == "text"
        assert self.text_encoder_type == "clip"
        assert not self.multi_target_cond
        assert not self.is_prefix_comp
        assert self.emb_policy == "add"
        assert self.data_rep == "hml_vec"
        # Using the Encoder architecture
        transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.latent_dim,
            nhead=self.num_heads,
            dim_feedforward=self.ff_size,
            dropout=self.dropout,
            activation=self.activation,
        )
        self.seqTransEncoder = nn.TransformerEncoder(
            transformer_encoder_layer, num_layers=self.num_layers
        )
        self.sequence_pos_encoder = PositionalEncoding(
            self.latent_dim, self.dropout, max_len=kargs.get("pos_embed_max_len", 5000)
        )
        self.embed_timestep = TimestepEmbedder(
            self.latent_dim, self.sequence_pos_encoder
        )

        # We'll use CLIP for now
        self.clip_version = clip_version
        self.clip_model = load_and_freeze_clip(clip_version)
        self.encode_text = self.clip_encode_text
        self.embed_text = nn.Linear(self.clip_dim, self.latent_dim)

        # Linear input and output layers
        self.input_process = InputProcess(self.input_feats, self.latent_dim)
        self.output_process = OutputProcess(
            self.input_feats, self.latent_dim, self.njoints, self.nfeats
        )
        self.rot2xyz = Rotation2xyz(device="cpu", dataset=self.dataset)

    def parameters_wo_clip(self):
        return [
            p
            for name, p in self.named_parameters()
            if not name.startswith("clip_model.")
        ]

    def mask_cond(self, cond, force_mask=False):
        # Randomly zero out the condition during training (classifier-free
        # guidance style dropout); force_mask=True yields the null condition.
        bs = cond.shape[-2]
        if force_mask:
            return torch.zeros_like(cond)
        elif self.training and self.cond_mask_prob > 0.0:
            mask = torch.bernoulli(
                torch.ones(bs, device=cond.device) * self.cond_mask_prob
            ).view(1, bs, 1)  # 1 -> use null cond, 0 -> use real cond
            return cond * (1.0 - mask)
        else:
            return cond

    def clip_encode_text(self, raw_text):
        # raw_text - list (batch_size length) of strings with input text prompts
        device = next(self.parameters()).device
        max_text_len = (
            20 if self.dataset in ["kit", "humanml", "humanml_with_images"] else None
        )  # specific hardcoding for the HumanML/KIT datasets
        if max_text_len is not None:
            default_context_length = 77
            context_length = max_text_len + 2  # start_token + 20 + end_token
            assert context_length < default_context_length
            # [bs, context_length]; prompts longer than context_length are truncated
            texts = clip.tokenize(
                raw_text, context_length=context_length, truncate=True
            ).to(device)
            # pad back to CLIP's default context length of 77 tokens
            zero_pad = torch.zeros(
                [texts.shape[0], default_context_length - context_length],
                dtype=texts.dtype,
                device=texts.device,
            )
            texts = torch.cat([texts, zero_pad], dim=1)
        else:
            # [bs, 77]; prompts longer than 77 tokens are truncated
            texts = clip.tokenize(raw_text, truncate=True).to(device)
        return self.clip_model.encode_text(texts).float().unsqueeze(0)  # [1, bs, clip_dim]

    def motion_to_sequence(self, motion, timesteps, y):
        if "text_embed" not in y:
            clip_encoded_text = self.encode_text(y["text"])
        else:
            clip_encoded_text = y["text_embed"]
        # force_mask=True requests the unconditional (null-text) branch,
        # e.g. for classifier-free guidance sampling
        force_mask = y.get("uncond", False)
        # [1, bs, latent_dim]
        text_embedding = self.embed_text(
            self.mask_cond(clip_encoded_text, force_mask=force_mask)
        )
        # compute the embedding of the timestep + text, z_tk in the paper
        time_embedding = self.embed_timestep(timesteps)  # [1, bs, latent_dim]
        embedding = text_embedding + time_embedding  # [1, bs, latent_dim]
        # get the motion into latent space
        sequence = self.input_process(motion)  # [num_frames, bs, latent_dim]
        sequence_plus_emb = torch.cat(
            (embedding, sequence), dim=0
        )  # [num_frames + 1, bs, latent_dim]
        return sequence_plus_emb

    def sequence_to_motion(self, sequence_plus_emb):
        # drop the conditioning token (z_tk in the paper) from the sequence
        sequence = sequence_plus_emb[1:]  # [num_frames, bs, latent_dim]
        # map the latent sequence back to motion space
        motion = self.output_process(
            sequence
        )  # [bs, num_joints, num_features, num_frames]
        return motion

    def prepare_mask(self, sequence, device, y, bs):
        # Don't use mask with the generate script
        is_valid_mask = y["mask"].shape[-1] > 1
        if self.mask_frames and is_valid_mask:
            # invert the validity mask: True marks padded frames to be ignored
            frames_mask = torch.logical_not(
                y["mask"][..., : sequence.shape[0]].squeeze(1).squeeze(1)
            ).to(device=device)
            # the conditioning token is never masked
            step_mask = torch.zeros((bs, 1), dtype=torch.bool, device=device)
            return torch.cat([step_mask, frames_mask], dim=1)
        else:
            return None

    def forward(self, motion, timesteps, y=None):
        """
        motion: [bs, num_joints, num_features, num_frames]
        timesteps: [bs]
        y: conditioning dict with at least "text" (or "text_embed") and "mask"
        returns: [bs, num_joints, num_features, num_frames]
        """
        sequence = self.motion_to_sequence(motion, timesteps, y)
        # apply positional encoding
        sequence = self.sequence_pos_encoder(
            sequence
        )  # [num_frames + 1, bs, latent_dim]
        frames_mask = self.prepare_mask(sequence, motion.device, y, motion.shape[0])
        # actual transformer magic
        sequence = self.seqTransEncoder(sequence, src_key_padding_mask=frames_mask)
        motion = self.sequence_to_motion(sequence)
        return motion

    def _apply(self, fn):
        super()._apply(fn)
        # keep the SMPL body model in sync with device/dtype changes
        self.rot2xyz.smpl_model._apply(fn)
        return self

    def train(self, *args, **kwargs):
        super().train(*args, **kwargs)
        self.rot2xyz.smpl_model.train(*args, **kwargs)
        return self


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # standard sinusoidal position table, stored as a [max_len, 1, d_model] buffer
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: [num_frames, bs, d_model]
        x = x + self.pe[: x.shape[0], :]
        return self.dropout(x)
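
# Shape sketch (illustrative only, not part of the original module): the `pe`
# buffer is [max_len, 1, d_model], so it broadcasts over the batch dimension of
# a [num_frames, bs, d_model] input:
#
#     pos_enc = PositionalEncoding(d_model=256, dropout=0.1)
#     x = torch.zeros(60, 4, 256)        # [num_frames, bs, latent_dim]
#     assert pos_enc(x).shape == (60, 4, 256)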


class TimestepEmbedder(nn.Module):
    def __init__(self, latent_dim, sequence_pos_encoder):
        super().__init__()
        self.sequence_pos_encoder = sequence_pos_encoder
        self.time_embed = nn.Sequential(
            nn.Linear(latent_dim, latent_dim),
            nn.SiLU(),
            nn.Linear(latent_dim, latent_dim),
        )

    def forward(self, timesteps):
        return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)
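
# Shape sketch (illustrative only): the timestep embedder indexes the shared
# sinusoidal table, so integer timesteps of shape [bs] become embeddings of
# shape [1, bs, latent_dim] after the MLP and permute:
#
#     embedder = TimestepEmbedder(256, PositionalEncoding(256))
#     t = torch.randint(0, 1000, (4,))   # [bs]
#     assert embedder(t).shape == (1, 4, 256)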


class InputProcess(nn.Module):
    """
    Applies the linear layer on the motion sequence at the beginning of the MDM.
    Also changes the shape:
    [bs, num_joints, num_features, num_frames] -> [num_frames, bs, latent_dim]
    """

    def __init__(self, input_feats, latent_dim):
        super().__init__()
        self.poseEmbedding = nn.Linear(input_feats, latent_dim)

    def forward(self, sequence):
        bs, num_joints, num_features, num_frames = sequence.shape
        sequence = sequence.permute((3, 0, 1, 2)).reshape(
            num_frames, bs, num_joints * num_features
        )
        sequence = self.poseEmbedding(sequence)
        return sequence


class OutputProcess(nn.Module):
    """
    Applies the linear layer on the motion sequence at the end of the MDM.
    Also changes the shape:
    [num_frames, bs, latent_dim] -> [bs, num_joints, num_features, num_frames]
    """

    def __init__(self, input_feats, latent_dim, num_joints, num_features):
        super().__init__()
        self.input_feats = input_feats
        self.latent_dim = latent_dim
        self.num_joints = num_joints
        self.num_features = num_features
        self.poseFinal = nn.Linear(latent_dim, input_feats)

    def forward(self, sequence):
        num_frames, bs, _ = sequence.shape
        sequence = self.poseFinal(sequence)
        sequence = sequence.reshape(
            num_frames, bs, self.num_joints, self.num_features
        ).permute(1, 2, 3, 0)
        return sequence


def load_and_freeze_clip(clip_version):
    # Must set jit=False for training
    clip_model, clip_preprocess = clip.load(clip_version, device="cpu", jit=False)

    # Freeze CLIP weights
    clip_model.eval()
    clip_model.requires_grad_(False)
    return clip_model
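

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the training pipeline. Assumptions:
    # the CLIP weights for "ViT-B/32" can be downloaded, the SMPL assets needed
    # by Rotation2xyz are available locally, and njoints=263 / nfeats=1 matches
    # the HumanML3D "hml_vec" layout used elsewhere in this repo.
    model = MDM(
        modeltype="",
        njoints=263,
        nfeats=1,
        num_actions=1,
        translation=True,
        pose_rep="rot6d",
        glob=True,
        glob_rot=True,
        data_rep="hml_vec",
        dataset="humanml",
        clip_version="ViT-B/32",
        cond_mode="text",
    )
    bs, num_frames = 2, 60
    motion = torch.randn(bs, 263, 1, num_frames)
    timesteps = torch.randint(0, 1000, (bs,))
    y = {
        "text": ["a person walks forward"] * bs,
        "mask": torch.ones(bs, 1, 1, num_frames, dtype=torch.bool),
    }
    out = model(motion, timesteps, y)
    print(out.shape)  # expected: torch.Size([2, 263, 1, 60])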