Spaces:

SteveZh
/

SATA

Running

zzysteve

Initial commit

5221c8c 11 days ago

12.1 kB

	# This code is based on https://github.com/openai/guided-diffusion
	"""
	Generate preprocessed posterior features (z) from text for an external decoder.

	Unlike generate.py, this script does not post-process the generated features.
	It saves the raw z features in a format that external decoders can read.
	"""
	import os
	import numpy as np
	import torch
	from argparse import ArgumentParser
	from utils.fixseed import fixseed
	from utils import dist_util
	from utils.model_util import create_gaussian_diffusion
	from utils.sampler_util import ClassifierFreeSampleModel
	from data_loaders.tensors import collate
	from model.mdm import MDM

	import json


	def sample_z_args():
	    """Build the command-line interface for z sampling and parse it.

	    Options are declared in a table and registered in order, so the help
	    output lists them in the same sequence as the table below.

	    Returns:
	        The namespace produced by ``ArgumentParser.parse_args()`` (which
	        reads the process command line).
	    """
	    option_table = (
	        # Model options
	        ("--model_path", dict(required=True, type=str,
	                              help="Path to trained model checkpoint")),
	        ("--use_ema", dict(action='store_true',
	                           help="Use EMA model if available")),
	        # Input options
	        ("--text_prompt", dict(default='', type=str,
	                               help="A single text prompt to generate")),
	        ("--input_text", dict(default='', type=str,
	                              help="Path to a text file with prompts (one per line)")),
	        # Sampling options
	        ("--num_samples", dict(default=1, type=int,
	                               help="Number of samples per prompt")),
	        ("--num_repetitions", dict(default=1, type=int,
	                                   help="Number of repetitions for each prompt")),
	        ("--motion_length", dict(default=6.0, type=float,
	                                 help="Motion length in seconds")),
	        ("--guidance_param", dict(default=2.5, type=float,
	                                  help="Classifier-free guidance scale")),
	        # Output options
	        ("--output_dir", dict(default='', type=str,
	                              help="Output directory for results")),
	        ("--save_individual", dict(action='store_true',
	                                   help="Save each sample as individual file")),
	        # Misc options
	        ("--seed", dict(default=10, type=int)),
	        ("--device", dict(default=0, type=int)),
	    )

	    cli = ArgumentParser(description='Sample z from text using trained MDM model')
	    for flag, spec in option_table:
	        cli.add_argument(flag, **spec)

	    return cli.parse_args()


	def load_model_from_checkpoint(model_path, device, use_ema=False):
	    """Rebuild an MDM model and its diffusion object from a checkpoint.

	    The training configuration is read from ``args.json`` in the same
	    directory as ``model_path`` and exposed as attributes of a plain object.

	    Args:
	        model_path: Path to the checkpoint file holding the weights.
	        device: Passed to ``torch.load`` as ``map_location``. The model is
	            NOT moved to this device here; the caller does that.
	        use_ema: When true and the checkpoint has a ``'model_avg'`` entry,
	            load those weights. If the entry is absent a warning is printed
	            and the regular weights are loaded instead.

	    Returns:
	        A ``(model, diffusion, args)`` tuple: the constructed ``MDM`` with
	        weights loaded non-strictly, the result of
	        ``create_gaussian_diffusion(args)``, and the object carrying the
	        ``args.json`` entries as attributes.

	    Raises:
	        FileNotFoundError: If ``args.json`` does not exist next to the
	            checkpoint.
	        AttributeError: If ``args.json`` lacks a key that is read without a
	            default (``dataset``, ``latent_dim``, ``layers``, ``arch``).
	    """
	    # The training config lives next to the checkpoint. JSON is UTF-8 by
	    # specification, so do not depend on the platform's locale encoding.
	    args_path = os.path.join(os.path.dirname(model_path), 'args.json')
	    with open(args_path, 'r', encoding='utf-8') as f:
	        model_args = json.load(f)

	    # Expose the config entries as attributes (args.dataset, args.arch, ...).
	    class Args:
	        pass

	    args = Args()
	    for k, v in model_args.items():
	        setattr(args, k, v)

	    # Determine conditioning mode.
	    if getattr(args, 'unconstrained', False):
	        cond_mode = 'no_cond'
	    elif args.dataset in ['kit', 'humanml', 'preprocessed_posterior']:
	        cond_mode = 'text'
	    else:
	        cond_mode = 'action'

	    model_kwargs = {
	        'modeltype': '',
	        'njoints': getattr(args, 'njoints', 512),
	        'nfeats': getattr(args, 'nfeats', 1),
	        'num_actions': 1,
	        'translation': True,
	        'pose_rep': 'rot6d',
	        'glob': True,
	        'glob_rot': True,
	        'latent_dim': args.latent_dim,
	        'ff_size': 1024,
	        'num_layers': args.layers,
	        'num_heads': 4,
	        'dropout': 0.1,
	        'activation': 'gelu',
	        'data_rep': 'hml_vec',
	        'cond_mode': cond_mode,
	        'cond_mask_prob': getattr(args, 'cond_mask_prob', 0.1),
	        'action_emb': 'tensor',
	        'arch': args.arch,
	        'emb_trans_dec': getattr(args, 'emb_trans_dec', False),
	        'clip_version': 'ViT-B/32',
	        'dataset': args.dataset,
	        'text_encoder_type': getattr(args, 'text_encoder_type', 'clip'),
	        'pos_embed_max_len': getattr(args, 'pos_embed_max_len', 5000),
	        'mask_frames': getattr(args, 'mask_frames', False),
	        'pred_len': getattr(args, 'pred_len', 0),
	        'context_len': getattr(args, 'context_len', 0),
	        'emb_policy': 'add',
	        'all_goal_joint_names': [],
	        'multi_target_cond': getattr(args, 'multi_target_cond', False),
	        'multi_encoder_type': getattr(args, 'multi_encoder_type', 'single'),
	        'target_enc_layers': getattr(args, 'target_enc_layers', 1),
	        'use_rot2xyz': False,
	    }

	    # Create the model without SMPL-backed xyz conversion; this script saves raw z only.
	    model = MDM(**model_kwargs)

	    # NOTE(security): torch.load unpickles the file; only load checkpoints
	    # that come from a trusted source.
	    checkpoint = torch.load(model_path, map_location=device)

	    if use_ema and 'model_avg' in checkpoint:
	        print("Loading EMA model weights...")
	        state_dict = checkpoint['model_avg']
	    else:
	        if use_ema:
	            # Make the fallback visible instead of silently ignoring the flag.
	            print("Warning: EMA weights were requested but the checkpoint has no "
	                  "'model_avg' entry; loading the regular model weights instead.")
	        # A checkpoint may wrap the weights under 'model' or be the state
	        # dict itself.
	        state_dict = checkpoint['model'] if 'model' in checkpoint else checkpoint

	    # Drop positional-encoding entries before the non-strict load below.
	    for key in ('sequence_pos_encoder.pe', 'embed_timestep.sequence_pos_encoder.pe'):
	        state_dict.pop(key, None)

	    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
	    print(f"Loaded model. Missing keys: {len(missing_keys)}, Unexpected keys: {len(unexpected_keys)}")

	    # Create diffusion.
	    diffusion = create_gaussian_diffusion(args)

	    return model, diffusion, args


	def _load_prompts(args):
	    """Return the text prompts selected by the parsed CLI arguments.

	    ``--text_prompt`` takes precedence and is repeated ``num_samples`` times.
	    Otherwise the non-blank lines of ``--input_text`` are used and
	    ``args.num_samples`` is overwritten with their count.

	    Raises:
	        FileNotFoundError: If ``--input_text`` points to a missing file.
	        ValueError: If neither option is given, or no prompt results.
	    """
	    if args.text_prompt != '':
	        prompts = [args.text_prompt] * args.num_samples
	    elif args.input_text != '':
	        # Explicit raise instead of assert: asserts are stripped under -O.
	        if not os.path.exists(args.input_text):
	            raise FileNotFoundError(f"Input text file not found: {args.input_text}")
	        with open(args.input_text, 'r') as f:
	            prompts = [line for line in (raw.strip() for raw in f) if line]
	        args.num_samples = len(prompts)
	    else:
	        raise ValueError("Must provide either --text_prompt or --input_text")

	    if not prompts:
	        # A batch of size zero would otherwise reach the sampler.
	        raise ValueError("No text prompts to sample: the input file has no non-empty "
	                         "lines or --num_samples is not positive")
	    return prompts


	def _default_output_dir(model_path, seed):
	    """Derive an output directory next to the checkpoint from its name and the seed."""
	    checkpoint_dir = os.path.dirname(model_path)
	    model_name = os.path.basename(checkpoint_dir)
	    niter = os.path.basename(model_path).replace('model', '').replace('.pt', '')
	    return os.path.join(checkpoint_dir, f'sampled_z_{model_name}_{niter}_seed{seed}')


	def _save_individual_samples(output_dir, samples, texts, lengths):
	    """Write one ``.npz`` (z + metadata) and one ``.npy`` (z only) per sample.

	    Returns the directory the files were written to.
	    """
	    individual_dir = os.path.join(output_dir, 'individual')
	    os.makedirs(individual_dir, exist_ok=True)

	    for idx, (z, text, length) in enumerate(zip(samples, texts, lengths)):
	        # Save npz with all metadata.
	        np.savez(os.path.join(individual_dir, f'sample_{idx:04d}.npz'),
	                 z=z,  # (T, D)
	                 text=np.array(text),
	                 length=np.array(length),
	                 )
	        # Save npy with z only.
	        np.save(os.path.join(individual_dir, f'z_{idx:04d}.npy'), z)  # (T, D)

	    return individual_dir


	def main():
	    """Sample raw z features from text prompts and write them to disk.

	    Steps: parse the CLI, seed the RNGs, load the model and diffusion from
	    the checkpoint, run ``diffusion.p_sample_loop`` once per repetition, and
	    save the stacked samples as ``results.npz`` plus ``sample_config.json``
	    (and optional per-sample files) in the output directory.

	    Returns:
	        The output directory path.

	    Raises:
	        FileNotFoundError: If ``--input_text`` points to a missing file.
	        ValueError: If no prompt source is given or no prompt results.
	    """
	    args = sample_z_args()
	    fixseed(args.seed)

	    dist_util.setup_dist(args.device)
	    device = dist_util.dev()

	    # Load text prompts.
	    texts = _load_prompts(args)
	    print(f"Loaded {len(texts)} text prompts")

	    # Load model.
	    print(f"Loading model from {args.model_path}...")
	    model, diffusion, model_args = load_model_from_checkpoint(
	        args.model_path, device, use_ema=args.use_ema
	    )
	    model.to(device)
	    model.eval()

	    # Wrap for classifier-free guidance unless the scale is exactly 1.
	    use_guidance = args.guidance_param != 1
	    if use_guidance:
	        model = ClassifierFreeSampleModel(model)

	    # Compute frame count.
	    fps = 20  # Default FPS
	    n_frames = int(args.motion_length * fps)

	    # Prepare inputs. The guidance wrapper may not expose njoints/nfeats
	    # itself, in which case they are read from the wrapped model.
	    batch_size = len(texts)
	    motion_shape = (batch_size, model.njoints if hasattr(model, 'njoints') else model.model.njoints,
	                    model.nfeats if hasattr(model, 'nfeats') else model.model.nfeats, n_frames)

	    print(f"Motion shape: {motion_shape}")

	    # Build model_kwargs.
	    collate_args = [{'inp': torch.zeros(n_frames), 'tokens': None, 'lengths': n_frames, 'text': txt}
	                    for txt in texts]
	    _, model_kwargs = collate(collate_args)
	    model_kwargs['y'] = {key: val.to(device) if torch.is_tensor(val) else val
	                         for key, val in model_kwargs['y'].items()}

	    # Add guidance scale.
	    if use_guidance:
	        model_kwargs['y']['scale'] = torch.ones(batch_size, device=device) * args.guidance_param

	    # Pre-encode text once when the model supports it, so it is not
	    # re-encoded at every step. This is inference only: no autograd graph.
	    actual_model = model.model if hasattr(model, 'model') else model
	    if hasattr(actual_model, 'encode_text') and 'text' in model_kwargs['y']:
	        print("Pre-encoding text...")
	        with torch.no_grad():
	            model_kwargs['y']['text_embed'] = actual_model.encode_text(model_kwargs['y']['text'])

	    # Sample.
	    all_samples = []
	    all_texts = []
	    all_lengths = []

	    sample_fn = diffusion.p_sample_loop

	    for rep_i in range(args.num_repetitions):
	        print(f"\n### Sampling [repetition #{rep_i + 1}/{args.num_repetitions}]")

	        sample = sample_fn(
	            model,
	            motion_shape,
	            clip_denoised=False,
	            model_kwargs=model_kwargs,
	            skip_timesteps=0,
	            init_image=None,
	            progress=True,
	            dump_steps=None,
	            noise=None,
	            const_noise=False,
	        )

	        # Do not post-process here; save raw z features directly.
	        # sample shape: (batch_size, njoints, nfeats, n_frames)
	        all_samples.append(sample.cpu().numpy())
	        all_texts.extend(texts)
	        all_lengths.extend([n_frames] * batch_size)

	        print(f"Generated {len(texts)} samples")

	    # Merge all results.
	    all_samples = np.concatenate(all_samples, axis=0)  # (N, D, 1, T)
	    all_lengths = np.array(all_lengths)

	    # Convert shape: (N, D, 1, T) -> (N, T, D).
	    # squeeze(2) requires the nfeats axis to have size 1.
	    all_samples = all_samples.squeeze(2)  # (N, D, T)
	    all_samples = all_samples.transpose(0, 2, 1)  # (N, T, D)

	    print(f"\nTotal samples: {all_samples.shape[0]}")
	    print(f"Sample shape: {all_samples.shape} # (N, T, D)")

	    # Save results.
	    if args.output_dir == '':
	        args.output_dir = _default_output_dir(args.model_path, args.seed)

	    os.makedirs(args.output_dir, exist_ok=True)

	    # Save the main result file in npz format.
	    results_path = os.path.join(args.output_dir, 'results.npz')
	    print(f"\nSaving results to {results_path}")
	    np.savez(results_path,
	             z=all_samples,  # (N, T, D)
	             texts=np.array(all_texts, dtype=str),  # (N,) string array
	             lengths=all_lengths,  # (N,)
	             motion_length=np.array(args.motion_length),
	             guidance_param=np.array(args.guidance_param),
	             model_path=np.array(args.model_path),
	             )

	    # Optionally save individual files.
	    if args.save_individual:
	        individual_dir = _save_individual_samples(
	            args.output_dir, all_samples, all_texts, all_lengths
	        )
	        print(f"Saved {all_samples.shape[0]} individual files to {individual_dir}")

	    # Save sampling config.
	    config_path = os.path.join(args.output_dir, 'sample_config.json')
	    with open(config_path, 'w') as f:
	        json.dump({
	            'model_path': args.model_path,
	            'num_samples': args.num_samples,
	            'num_repetitions': args.num_repetitions,
	            'motion_length': args.motion_length,
	            'guidance_param': args.guidance_param,
	            'seed': args.seed,
	            'use_ema': args.use_ema,
	            'sample_shape': list(all_samples.shape),
	        }, f, indent=4)

	    print(f"\nDone! Results saved to {args.output_dir}")
	    print("\nTo load the results in your decoder:")
	    print(" import numpy as np")
	    print(f" data = np.load('{results_path}')")
	    print(" z = data['z'] # shape: (N, T, D), dtype: float32")
	    print(" texts = data['texts'] # shape: (N,), dtype: str")
	    print(" lengths = data['lengths'] # shape: (N,), dtype: int")
	    print("\n # For individual samples:")
	    print(" single_z = z[0] # shape: (T, D)")

	    return args.output_dir


	# Script entry point: run the sampling pipeline only when this file is
	# executed directly, not when it is imported. The value returned by main()
	# is discarded here.
	if __name__ == "__main__":
	main()