| |
|
|
| |
| |
|
|
| """ |
| Sample new videos from a pre-trained Latte model and save them as MP4 files. |
| """ |
| import os |
| import sys |
|
|
| from accelerate import Accelerator |
| from tqdm import tqdm |
|
|
| from opensora.dataset import ae_denorm |
| from opensora.models.ae import ae_channel_config, getae, ae_stride_config |
| from opensora.models.ae.videobase import CausalVQVAEModelWrapper |
| from opensora.models.diffusion import Diffusion_models |
| from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion |
| from opensora.models.diffusion.latte.modeling_latte import Latte |
| from opensora.utils.utils import find_model |
|
|
| import torch |
| import argparse |
|
|
| from einops import rearrange |
| import imageio |
|
|
# Allow TF32 for CUDA matmuls and cuDNN ops (module-level side effect applied at import time).
| torch.backends.cuda.matmul.allow_tf32 = True |
| torch.backends.cudnn.allow_tf32 = True |
|
|
def main(args):
    """Sample videos from a pre-trained Latte checkpoint and save them as MP4 files.

    Args:
        args: Parsed command-line namespace. Reads ``mixed_precision``,
            ``cfg_scale``, ``image_size``, ``ae``, ``ckpt``,
            ``num_sampling_steps``, ``num_frames``, ``num_sample``,
            ``train_classcondition``, ``num_classes``, ``sample_method``,
            ``save_video_path`` and ``fps``. The whole namespace is also
            passed to ``getae``. ``args.latent_size`` is set as a side effect.

    Raises:
        RuntimeError: If no CUDA device is available.
        ValueError: If ``args.sample_method`` is neither ``'ddim'`` nor
            ``'ddpm'``.
    """
    # Sampling never needs gradients; this disables autograd globally.
    torch.set_grad_enabled(False)
    if not torch.cuda.is_available():
        raise RuntimeError("Sampling currently requires at least one GPU.")
    # Fail fast instead of hitting an unbound `samples` after loading the model.
    if args.sample_method not in ('ddim', 'ddpm'):
        raise ValueError(
            f"Unknown sample_method {args.sample_method!r}; expected 'ddim' or 'ddpm'."
        )

    accelerator = Accelerator(mixed_precision=args.mixed_precision)
    device = accelerator.device

    # Classifier-free guidance is only applied for class-conditional models
    # (see the branch inside the loop); otherwise cfg_scale is ignored.
    using_cfg = args.cfg_scale > 1.0

    # Stride entry 0 is used for the frame axis, entries 1 and 2 for the two
    # spatial axes of the latent.
    strides = ae_stride_config[args.ae]
    latent_size = (args.image_size // strides[1], args.image_size // strides[2])
    args.latent_size = latent_size

    model = Latte.from_pretrained(args.ckpt, subfolder="model").to(device)
    model.eval()
    model = accelerator.prepare(model)
    # `prepare` only wraps the model (exposing `.module`) in distributed runs;
    # `unwrap_model` returns the underlying Latte instance in every setup.
    base_model = accelerator.unwrap_model(model)

    diffusion = create_diffusion(str(args.num_sampling_steps))
    if args.sample_method == 'ddim':
        sample_loop = diffusion.ddim_sample_loop
    else:
        sample_loop = diffusion.p_sample_loop

    ae = getae(args).to(device)
    video_length = args.num_frames // strides[0]
    if isinstance(ae, CausalVQVAEModelWrapper):
        # CausalVQVAE wrappers use one additional latent frame
        # (presumably the leading causal frame -- confirm against the AE).
        video_length += 1

    # Create the output directory once, before any expensive sampling.
    os.makedirs(args.save_video_path, exist_ok=True)

    for i in tqdm(range(args.num_sample)):
        # Initial noise for a single video in latent space.
        z = torch.randn(1, base_model.in_channels, video_length, latent_size[0], latent_size[1], device=device)

        cls_id = None
        if args.train_classcondition:
            # Draw one random class label per sample.
            y = torch.randint(0, args.num_classes, (1,), device=device)
            cls_id = str(int(y.detach().cpu()))
            if using_cfg:
                # Duplicate the noise and pair the drawn label with the extra
                # index `num_classes`, used here as the null label.
                z = torch.cat([z, z], 0)
                y_null = torch.tensor([args.num_classes], device=device)
                y = torch.cat([y, y_null], dim=0)
                model_kwargs = dict(class_labels=y, cfg_scale=args.cfg_scale)
                sample_fn = base_model.forward_with_cfg
            else:
                model_kwargs = dict(class_labels=y)
                sample_fn = model.forward
        else:
            model_kwargs = dict(class_labels=None)
            sample_fn = model.forward

        samples = sample_loop(
            sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
        )

        # Gradients are already disabled globally, so no extra no_grad() is needed.
        samples = ae.decode(samples)

        # De-normalise to [0, 255] uint8 and move channels last
        # (presumably (T, C, H, W) -> (T, H, W, C) -- confirm decoder layout).
        video_ = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()
        if args.train_classcondition:
            video_save_path = os.path.join(args.save_video_path, f"sample_{i:03d}_cls" + str(cls_id) + '.mp4')
        else:
            video_save_path = os.path.join(args.save_video_path, f"sample_{i:03d}" + '.mp4')
        print(video_save_path)
        imageio.mimwrite(video_save_path, video_, fps=args.fps, quality=9)
        print('save path {}'.format(args.save_video_path))
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: declare every option in one table, register
    # them in order, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser()
    option_table = (
        ("--ckpt", {"type": str, "default": ""}),
        ("--model", {"type": str, "default": 'Latte-XL/122'}),
        ("--ae", {"type": str, "default": 'stabilityai/sd-vae-ft-mse'}),
        ("--save_video_path", {"type": str, "default": "./sample_videos/"}),
        ("--fps", {"type": int, "default": 10}),
        ("--num_classes", {"type": int, "default": 101}),
        ("--num_frames", {"type": int, "default": 16}),
        ("--image_size", {"type": int, "default": 256}),
        ("--train_classcondition", {"action": "store_true"}),
        ("--num_sampling_steps", {"type": int, "default": 250}),
        ("--num_sample", {"type": int, "default": 1}),
        ("--cfg_scale", {"type": float, "default": 1.0}),
        ("--sample_method", {"type": str, "default": 'ddpm'}),
        ("--mixed_precision", {"type": str, "default": None, "choices": [None, "fp16", "bf16"]}),
        ("--attention_mode", {"type": str, "choices": ['xformers', 'math', 'flash'], "default": "math"}),
    )
    for option_name, option_kwargs in option_table:
        parser.add_argument(option_name, **option_kwargs)
    args = parser.parse_args()
    main(args)
|
|