yongqiang

initialize this repo

ba96580 3 months ago

10.8 kB

	import os
	import sys

	import numpy as np
	import torch
	from diffusers import FlowMatchEulerDiscreteScheduler
	from diffusers.utils import export_to_video
	from omegaconf import OmegaConf
	from PIL import Image

	# Make the project importable when this script is run directly: the script's
	# own directory and its two nearest ancestors are put on sys.path so that the
	# `videox_fun` package imported below can be resolved.
	current_file_path = os.path.abspath(__file__)
	script_dir = os.path.dirname(current_file_path)
	project_roots = [
	    script_dir,
	    os.path.dirname(script_dir),
	    os.path.dirname(os.path.dirname(script_dir)),
	]
	for project_root in project_roots:
	    # Each new entry goes to the front of sys.path; entries that are already
	    # present are left where they are.
	    if project_root not in sys.path:
	        sys.path.insert(0, project_root)

	from diffusers.schedulers.scheduling_unipc_multistep import \
	UniPCMultistepScheduler

	from videox_fun.dist import set_multi_gpus_devices, shard_model
	from videox_fun.models import (AutoencoderKLHunyuanVideo, CLIPTextModel, CLIPImageProcessor,
	CLIPTokenizer, HunyuanVideoTransformer3DModel,
	LlavaForConditionalGeneration, LlamaTokenizerFast)
	from videox_fun.models.cache_utils import get_teacache_coefficients
	from videox_fun.pipeline import HunyuanVideoPipeline, HunyuanVideoI2VPipeline
	from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler
	from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
	from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8,
	convert_weight_dtype_wrapper,
	replace_parameters_by_name)
	from videox_fun.utils.lora_utils import merge_lora, unmerge_lora
	from videox_fun.utils.utils import (filter_kwargs, get_image_to_video_latent,
	save_videos_grid)
	from videox_fun.utils.utils import get_image

	# ---------------------------------------------------------------------------
	# User configuration. Everything in this section is read by the script below.
	# ---------------------------------------------------------------------------

	# GPU memory mode, which can be chosen in [model_full_load, model_full_load_and_qfloat8, model_cpu_offload, model_cpu_offload_and_qfloat8, sequential_cpu_offload].
	# model_full_load means that the entire model will be moved to the GPU.
	#
	# model_full_load_and_qfloat8 means that the entire model will be moved to the GPU,
	# and the transformer model is quantized to float8, which can save more GPU memory.
	#
	# model_cpu_offload means that the entire model will be moved to the CPU after use, which can save some GPU memory.
	#
	# model_cpu_offload_and_qfloat8 means that the entire model will be moved to the CPU after use,
	# and the transformer model is quantized to float8, which can save more GPU memory.
	#
	# sequential_cpu_offload means that each layer of the model will be moved to the CPU after use,
	# resulting in slower speeds but saving a large amount of GPU memory.
	#
	# Any value other than the four non-default names above falls through to the
	# plain "move the whole pipeline to the device" path further down.
	GPU_memory_mode = "sequential_cpu_offload"
	# Multi-GPU config.
	# Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used.
	# For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4.
	# If you are using 1 GPU, you can set ulysses_degree = 1 and ring_degree = 1.
	ulysses_degree = 1
	ring_degree = 1
	# Use FSDP to save more GPU memory when running on multiple GPUs.
	fsdp_dit = False
	fsdp_text_encoder = True
	# Compiling gives a speedup at a fixed resolution and needs a little extra GPU memory.
	# compile_dit is not compatible with fsdp_dit or sequential_cpu_offload.
	compile_dit = False

	# Model path: a directory whose sub-folders (transformer, vae, tokenizer,
	# text_encoder, tokenizer_2, text_encoder_2, image_processor, scheduler)
	# are loaded below.
	model_name = "models/Diffusion_Transformer/HunyuanVideo-I2V"

	# Choose the sampler from "Flow", "Flow_Unipc", "Flow_DPM++".
	sampler_name = "Flow"

	# Optional extra weights. When set, transformer_path / vae_path are loaded on
	# top of the pretrained weights with strict=False, and lora_path is merged
	# into the pipeline with strength lora_weight before inference.
	transformer_path = None
	vae_path = None
	lora_path = None

	# Other params
	# sample_size is [height, width]; video_length is the requested number of
	# frames (it is rounded further down using the VAE temporal compression
	# ratio); fps is used when the result is written as a video.
	sample_size = [480, 832]
	video_length = 81
	fps = 16

	# Use torch.float16 if the GPU does not support torch.bfloat16.
	# Some graphics cards, such as V100 and 2080 Ti, do not support torch.bfloat16.
	weight_dtype = torch.bfloat16
	# Path of the input image; it is loaded with get_image() and passed to the
	# image-to-video pipeline.
	# NOTE(review): the original comment suggested setting validation_image_start = None
	# (together with a validation_image_end that this script does not define) to
	# generate from text only. This script always calls get_image(validation_image_start),
	# so confirm that None is supported before relying on it.
	validation_image_start = "asset/1.png"

	# Prompts and sampling parameters.
	# guidance_scale is passed to the pipeline as true_cfg_scale; seed initialises
	# the torch.Generator used for sampling.
	prompt = "The dog is shaking head. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
	negative_prompt = "The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. "
	guidance_scale = 1.0
	seed = 43
	num_inference_steps = 40
	lora_weight = 0.55
	# Output directory; results are written there as zero-padded, numbered files.
	save_path = "samples/hunyuanvideo-videos-i2v"

	def _load_checkpoint_into(model, checkpoint_path):
	    """Overlay the weights stored at *checkpoint_path* onto *model*.

	    Files whose name ends with "safetensors" are read with
	    safetensors.torch.load_file; anything else goes through torch.load on
	    the CPU. If the loaded mapping has a "state_dict" entry, that entry is
	    used as the actual state dict. Loading is non-strict, and the number of
	    missing and unexpected keys is printed.
	    """
	    print(f"From checkpoint: {checkpoint_path}")
	    if checkpoint_path.endswith("safetensors"):
	        from safetensors.torch import load_file
	        state_dict = load_file(checkpoint_path)
	    else:
	        # NOTE(security): torch.load unpickles the file; only point this at
	        # checkpoints from a trusted source.
	        state_dict = torch.load(checkpoint_path, map_location="cpu")
	    if "state_dict" in state_dict:
	        state_dict = state_dict["state_dict"]

	    missing, unexpected = model.load_state_dict(state_dict, strict=False)
	    print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")


	# Resolve the device for the configured ulysses/ring degrees.
	device = set_multi_gpus_devices(ulysses_degree, ring_degree)

	# Get Transformer
	transformer = HunyuanVideoTransformer3DModel.from_pretrained(
	    os.path.join(model_name, 'transformer'),
	    low_cpu_mem_usage=True,
	    torch_dtype=weight_dtype,
	)

	# Optionally replace the pretrained transformer weights with a custom checkpoint.
	if transformer_path is not None:
	    _load_checkpoint_into(transformer, transformer_path)

	# Get Vae
	vae = AutoencoderKLHunyuanVideo.from_pretrained(
	    os.path.join(model_name, 'vae')
	).to(weight_dtype)

	# Optionally replace the pretrained VAE weights with a custom checkpoint.
	if vae_path is not None:
	    _load_checkpoint_into(vae, vae_path)

	# The remaining pipeline components each live in their own sub-folder of
	# model_name; resolve the folders first, then load the components.
	tokenizer_dir = os.path.join(model_name, 'tokenizer')
	text_encoder_dir = os.path.join(model_name, 'text_encoder')
	tokenizer_2_dir = os.path.join(model_name, 'tokenizer_2')
	text_encoder_2_dir = os.path.join(model_name, 'text_encoder_2')
	image_processor_dir = os.path.join(model_name, 'image_processor')

	# First tokenizer / text encoder pair (Llama tokenizer + LLaVA model).
	tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_dir)
	text_encoder = LlavaForConditionalGeneration.from_pretrained(
	    text_encoder_dir,
	    low_cpu_mem_usage=True,
	    torch_dtype=weight_dtype,
	)

	# Second tokenizer / text encoder pair (CLIP).
	tokenizer_2 = CLIPTokenizer.from_pretrained(tokenizer_2_dir)
	text_encoder_2 = CLIPTextModel.from_pretrained(
	    text_encoder_2_dir,
	    low_cpu_mem_usage=True,
	    torch_dtype=weight_dtype,
	)

	# CLIP image processor handed to the pipeline alongside the encoders.
	image_processor = CLIPImageProcessor.from_pretrained(image_processor_dir)

	# Get Scheduler
	# Map each supported sampler name to its scheduler class. The mapping is
	# kept in scheduler_dict and the selected class in Chosen_Scheduler.
	scheduler_dict = {
	    "Flow": FlowMatchEulerDiscreteScheduler,
	    "Flow_Unipc": FlowUniPCMultistepScheduler,
	    "Flow_DPM++": FlowDPMSolverMultistepScheduler,
	}
	if sampler_name not in scheduler_dict:
	    # Fail with the list of valid names instead of a bare KeyError.
	    raise ValueError(
	        f"Unknown sampler_name {sampler_name!r}; choose one of {sorted(scheduler_dict)}"
	    )
	Chosen_Scheduler = scheduler_dict[sampler_name]
	scheduler = Chosen_Scheduler.from_pretrained(
	    os.path.join(model_name, 'scheduler'),
	)

	# Get Pipeline
	# Assemble the image-to-video pipeline from the components loaded above.
	pipeline = HunyuanVideoI2VPipeline(
	    transformer=transformer,
	    vae=vae,
	    tokenizer=tokenizer,
	    text_encoder=text_encoder,
	    tokenizer_2=tokenizer_2,
	    text_encoder_2=text_encoder_2,
	    scheduler=scheduler,
	    image_processor=image_processor,
	)

	# Multi-GPU setup: only entered when more than one GPU is configured.
	# The FSDP options are applied inside this branch, i.e. they have no effect
	# for ulysses_degree == ring_degree == 1.
	if ulysses_degree > 1 or ring_degree > 1:
	    from functools import partial

	    transformer.enable_multi_gpus_inference()
	    if fsdp_dit:
	        # Shard the transformer, wrapping both of its block lists.
	        shard_fn = partial(
	            shard_model,
	            device_id=device,
	            param_dtype=weight_dtype,
	            module_to_wrapper=list(transformer.transformer_blocks) + list(transformer.single_transformer_blocks),
	        )
	        pipeline.transformer = shard_fn(pipeline.transformer)
	        print("Add FSDP DIT")
	    if fsdp_text_encoder:
	        # Shard the first text encoder, wrapping its language-model layers.
	        shard_fn = partial(
	            shard_model,
	            device_id=device,
	            param_dtype=weight_dtype,
	            module_to_wrapper=text_encoder.language_model.layers,
	        )
	        pipeline.text_encoder = shard_fn(pipeline.text_encoder)
	        print("Add FSDP TEXT ENCODER")

	# Optionally wrap every transformer block with torch.compile.
	if compile_dit:
	    # The FSDP setup in this script addresses the transformer's blocks as
	    # `transformer_blocks` and `single_transformer_blocks`, so compile those;
	    # a plain `blocks` list is still honoured for models that expose one.
	    compiled_block_lists = 0
	    for block_list_name in ("transformer_blocks", "single_transformer_blocks", "blocks"):
	        block_list = getattr(pipeline.transformer, block_list_name, None)
	        if block_list is None:
	            continue
	        # Replace each block in place; the list length does not change.
	        for block_index in range(len(block_list)):
	            block_list[block_index] = torch.compile(block_list[block_index])
	        compiled_block_lists += 1
	    if compiled_block_lists == 0:
	        raise AttributeError(
	            "compile_dit is enabled but the transformer exposes none of "
	            "'transformer_blocks', 'single_transformer_blocks' or 'blocks'"
	        )
	    print("Add Compile")

	# Place the pipeline according to GPU_memory_mode.
	if GPU_memory_mode == "sequential_cpu_offload":
	    pipeline.enable_sequential_cpu_offload(device=device)
	else:
	    # Both "*_and_qfloat8" modes first convert the transformer weights to
	    # float8, skipping the listed modules, before the pipeline is placed.
	    if GPU_memory_mode in ("model_cpu_offload_and_qfloat8", "model_full_load_and_qfloat8"):
	        convert_model_weight_to_float8(transformer, exclude_module_name=["x_embedder", "context_embedder", "time_text_embed", "rope", "proj_out"], device=device)
	        convert_weight_dtype_wrapper(transformer, weight_dtype)

	    if GPU_memory_mode in ("model_cpu_offload_and_qfloat8", "model_cpu_offload"):
	        pipeline.enable_model_cpu_offload(device=device)
	    else:
	        # "model_full_load", "model_full_load_and_qfloat8" and any
	        # unrecognised value: keep the whole pipeline on the device.
	        pipeline.to(device=device)

	# Seeded generator so that runs with the same seed are repeatable.
	generator = torch.Generator(device=device).manual_seed(seed)

	# Merge the optional LoRA into the pipeline before sampling.
	if lora_path is not None:
	    pipeline = merge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)

	with torch.no_grad():
	    # Round the frame count down to k * temporal_compression_ratio + 1;
	    # a single-frame request is left at 1.
	    temporal_ratio = vae.config.temporal_compression_ratio
	    if video_length != 1:
	        video_length = int((video_length - 1) // temporal_ratio * temporal_ratio) + 1
	    # Number of latent frames for that length (not used further in this script).
	    latent_frames = (video_length - 1) // temporal_ratio + 1

	    # Load the conditioning image.
	    image = get_image(validation_image_start)

	    sample = pipeline(
	        prompt,
	        image=image,
	        num_frames=video_length,
	        negative_prompt=negative_prompt,
	        height=sample_size[0],
	        width=sample_size[1],
	        generator=generator,
	        true_cfg_scale=guidance_scale,
	        num_inference_steps=num_inference_steps,
	    ).videos

	# Undo the LoRA merge so the pipeline weights are back to their base values.
	if lora_path is not None:
	    pipeline = unmerge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)

	def save_results():
	    """Write the generated `sample` into `save_path`.

	    The output file is named with a zero-padded, 8-digit index equal to the
	    number of entries already in `save_path` plus one. A single-frame
	    result (`video_length == 1`) is saved as a PNG; anything longer is
	    written as an MP4 through `save_videos_grid` at `fps` frames per second.

	    Reads the module-level `sample`, `save_path`, `video_length` and `fps`.
	    """
	    # exist_ok=True already covers the "directory exists" case.
	    os.makedirs(save_path, exist_ok=True)

	    index = len(os.listdir(save_path)) + 1
	    prefix = str(index).zfill(8)
	    if video_length == 1:
	        image_path = os.path.join(save_path, prefix + ".png")

	        # First batch item, first frame.
	        # assumes sample is (batch, channels, frames, height, width) with
	        # values in [0, 1] — TODO confirm against the pipeline output.
	        frame = sample[0, :, 0]
	        # Move the leading (channel) axis to the end for PIL.
	        frame = frame.transpose(0, 1).transpose(1, 2)
	        # .numpy() only works for CPU tensors and has no bfloat16 support,
	        # so normalise device and dtype first (no-ops for CPU float32).
	        frame = (frame.cpu().float() * 255).numpy().astype(np.uint8)
	        Image.fromarray(frame).save(image_path)
	    else:
	        video_path = os.path.join(save_path, prefix + ".mp4")
	        save_videos_grid(sample, video_path, fps=fps)

	# Persist the result. In a multi-GPU run only rank 0 writes the output,
	# so the processes do not each save their own copy.
	if ulysses_degree * ring_degree > 1:
	    import torch.distributed as dist

	    if dist.get_rank() == 0:
	        save_results()
	else:
	    save_results()