| import os |
| import sys |
|
|
| import numpy as np |
| import torch |
| from diffusers import FlowMatchEulerDiscreteScheduler |
| from diffusers.utils import export_to_video |
| from omegaconf import OmegaConf |
| from PIL import Image |
|
|
# Make the repository importable regardless of where the script is launched
# from: the script's own directory and the two directories above it are put
# at the front of ``sys.path`` (unless they are already listed there).
current_file_path = os.path.abspath(__file__)
project_roots = [os.path.dirname(current_file_path)]
for _ in range(2):
    # Each further entry is the parent of the previously collected directory.
    project_roots.append(os.path.dirname(project_roots[-1]))
for project_root in project_roots:
    # The insert is a side effect, so use a statement rather than a
    # conditional expression whose value is thrown away.
    if project_root not in sys.path:
        sys.path.insert(0, project_root)
|
|
| from diffusers.schedulers.scheduling_unipc_multistep import \ |
| UniPCMultistepScheduler |
|
|
| from videox_fun.dist import set_multi_gpus_devices, shard_model |
| from videox_fun.models import (AutoencoderKLHunyuanVideo, CLIPTextModel, CLIPImageProcessor, |
| CLIPTokenizer, HunyuanVideoTransformer3DModel, |
| LlavaForConditionalGeneration, LlamaTokenizerFast) |
| from videox_fun.models.cache_utils import get_teacache_coefficients |
| from videox_fun.pipeline import HunyuanVideoPipeline, HunyuanVideoI2VPipeline |
| from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler |
| from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler |
| from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, |
| convert_weight_dtype_wrapper, |
| replace_parameters_by_name) |
| from videox_fun.utils.lora_utils import merge_lora, unmerge_lora |
| from videox_fun.utils.utils import (filter_kwargs, get_image_to_video_latent, |
| save_videos_grid) |
| from videox_fun.utils.utils import get_image |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# ----------------------------------------------------------------------------
# Run configuration
# ----------------------------------------------------------------------------

# How the pipeline is placed on the device later in this script:
#   "sequential_cpu_offload"         -> pipeline.enable_sequential_cpu_offload
#   "model_cpu_offload"              -> pipeline.enable_model_cpu_offload
#   "model_cpu_offload_and_qfloat8"  -> float8 transformer weights + model offload
#   "model_full_load_and_qfloat8"    -> float8 transformer weights + full load
#   any other value                  -> pipeline.to(device)
GPU_memory_mode = "sequential_cpu_offload"

# Parallelism degrees handed to set_multi_gpus_devices(); the multi-GPU code
# path is taken as soon as either of them is greater than 1.
ulysses_degree, ring_degree = 1, 1

# FSDP sharding switches for the transformer / text encoder. They are only
# looked at on the multi-GPU code path.
fsdp_dit = False
fsdp_text_encoder = True

# When True, the transformer blocks are wrapped with torch.compile.
compile_dit = False

# Model directory; its sub-folders (transformer, vae, tokenizer, text_encoder,
# tokenizer_2, text_encoder_2, image_processor, scheduler) are loaded from it.
model_name = "models/Diffusion_Transformer/HunyuanVideo-I2V"

# Scheduler key: one of "Flow", "Flow_Unipc", "Flow_DPM++".
sampler_name = "Flow"

# Optional overrides. A non-None transformer/VAE path is loaded on top of the
# pretrained weights; a non-None LoRA path is merged before sampling and
# unmerged afterwards.
transformer_path = vae_path = lora_path = None

# Output geometry: sample_size is [height, width]; video_length is the frame
# count requested from the pipeline (it is re-aligned to the VAE's temporal
# compression ratio before sampling); fps is used when the video is saved.
sample_size = [480, 832]
video_length = 81
fps = 16

# dtype used when loading the transformer, the VAE and both text encoders.
weight_dtype = torch.bfloat16

# Path of the image passed to the pipeline as ``image``.
validation_image_start = "asset/1.png"

# Sampling inputs.
prompt = (
    "The dog is shaking head. "
    "The video is of high quality, and the view is very clear. "
    "High quality, masterpiece, best quality, highres, ultra-detailed, fantastic."
)
negative_prompt = (
    "The video is not of a high quality, it has a low resolution. "
    "Watermark present in each frame. "
    "The background is solid. "
    "Strange body and strange trajectory. "
    "Distortion. "
)
guidance_scale = 1.0  # forwarded to the pipeline as ``true_cfg_scale``
seed = 43
num_inference_steps = 40
lora_weight = 0.55
save_path = "samples/hunyuanvideo-videos-i2v"
|
|
def _load_checkpoint_into(model, checkpoint_path):
    """Overlay the weights stored at ``checkpoint_path`` onto ``model``.

    Files whose name ends in ``safetensors`` are read with
    ``safetensors.torch.load_file``; everything else goes through
    ``torch.load`` on the CPU. If the loaded object contains a top-level
    ``"state_dict"`` entry, that entry is used. Loading is non-strict, and
    the number of missing / unexpected keys is printed.
    """
    print(f"From checkpoint: {checkpoint_path}")
    if checkpoint_path.endswith("safetensors"):
        from safetensors.torch import load_file
        state_dict = load_file(checkpoint_path)
    else:
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only point this at checkpoints you trust (or switch
        # to ``weights_only=True`` if the file is a plain tensor state dict).
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict

    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"missing keys: {len(missing)}, unexpected keys: {len(unexpected)}")


# Resolve the device for this process from the configured parallel degrees.
device = set_multi_gpus_devices(ulysses_degree, ring_degree)

# Diffusion transformer, loaded in ``weight_dtype``.
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    os.path.join(model_name, 'transformer'),
    low_cpu_mem_usage=True,
    torch_dtype=weight_dtype,
)
if transformer_path is not None:
    _load_checkpoint_into(transformer, transformer_path)

# VAE, cast to ``weight_dtype`` after loading.
vae = AutoencoderKLHunyuanVideo.from_pretrained(
    os.path.join(model_name, 'vae')
).to(weight_dtype)
if vae_path is not None:
    _load_checkpoint_into(vae, vae_path)
|
|
| |
# Conditioning components, each read from its own sub-folder of ``model_name``.
# Both text encoders share the same loading options.
_text_encoder_kwargs = {"low_cpu_mem_usage": True, "torch_dtype": weight_dtype}

# First tokenizer / text encoder pair (Llama tokenizer + LLaVA model).
tokenizer_dir = os.path.join(model_name, 'tokenizer')
tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_dir)

text_encoder_dir = os.path.join(model_name, 'text_encoder')
text_encoder = LlavaForConditionalGeneration.from_pretrained(
    text_encoder_dir, **_text_encoder_kwargs
)

# Second tokenizer / text encoder pair (CLIP).
tokenizer_2_dir = os.path.join(model_name, 'tokenizer_2')
tokenizer_2 = CLIPTokenizer.from_pretrained(tokenizer_2_dir)

text_encoder_2_dir = os.path.join(model_name, 'text_encoder_2')
text_encoder_2 = CLIPTextModel.from_pretrained(
    text_encoder_2_dir, **_text_encoder_kwargs
)

# Image processor handed to the pipeline alongside the encoders.
image_processor_dir = os.path.join(model_name, 'image_processor')
image_processor = CLIPImageProcessor.from_pretrained(image_processor_dir)
|
|
| |
# Map every supported ``sampler_name`` to its scheduler class. The mapping is
# kept under its own name so that ``scheduler_dict`` really is the dict (a
# chained assignment would bind it to the selected class instead).
scheduler_dict = {
    "Flow": FlowMatchEulerDiscreteScheduler,
    "Flow_Unipc": FlowUniPCMultistepScheduler,
    "Flow_DPM++": FlowDPMSolverMultistepScheduler,
}
if sampler_name not in scheduler_dict:
    # Same exception type a plain lookup would raise, but with a usable message.
    raise KeyError(
        f"Unknown sampler_name {sampler_name!r}; expected one of {sorted(scheduler_dict)}"
    )
Chosen_Scheduler = scheduler_dict[sampler_name]

# Instantiate the scheduler from the configuration stored with the model.
scheduler = Chosen_Scheduler.from_pretrained(
    os.path.join(model_name, 'scheduler'),
)
|
|
| |
# Assemble the image-to-video pipeline from the components loaded above.
pipeline = HunyuanVideoI2VPipeline(
    transformer=transformer,
    vae=vae,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    tokenizer_2=tokenizer_2,
    text_encoder_2=text_encoder_2,
    scheduler=scheduler,
    image_processor=image_processor,
)

# Multi-GPU setup: taken when either parallel degree is greater than 1.
use_multi_gpus = ulysses_degree > 1 or ring_degree > 1
if use_multi_gpus:
    transformer.enable_multi_gpus_inference()

    if fsdp_dit:
        # Shard the transformer, wrapping both of its block lists.
        dit_blocks = list(transformer.transformer_blocks) + list(transformer.single_transformer_blocks)
        pipeline.transformer = shard_model(
            pipeline.transformer,
            device_id=device,
            param_dtype=weight_dtype,
            module_to_wrapper=dit_blocks,
        )
        print("Add FSDP DIT")

    if fsdp_text_encoder:
        # Shard the first text encoder, wrapping its language-model layers.
        pipeline.text_encoder = shard_model(
            pipeline.text_encoder,
            device_id=device,
            param_dtype=weight_dtype,
            module_to_wrapper=text_encoder.language_model.layers,
        )
        print("Add FSDP TEXT ENCODER")
|
|
if compile_dit:
    # Compile the transformer block by block. The FSDP setup in this script
    # addresses the blocks as ``transformer_blocks`` and
    # ``single_transformer_blocks``, so those lists are compiled here; a plain
    # ``blocks`` list is still honoured for models that expose one.
    compiled_any = False
    for list_name in ("blocks", "transformer_blocks", "single_transformer_blocks"):
        block_list = getattr(pipeline.transformer, list_name, None)
        if block_list is None:
            continue
        # Iterate over a snapshot because entries are replaced in place.
        for i, block in enumerate(list(block_list)):
            block_list[i] = torch.compile(block)
        compiled_any = True
    if not compiled_any:
        raise AttributeError(
            "compile_dit is enabled but the transformer exposes none of "
            "'blocks', 'transformer_blocks' or 'single_transformer_blocks'"
        )
    print("Add Compile")
|
|
# Apply the memory strategy selected by ``GPU_memory_mode``.

# Step 1: the two "*_and_qfloat8" modes first convert the transformer weights
# to float8 (a few named modules are excluded) and then install the dtype
# conversion wrapper for ``weight_dtype``.
if GPU_memory_mode in ("model_cpu_offload_and_qfloat8", "model_full_load_and_qfloat8"):
    convert_model_weight_to_float8(
        transformer,
        exclude_module_name=["x_embedder", "context_embedder", "time_text_embed", "rope", "proj_out"],
        device=device,
    )
    convert_weight_dtype_wrapper(transformer, weight_dtype)

# Step 2: choose where the pipeline lives.
if GPU_memory_mode == "sequential_cpu_offload":
    pipeline.enable_sequential_cpu_offload(device=device)
elif GPU_memory_mode in ("model_cpu_offload_and_qfloat8", "model_cpu_offload"):
    pipeline.enable_model_cpu_offload(device=device)
else:
    # "model_full_load_and_qfloat8" and every unrecognised value: full load.
    pipeline.to(device=device)
|
|
# Seeded random generator on the target device.
generator = torch.Generator(device=device)
generator = generator.manual_seed(seed)

# Merge the optional LoRA into the pipeline before sampling.
use_lora = lora_path is not None
if use_lora:
    pipeline = merge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)

with torch.no_grad():
    # Align the frame count to "multiple of the VAE temporal compression
    # ratio, plus one"; a single frame is left untouched.
    temporal_ratio = vae.config.temporal_compression_ratio
    if video_length != 1:
        video_length = int((video_length - 1) // temporal_ratio * temporal_ratio) + 1
    else:
        video_length = 1
    latent_frames = (video_length - 1) // temporal_ratio + 1

    # Image passed to the pipeline as its ``image`` input.
    image = get_image(validation_image_start)

    # Run the pipeline and keep the ``videos`` field of its output.
    pipeline_kwargs = dict(
        image=image,
        num_frames=video_length,
        negative_prompt=negative_prompt,
        height=sample_size[0],
        width=sample_size[1],
        generator=generator,
        true_cfg_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
    )
    sample = pipeline(prompt, **pipeline_kwargs).videos

# Undo the LoRA merge once sampling is finished.
if use_lora:
    pipeline = unmerge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)
|
|
def save_results():
    """Write the generated ``sample`` into ``save_path``.

    The file name is the zero-padded (8 digits) count of entries already in
    ``save_path`` plus one. When ``video_length`` is 1 the single frame is
    saved as a PNG; otherwise ``sample`` is handed to ``save_videos_grid``
    and written as an MP4 at ``fps`` frames per second.
    """
    # ``exist_ok=True`` already covers the "directory exists" case.
    os.makedirs(save_path, exist_ok=True)

    index = len(os.listdir(save_path)) + 1
    prefix = str(index).zfill(8)
    if video_length == 1:
        video_path = os.path.join(save_path, prefix + ".png")

        # First batch entry, index 0 on the third axis; the leading axis of
        # the remaining 3-D tensor is moved to the end before conversion
        # (presumably channels-first -> channels-last; verify against the
        # pipeline output layout).
        frame = sample[0, :, 0].permute(1, 2, 0)
        # ``Tensor.numpy()`` needs a CPU tensor with a numpy-compatible dtype,
        # so normalise device and dtype first (a no-op for float32 CPU data).
        frame = frame.detach().float().cpu()
        frame = (frame * 255).numpy().astype(np.uint8)
        Image.fromarray(frame).save(video_path)
    else:
        video_path = os.path.join(save_path, prefix + ".mp4")
        save_videos_grid(sample, video_path, fps=fps)
|
|
# Save the result exactly once: in a multi-GPU run only the process with
# rank 0 writes, in a single-process run the script always writes.
if ulysses_degree * ring_degree > 1:
    import torch.distributed as dist
    should_save = dist.get_rank() == 0
else:
    should_save = True

if should_save:
    save_results()