import torch

import comfy.model_management
import comfy.utils
import node_helpers
import nodes
|
|
|
|
class CLIPTextEncodeHunyuanDiT:
    """Conditioning node that encodes two prompts through a single CLIP object.

    The ``bert`` prompt is tokenized first and provides the token dictionary.
    The ``mt5xl`` prompt is tokenized separately and only its ``"mt5xl"``
    entry is copied into that dictionary before encoding.
    """

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"
    CATEGORY = "advanced/conditioning"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification: one CLIP and two multiline prompts."""
        required = {"clip": ("CLIP", )}
        for prompt_name in ("bert", "mt5xl"):
            # Build a fresh options dict per widget so the two entries never
            # share a mutable object.
            required[prompt_name] = ("STRING", {"multiline": True, "dynamicPrompts": True})
        return {"required": required}

    def encode(self, clip, bert, mt5xl):
        """Tokenize both prompts and return the scheduled encoding.

        Args:
            clip: Object exposing ``tokenize`` and ``encode_from_tokens_scheduled``.
            bert: Prompt whose tokenization forms the base token dictionary.
            mt5xl: Prompt whose ``"mt5xl"`` tokens are stored under the
                ``"mt5xl"`` key of that dictionary.

        Returns:
            A 1-tuple holding the result of ``clip.encode_from_tokens_scheduled``.
        """
        tokens = clip.tokenize(bert)
        mt5xl_tokens = clip.tokenize(mt5xl)
        tokens["mt5xl"] = mt5xl_tokens["mt5xl"]
        conditioning = clip.encode_from_tokens_scheduled(tokens)
        return (conditioning, )
|
|
class EmptyHunyuanLatentVideo:
    """Node that creates an all-zero 5-D latent for video generation.

    The tensor is shaped
    ``[batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8]``.
    """

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"
    CATEGORY = "latent/video"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (width, height, length, batch_size)."""
        max_res = nodes.MAX_RESOLUTION
        required = {
            "width": ("INT", {"default": 848, "min": 16, "max": max_res, "step": 16}),
            "height": ("INT", {"default": 480, "min": 16, "max": max_res, "step": 16}),
            "length": ("INT", {"default": 25, "min": 1, "max": max_res, "step": 4}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
        }
        return {"required": required}

    def generate(self, width, height, length, batch_size=1):
        """Allocate the zero latent on the intermediate device.

        Args:
            width: Requested width; divided by 8 (floor) for the last dimension.
            height: Requested height; divided by 8 (floor) for the fourth dimension.
            length: Requested length; mapped to ``((length - 1) // 4) + 1``
                entries along the third dimension.
            batch_size: Size of the first dimension.

        Returns:
            A 1-tuple holding ``{"samples": tensor}``.
        """
        latent_frames = ((length - 1) // 4) + 1
        shape = [batch_size, 16, latent_frames, height // 8, width // 8]
        device = comfy.model_management.intermediate_device()
        samples = torch.zeros(shape, device=device)
        return ({"samples": samples}, )
|
|
# Chat-style prompt template with system, user and assistant turns. The user
# turn contains a single "{}" placeholder for the prompt text. The fragments are
# joined with no separator, exactly as written; the missing spaces between the
# numbered items are presumably part of the text the model expects, so do not
# "tidy" them without confirming.
PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = "".join((
    # System turn: instructions referring to the reference image.
    "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: ",
    "1. The main content and theme of the video.",
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.",
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.",
    "4. background environment, light, style and atmosphere.",
    "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n",
    # User turn: the "{}" placeholder receives the prompt.
    "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>",
    # Assistant turn header, left open.
    "<|start_header_id|>assistant<|end_header_id|>\n\n",
))
|
|
class TextEncodeHunyuanVideo_ImageToVideo:
    """Conditioning node that encodes a prompt together with image embeddings.

    The prompt is tokenized with the module-level
    ``PROMPT_TEMPLATE_ENCODE_VIDEO_I2V`` template and the ``mm_projected``
    attribute of the supplied CLIP vision output.
    """

    RETURN_TYPES = ("CONDITIONING",)
    FUNCTION = "encode"
    CATEGORY = "advanced/conditioning"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for this node."""
        interleave_options = {
            "default": 2,
            "min": 1,
            "max": 512,
            "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt.",
        }
        required = {
            "clip": ("CLIP", ),
            "clip_vision_output": ("CLIP_VISION_OUTPUT", ),
            "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
            "image_interleave": ("INT", interleave_options),
        }
        return {"required": required}

    def encode(self, clip, clip_vision_output, prompt, image_interleave):
        """Tokenize ``prompt`` with the image embeddings and encode the tokens.

        Args:
            clip: Object exposing ``tokenize`` and ``encode_from_tokens_scheduled``.
            clip_vision_output: Object whose ``mm_projected`` attribute is
                passed to the tokenizer as ``image_embeds``.
            prompt: Text prompt to tokenize.
            image_interleave: Forwarded unchanged to ``clip.tokenize``.

        Returns:
            A 1-tuple holding the result of ``clip.encode_from_tokens_scheduled``.
        """
        tokenize_options = {
            "llama_template": PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
            "image_embeds": clip_vision_output.mm_projected,
            "image_interleave": image_interleave,
        }
        tokens = clip.tokenize(prompt, **tokenize_options)
        return (clip.encode_from_tokens_scheduled(tokens), )
|
|
class HunyuanImageToVideo:
    """Node that builds a zero video latent and optionally attaches image guidance.

    When ``start_image`` is supplied it is resized, VAE-encoded and applied to
    the positive conditioning according to ``guidance_type``:

    * ``"v1 (concat)"``: adds ``concat_latent_image`` and ``concat_mask``.
    * ``"v2 (replace)"``: sets ``guiding_frame_index`` to 0, copies the encoded
      image into the leading frames of the latent and returns the mask as
      ``noise_mask`` in the latent dictionary.
    * ``"custom"``: adds ``ref_latent``.

    Without a start image the conditioning is returned untouched and
    ``guidance_type`` is ignored.
    """

    # Single source of truth for the accepted guidance modes; used both for the
    # input widget and for validation in ``encode``.
    _GUIDANCE_TYPES = ("v1 (concat)", "v2 (replace)", "custom")

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for this node."""
        return {"required": {"positive": ("CONDITIONING", ),
                             "vae": ("VAE", ),
                             "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                             "length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                             "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
                             "guidance_type": (list(s._GUIDANCE_TYPES), )
                             },
                "optional": {"start_image": ("IMAGE", ),
                             }}

    RETURN_TYPES = ("CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "latent")
    FUNCTION = "encode"

    CATEGORY = "conditioning/video_models"

    def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
        """Create the latent and, if a start image is given, add image guidance.

        Args:
            positive: Conditioning to augment.
            vae: Object exposing ``encode``; used only when ``start_image`` is given.
            width: Target width; the latent's last dimension is ``width // 8``.
            height: Target height; the latent's fourth dimension is ``height // 8``.
            length: Requested length; the latent has ``((length - 1) // 4) + 1``
                entries along its third dimension.
            batch_size: Size of the latent's first dimension.
            guidance_type: One of ``"v1 (concat)"``, ``"v2 (replace)"`` or
                ``"custom"``. Only consulted when ``start_image`` is given.
            start_image: Optional image tensor. It is indexed as
                ``[:length, :, :, :3]``, so at most ``length`` images and the
                first three channels of the last axis are used.

        Returns:
            A tuple ``(positive, latent_dict)`` where ``latent_dict`` contains
            ``"samples"`` and, for ``"v2 (replace)"``, also ``"noise_mask"``.

        Raises:
            ValueError: If ``start_image`` is given and ``guidance_type`` is not
                one of the supported values.
        """
        # Fail fast, before allocating the latent or running the VAE. Previously
        # an unknown mode surfaced as an UnboundLocalError on ``cond`` only
        # after the expensive encode had already been done.
        if start_image is not None and guidance_type not in self._GUIDANCE_TYPES:
            raise ValueError(
                f"Unknown guidance_type {guidance_type!r}; expected one of {self._GUIDANCE_TYPES}"
            )

        latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
        out_latent = {}

        if start_image is not None:
            # common_upscale is called on a channels-second layout, so move the
            # last axis to position 1 for the resize and back afterwards.
            start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

            concat_latent_image = vae.encode(start_image)
            # Mask of ones over every latent frame, with zeros on the leading
            # frames that correspond to the supplied image(s).
            mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
            mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

            if guidance_type == "v1 (concat)":
                cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
            elif guidance_type == "v2 (replace)":
                cond = {'guiding_frame_index': 0}
                # Overwrite the leading latent frames with the encoded image.
                latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
                out_latent["noise_mask"] = mask
            else:
                # "custom" -- the only remaining value after the validation above.
                cond = {"ref_latent": concat_latent_image}

            positive = node_helpers.conditioning_set_values(positive, cond)

        out_latent["samples"] = latent
        return (positive, out_latent)
|
|
class EmptyHunyuanImageLatent:
    """Node that creates an all-zero 4-D image latent.

    The tensor is shaped ``[batch_size, 64, height // 32, width // 32]``.
    """

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "generate"
    CATEGORY = "latent"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification (width, height, batch_size)."""
        max_res = nodes.MAX_RESOLUTION
        required = {
            "width": ("INT", {"default": 2048, "min": 64, "max": max_res, "step": 32}),
            "height": ("INT", {"default": 2048, "min": 64, "max": max_res, "step": 32}),
            "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
        }
        return {"required": required}

    def generate(self, width, height, batch_size=1):
        """Allocate the zero latent on the intermediate device.

        Args:
            width: Requested width; divided by 32 (floor) for the last dimension.
            height: Requested height; divided by 32 (floor) for the third dimension.
            batch_size: Size of the first dimension.

        Returns:
            A 1-tuple holding ``{"samples": tensor}``.
        """
        shape = [batch_size, 64, height // 32, width // 32]
        device = comfy.model_management.intermediate_device()
        samples = torch.zeros(shape, device=device)
        return ({"samples": samples}, )
|
|
class HunyuanRefinerLatent:
    """Node that attaches an input latent to both conditionings for refinement.

    The incoming latent samples and the noise augmentation value are stored on
    the positive and negative conditioning, and a new all-zero latent with 32
    channels is returned.
    """

    RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
    RETURN_NAMES = ("positive", "negative", "latent")
    FUNCTION = "execute"

    @classmethod
    def INPUT_TYPES(s):
        """Return the input specification for this node."""
        required = {
            "positive": ("CONDITIONING", ),
            "negative": ("CONDITIONING", ),
            "latent": ("LATENT", ),
            "noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
        }
        return {"required": required}

    def execute(self, positive, negative, latent, noise_augmentation):
        """Annotate both conditionings and build the zero output latent.

        Args:
            positive: Positive conditioning to annotate.
            negative: Negative conditioning to annotate.
            latent: Dictionary whose ``"samples"`` tensor is attached to both
                conditionings as ``concat_latent_image``.
            noise_augmentation: Value stored under ``noise_augmentation`` on
                both conditionings.

        Returns:
            A tuple ``(positive, negative, latent_dict)`` where ``latent_dict``
            holds a zero tensor shaped
            ``[samples.shape[0], 32, samples.shape[-3], samples.shape[-2], samples.shape[-1]]``.
        """
        samples = latent["samples"]

        # Each conditioning gets its own freshly built values dict, positive first.
        positive, negative = (
            node_helpers.conditioning_set_values(
                conditioning,
                {"concat_latent_image": samples, "noise_augmentation": noise_augmentation},
            )
            for conditioning in (positive, negative)
        )

        dims = samples.shape
        zero_shape = [dims[0], 32, dims[-3], dims[-2], dims[-1]]
        out_latent = {
            "samples": torch.zeros(zero_shape, device=comfy.model_management.intermediate_device()),
        }
        return (positive, negative, out_latent)
|
|
|
|
# Maps each node identifier string to the class implementing it.
# NOTE(review): presumably consumed by the host application's node loader, and
# the keys may be persisted in saved workflows -- confirm before renaming any.
NODE_CLASS_MAPPINGS = {
    # Text / conditioning encoders
    "CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
    "TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
    # Video latents and image-to-video guidance
    "EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
    "HunyuanImageToVideo": HunyuanImageToVideo,
    # Image latents and refiner
    "EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
    "HunyuanRefinerLatent": HunyuanRefinerLatent,
}
|
|