ComfyUI-Reactor-Fast-Face-Swap-CPU

Running

App Files Files Community

ComfyUI-Reactor-Fast-Face-Swap-CPU / comfy_extras /nodes_hunyuan.py

V0pr0S

Initial commit for ComfyUI-Reactor-Fast-Face-Swap-CPU

dd7e3fa 4 months ago

raw

history blame contribute delete

7.77 kB

	import nodes
	import node_helpers
	import torch
	import comfy.model_management


	class CLIPTextEncodeHunyuanDiT:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {
	"clip": ("CLIP", ),
	"bert": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	"mt5xl": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	}}
	RETURN_TYPES = ("CONDITIONING",)
	FUNCTION = "encode"

	CATEGORY = "advanced/conditioning"

	def encode(self, clip, bert, mt5xl):
	tokens = clip.tokenize(bert)
	tokens["mt5xl"] = clip.tokenize(mt5xl)["mt5xl"]

	return (clip.encode_from_tokens_scheduled(tokens), )

	class EmptyHunyuanLatentVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": { "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"length": ("INT", {"default": 25, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
	RETURN_TYPES = ("LATENT",)
	FUNCTION = "generate"

	CATEGORY = "latent/video"

	def generate(self, width, height, length, batch_size=1):
	latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
	return ({"samples":latent}, )

	PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
	"<\|start_header_id\|>system<\|end_header_id\|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
	"1. The main content and theme of the video."
	"2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
	"3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
	"4. background environment, light, style and atmosphere."
	"5. camera angles, movements, and transitions used in the video:<\|eot_id\|>\n\n"
	"<\|start_header_id\|>user<\|end_header_id\|>\n\n{}<\|eot_id\|>"
	"<\|start_header_id\|>assistant<\|end_header_id\|>\n\n"
	)

	class TextEncodeHunyuanVideo_ImageToVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {
	"clip": ("CLIP", ),
	"clip_vision_output": ("CLIP_VISION_OUTPUT", ),
	"prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
	"image_interleave": ("INT", {"default": 2, "min": 1, "max": 512, "tooltip": "How much the image influences things vs the text prompt. Higher number means more influence from the text prompt."}),
	}}
	RETURN_TYPES = ("CONDITIONING",)
	FUNCTION = "encode"

	CATEGORY = "advanced/conditioning"

	def encode(self, clip, clip_vision_output, prompt, image_interleave):
	tokens = clip.tokenize(prompt, llama_template=PROMPT_TEMPLATE_ENCODE_VIDEO_I2V, image_embeds=clip_vision_output.mm_projected, image_interleave=image_interleave)
	return (clip.encode_from_tokens_scheduled(tokens), )

	class HunyuanImageToVideo:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {"positive": ("CONDITIONING", ),
	"vae": ("VAE", ),
	"width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
	"length": ("INT", {"default": 53, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
	"guidance_type": (["v1 (concat)", "v2 (replace)", "custom"], )
	},
	"optional": {"start_image": ("IMAGE", ),
	}}

	RETURN_TYPES = ("CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "latent")
	FUNCTION = "encode"

	CATEGORY = "conditioning/video_models"

	def encode(self, positive, vae, width, height, length, batch_size, guidance_type, start_image=None):
	latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
	out_latent = {}

	if start_image is not None:
	start_image = comfy.utils.common_upscale(start_image[:length, :, :, :3].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)

	concat_latent_image = vae.encode(start_image)
	mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
	mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0

	if guidance_type == "v1 (concat)":
	cond = {"concat_latent_image": concat_latent_image, "concat_mask": mask}
	elif guidance_type == "v2 (replace)":
	cond = {'guiding_frame_index': 0}
	latent[:, :, :concat_latent_image.shape[2]] = concat_latent_image
	out_latent["noise_mask"] = mask
	elif guidance_type == "custom":
	cond = {"ref_latent": concat_latent_image}

	positive = node_helpers.conditioning_set_values(positive, cond)

	out_latent["samples"] = latent
	return (positive, out_latent)

	class EmptyHunyuanImageLatent:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": { "width": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
	"height": ("INT", {"default": 2048, "min": 64, "max": nodes.MAX_RESOLUTION, "step": 32}),
	"batch_size": ("INT", {"default": 1, "min": 1, "max": 4096})}}
	RETURN_TYPES = ("LATENT",)
	FUNCTION = "generate"

	CATEGORY = "latent"

	def generate(self, width, height, batch_size=1):
	latent = torch.zeros([batch_size, 64, height // 32, width // 32], device=comfy.model_management.intermediate_device())
	return ({"samples":latent}, )

	class HunyuanRefinerLatent:
	@classmethod
	def INPUT_TYPES(s):
	return {"required": {"positive": ("CONDITIONING", ),
	"negative": ("CONDITIONING", ),
	"latent": ("LATENT", ),
	"noise_augmentation": ("FLOAT", {"default": 0.10, "min": 0.0, "max": 1.0, "step": 0.01}),
	}}

	RETURN_TYPES = ("CONDITIONING", "CONDITIONING", "LATENT")
	RETURN_NAMES = ("positive", "negative", "latent")

	FUNCTION = "execute"

	def execute(self, positive, negative, latent, noise_augmentation):
	latent = latent["samples"]
	positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
	negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": latent, "noise_augmentation": noise_augmentation})
	out_latent = {}
	out_latent["samples"] = torch.zeros([latent.shape[0], 32, latent.shape[-3], latent.shape[-2], latent.shape[-1]], device=comfy.model_management.intermediate_device())
	return (positive, negative, out_latent)


	NODE_CLASS_MAPPINGS = {
	"CLIPTextEncodeHunyuanDiT": CLIPTextEncodeHunyuanDiT,
	"TextEncodeHunyuanVideo_ImageToVideo": TextEncodeHunyuanVideo_ImageToVideo,
	"EmptyHunyuanLatentVideo": EmptyHunyuanLatentVideo,
	"HunyuanImageToVideo": HunyuanImageToVideo,
	"EmptyHunyuanImageLatent": EmptyHunyuanImageLatent,
	"HunyuanRefinerLatent": HunyuanRefinerLatent,
	}