import os
import gc
import math

import torch
from tqdm import tqdm
from einops import rearrange

from ..utils import log, print_memory, fourier_filter, optimized_scale, setup_radial_attention, compile_model
from ..wanvideo.modules.model import rope_params
from ..wanvideo.schedulers.fm_solvers_unipc import FlowUniPCMultistepScheduler
from ..wanvideo.schedulers.scheduling_flow_match_lcm import FlowMatchLCMScheduler
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from ..custom_linear import remove_lora_from_module, set_lora_params
from ..gguf.gguf import set_lora_params_gguf
from ..enhance_a_video.globals import disable_enhance
from ..nodes_model_loading import load_weights
from ..nodes_sampler import offload_transformer

import comfy.model_management as mm
from comfy.utils import load_torch_file, ProgressBar, common_upscale
from comfy.clip_vision import clip_preprocess, ClipVisionModel
from comfy.cli_args import args, LatentPreviewMethod

device = mm.get_torch_device()
offload_device = mm.unet_offload_device()

script_directory = os.path.dirname(os.path.abspath(__file__))

def generate_timestep_matrix(
    num_frames,
    step_template,
    base_num_frames,
    ar_step=5,
    num_pre_ready=0,
    causal_block_size=1,
    shrink_interval_with_mask=False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, list[tuple]]:
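    """Build the staggered denoising schedule used by diffusion forcing.

    Each of the `num_frames` latent frames walks through `step_template`
    (the scheduler's timesteps) at its own pace: frame i may only advance
    freely once frame i-1 is far enough ahead, with `ar_step` controlling
    the stagger, so later frames stay noisier than earlier ones. Frames
    covered by `num_pre_ready` (e.g. prefix-video latents) are treated as
    already denoised. Frames can optionally be grouped into causal blocks
    of `causal_block_size`.

    Returns:
        step_matrix: (iterations, num_frames) timestep for every frame at
            every iteration.
        step_index: (iterations, num_frames) index into the padded template.
        step_update_mask: (iterations, num_frames) bool, True where a frame
            actually takes a denoising step this iteration.
        valid_interval: list of (start, end) frame windows to feed the model
            per iteration when the video is longer than `base_num_frames`.
    """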
    step_matrix, step_index = [], []
    update_mask, valid_interval = [], []
    num_iterations = len(step_template) + 1
    num_frames_block = num_frames // causal_block_size
    base_num_frames_block = base_num_frames // causal_block_size
    if base_num_frames_block < num_frames_block:
        infer_step_num = len(step_template)
        gen_block = base_num_frames_block
        min_ar_step = infer_step_num / gen_block
        assert ar_step >= min_ar_step, f"ar_step should be at least {math.ceil(min_ar_step)} in your setting"

    # Pad the template with sentinel timesteps: 999 for "not started", 0 for "finished".
    step_template = torch.cat(
        [
            torch.tensor([999], dtype=torch.int64, device=step_template.device),
            step_template.long(),
            torch.tensor([0], dtype=torch.int64, device=step_template.device),
        ]
    )
    pre_row = torch.zeros(num_frames_block, dtype=torch.long)
    if num_pre_ready > 0:
        pre_row[: num_pre_ready // causal_block_size] = num_iterations

    while not torch.all(pre_row >= (num_iterations - 1)):
        new_row = torch.zeros(num_frames_block, dtype=torch.long)
        for i in range(num_frames_block):
            # The first frame, or any frame whose predecessor is fully denoised,
            # advances one iteration; otherwise the frame trails its predecessor
            # by ar_step iterations.
            if i == 0 or pre_row[i - 1] >= (num_iterations - 1):
                new_row[i] = pre_row[i] + 1
            else:
                new_row[i] = new_row[i - 1] - ar_step
        new_row = new_row.clamp(0, num_iterations)

        # True where a frame takes a step this iteration (changed and not yet done).
        update_mask.append(
            (new_row != pre_row) & (new_row != num_iterations)
        )
        step_index.append(new_row)
        step_matrix.append(step_template[new_row])
        pre_row = new_row

    # For long videos, slide a window of base_num_frames over the sequence.
    terminal_flag = base_num_frames_block
    if shrink_interval_with_mask:
        idx_sequence = torch.arange(num_frames_block, dtype=torch.int64)
        update_mask = update_mask[0]
        update_mask_idx = idx_sequence[update_mask]
        last_update_idx = update_mask_idx[-1].item()
        terminal_flag = last_update_idx + 1

    for curr_mask in update_mask:
        if terminal_flag < num_frames_block and curr_mask[terminal_flag]:
            terminal_flag += 1
        valid_interval.append((max(terminal_flag - base_num_frames_block, 0), terminal_flag))

    step_update_mask = torch.stack(update_mask, dim=0)
    step_index = torch.stack(step_index, dim=0)
    step_matrix = torch.stack(step_matrix, dim=0)

    # Expand block-level rows back to frame-level rows.
    if causal_block_size > 1:
        step_update_mask = step_update_mask.unsqueeze(-1).repeat(1, 1, causal_block_size).flatten(1).contiguous()
        step_index = step_index.unsqueeze(-1).repeat(1, 1, causal_block_size).flatten(1).contiguous()
        step_matrix = step_matrix.unsqueeze(-1).repeat(1, 1, causal_block_size).flatten(1).contiguous()
        valid_interval = [(s * causal_block_size, e * causal_block_size) for s, e in valid_interval]

    return step_matrix, step_index, step_update_mask, valid_interval

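# Illustrative example (not executed at import time): with a 30-step template
# over 21 latent frames, no autoregressive stagger (ar_step=0) and no prefix
# frames, every row of step_matrix assigns the same timestep to all frames,
# i.e. the schedule degenerates to ordinary full-sequence denoising:
#
#   template = torch.linspace(999, 1, 30)
#   matrix, index, mask, intervals = generate_timestep_matrix(21, template, 21, ar_step=0)
#   # matrix.shape == (30, 21); intervals == [(0, 21)] * 30
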
class WanVideoDiffusionForcingSampler:
    @classmethod
    def INPUT_TYPES(s):
        return {
            "required": {
                "model": ("WANVIDEOMODEL",),
                "text_embeds": ("WANVIDEOTEXTEMBEDS",),
                "image_embeds": ("WANVIDIMAGE_EMBEDS",),
                "addnoise_condition": ("INT", {"default": 10, "min": 0, "max": 1000, "tooltip": "Improves consistency in long video generation"}),
                "fps": ("FLOAT", {"default": 24.0, "min": 1.0, "max": 120.0, "step": 0.01}),
                "steps": ("INT", {"default": 30, "min": 1}),
                "cfg": ("FLOAT", {"default": 6.0, "min": 0.0, "max": 30.0, "step": 0.01}),
                "shift": ("FLOAT", {"default": 8.0, "min": 0.0, "max": 1000.0, "step": 0.01}),
                "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
                "force_offload": ("BOOLEAN", {"default": True, "tooltip": "Moves the model to the offload device after sampling"}),
                "scheduler": (["unipc", "unipc/beta", "euler", "euler/beta", "lcm", "lcm/beta"],
                    {
                        "default": 'unipc'
                    }),
            },
            "optional": {
                "samples": ("LATENT", {"tooltip": "Initial latents to use for the video2video process"}),
                "prefix_samples": ("LATENT", {"tooltip": "Prefix latents"}),
                "denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
                "cache_args": ("CACHEARGS", ),
                "slg_args": ("SLGARGS", ),
                "rope_function": (["default", "comfy"], {"default": "comfy", "tooltip": "Comfy's RoPE implementation doesn't use complex numbers and can thus be compiled; that should be a lot faster when using torch.compile"}),
                "experimental_args": ("EXPERIMENTALARGS", ),
                "unianimate_poses": ("UNIANIMATE_POSE", ),
            }
        }

    RETURN_TYPES = ("LATENT",)
    RETURN_NAMES = ("samples",)
    FUNCTION = "process"
    CATEGORY = "WanVideoWrapper"

    def process(self, model, text_embeds, image_embeds, shift, fps, steps, addnoise_condition, cfg, seed, scheduler,
                force_offload=True, samples=None, prefix_samples=None, denoise_strength=1.0, slg_args=None, rope_function="default", cache_args=None, teacache_args=None,
                experimental_args=None, unianimate_poses=None):
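        """Sample a WanVideo latent with a diffusion-forcing schedule: every
        latent frame gets its own scheduler instance and is denoised according
        to the timestep matrix built by generate_timestep_matrix, so prefix
        latents can stay (almost) fixed while the rest of the video is
        generated."""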

        patcher = model
        model = model.model
        transformer = model.diffusion_model

        dtype = model["base_dtype"]
        weight_dtype = model["weight_dtype"]
        fp8_matmul = model["fp8_matmul"]
        gguf_reader = model["gguf_reader"]
        control_lora = model["control_lora"]

        transformer_options = patcher.model_options.get("transformer_options", None)
        merge_loras = transformer_options["merge_loras"]

        block_swap_args = transformer_options.get("block_swap_args", None)
        if block_swap_args is not None:
            transformer.use_non_blocking = block_swap_args.get("use_non_blocking", False)
            transformer.blocks_to_swap = block_swap_args.get("blocks_to_swap", 0)
            transformer.vace_blocks_to_swap = block_swap_args.get("vace_blocks_to_swap", 0)
            transformer.prefetch_blocks = block_swap_args.get("prefetch_blocks", 0)
            transformer.block_swap_debug = block_swap_args.get("block_swap_debug", False)
            transformer.offload_img_emb = block_swap_args.get("offload_img_emb", False)
            transformer.offload_txt_emb = block_swap_args.get("offload_txt_emb", False)

        is_5b = transformer.out_dim == 48
        vae_upscale_factor = 16 if is_5b else 8

        # Load (or reload) the transformer weights, then attach or remove LoRA patches.
        if transformer.patched_linear and gguf_reader is None:
            load_weights(patcher.model.diffusion_model, patcher.model["sd"], weight_dtype, base_dtype=dtype, transformer_load_device=device, block_swap_args=block_swap_args)

        if gguf_reader is not None:
            load_weights(transformer, patcher.model["sd"], base_dtype=dtype, transformer_load_device=device, patcher=patcher, gguf=True, reader=gguf_reader, block_swap_args=block_swap_args)
            set_lora_params_gguf(transformer, patcher.patches)
            transformer.patched_linear = True
        elif len(patcher.patches) != 0 and transformer.patched_linear:
            log.info(f"Using {len(patcher.patches)} LoRA weight patches for WanVideo model")
            if not merge_loras and fp8_matmul:
                raise NotImplementedError("FP8 matmul with unmerged LoRAs is not supported")
            set_lora_params(transformer, patcher.patches)
        else:
            remove_lora_from_module(transformer)

        transformer.lora_scheduling_enabled = transformer_options.get("lora_scheduling_enabled", False)

        if model["auto_cpu_offload"] is False:
            transformer = compile_model(transformer, model["compile_args"])

        # For video2video, schedule extra steps now so that after truncation below
        # the requested number of steps is actually applied to the input latents.
        steps = int(steps / denoise_strength)

        timesteps = None
        if 'unipc' in scheduler:
            sample_scheduler = FlowUniPCMultistepScheduler(shift=shift)
            sample_scheduler.set_timesteps(steps, device=device, shift=shift, use_beta_sigmas=('beta' in scheduler))
        elif 'euler' in scheduler:
            sample_scheduler = FlowMatchEulerDiscreteScheduler(shift=shift, use_beta_sigmas=(scheduler == 'euler/beta'))
            sample_scheduler.set_timesteps(steps, device=device)
        elif 'lcm' in scheduler:
            sample_scheduler = FlowMatchLCMScheduler(shift=shift, use_beta_sigmas=(scheduler == 'lcm/beta'))
            sample_scheduler.set_timesteps(steps, device=device)

        init_timesteps = sample_scheduler.timesteps

        if denoise_strength < 1.0:
            steps = int(steps * denoise_strength)
            timesteps = init_timesteps[-(steps + 1):]

        seed_g = torch.Generator(device=torch.device("cpu"))
        seed_g.manual_seed(seed)

        clip_fea, clip_fea_neg = None, None
        vace_data, vace_context, vace_scale = None, None, None

        image_cond = image_embeds.get("image_embeds", None)

        target_shape = image_embeds.get("target_shape", None)
        if target_shape is None:
            raise ValueError("Empty image embeds must be provided for T2V (text-to-video)")

        has_ref = image_embeds.get("has_ref", False)
        vace_context = image_embeds.get("vace_context", None)
        vace_scale = image_embeds.get("vace_scale", None)
        if not isinstance(vace_scale, list):
            vace_scale = [vace_scale] * (steps + 1)
        vace_start_percent = image_embeds.get("vace_start_percent", 0.0)
        vace_end_percent = image_embeds.get("vace_end_percent", 1.0)
        vace_seqlen = image_embeds.get("vace_seq_len", None)

        vace_additional_embeds = image_embeds.get("additional_vace_inputs", [])
        if vace_context is not None:
            vace_data = [
                {
                    "context": vace_context,
                    "scale": vace_scale,
                    "start": vace_start_percent,
                    "end": vace_end_percent,
                    "seq_len": vace_seqlen
                }
            ]
            if len(vace_additional_embeds) > 0:
                for i in range(len(vace_additional_embeds)):
                    if vace_additional_embeds[i].get("has_ref", False):
                        has_ref = True
                    vace_scale = vace_additional_embeds[i]["vace_scale"]
                    if not isinstance(vace_scale, list):
                        vace_scale = [vace_scale] * (steps + 1)
                    vace_data.append({
                        "context": vace_additional_embeds[i]["vace_context"],
                        "scale": vace_scale,
                        "start": vace_additional_embeds[i]["vace_start_percent"],
                        "end": vace_additional_embeds[i]["vace_end_percent"],
                        "seq_len": vace_additional_embeds[i]["vace_seq_len"]
                    })

        noise = torch.randn(
            target_shape[0],
            target_shape[1] + 1 if has_ref else target_shape[1],
            target_shape[2],
            target_shape[3],
            dtype=torch.float32,
            device=torch.device("cpu"),
            generator=seed_g)

        latent_video_length = noise.shape[1]
        seq_len = math.ceil((noise.shape[2] * noise.shape[3]) / 4 * noise.shape[1])

        if samples is not None:
            input_samples = samples["samples"].squeeze(0).to(noise)
            if input_samples.shape[1] != noise.shape[1]:
                # Pad the front by repeating the first latent frame so lengths match.
                input_samples = torch.cat([input_samples[:, :1].repeat(1, noise.shape[1] - input_samples.shape[1], 1, 1), input_samples], dim=1)
            original_image = input_samples.to(device)
            if denoise_strength < 1.0:
                latent_timestep = timesteps[:1].to(noise)
                noise = noise * latent_timestep / 1000 + (1 - latent_timestep / 1000) * input_samples

            mask = samples.get("mask", None)
            if mask is not None:
                if mask.shape[2] != noise.shape[1]:
                    mask = torch.cat([torch.zeros(1, noise.shape[0], noise.shape[1] - mask.shape[2], noise.shape[2], noise.shape[3]), mask], dim=2)

        latents = noise.to(device)

        fps_embeds = None
        if hasattr(transformer, "fps_embedding"):
            fps = round(fps, 2)
            log.info(f"Model has fps embedding, using {fps} fps")
            fps_embeds = [fps]
            fps_embeds = [0 if i == 16 else 1 for i in fps_embeds]

        prefix_video = prefix_samples["samples"].to(noise) if prefix_samples is not None else None
        prefix_video_latent_length = prefix_video.shape[2] if prefix_video is not None else 0
        if prefix_video is not None:
            log.info(f"Prefix video of length: {prefix_video_latent_length}")
            latents[:, :prefix_video_latent_length] = prefix_video[0]

        base_num_frames = latent_video_length

        ar_step = 0
        causal_block_size = 1
        step_matrix, _, step_update_mask, valid_interval = generate_timestep_matrix(
            latent_video_length, init_timesteps, base_num_frames, ar_step, prefix_video_latent_length, causal_block_size
        )
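
        # Because different frames can sit at different points of the noise
        # schedule in any given iteration, each latent frame gets its own
        # scheduler instance so its solver state is tracked independently.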
        sample_schedulers = []
        for _ in range(latent_video_length):
            if 'unipc' in scheduler:
                sample_scheduler = FlowUniPCMultistepScheduler(shift=shift)
                sample_scheduler.set_timesteps(steps, device=device, shift=shift, use_beta_sigmas=('beta' in scheduler))
            elif 'euler' in scheduler:
                sample_scheduler = FlowMatchEulerDiscreteScheduler(shift=shift)
                sample_scheduler.set_timesteps(steps, device=device)
            elif 'lcm' in scheduler:
                sample_scheduler = FlowMatchLCMScheduler(shift=shift, use_beta_sigmas=(scheduler == 'lcm/beta'))
                sample_scheduler.set_timesteps(steps, device=device)
            sample_schedulers.append(sample_scheduler)
        sample_schedulers_counter = [0] * latent_video_length

        unianim_data = None
        if unianimate_poses is not None:
            transformer.dwpose_embedding.to(device)
            transformer.randomref_embedding_pose.to(device)
            dwpose_data = unianimate_poses["pose"]
            # Repeat the first pose frame 3x at the front so the pose sequence
            # aligns with the temporally compressed latents, then embed.
            dwpose_data = transformer.dwpose_embedding(
                (torch.cat([dwpose_data[:, :, :1].repeat(1, 1, 3, 1, 1), dwpose_data], dim=2)
                 ).to(device)).to(model["dtype"])
            log.info(f"UniAnimate pose embed shape: {dwpose_data.shape}")
            if dwpose_data.shape[2] > latent_video_length:
                log.warning(f"UniAnimate pose embed length {dwpose_data.shape[2]} is longer than the video length {latent_video_length}, truncating")
                dwpose_data = dwpose_data[:, :, :latent_video_length]
            elif dwpose_data.shape[2] < latent_video_length:
                log.warning(f"UniAnimate pose embed length {dwpose_data.shape[2]} is shorter than the video length {latent_video_length}, padding with last pose")
                pad_len = latent_video_length - dwpose_data.shape[2]
                pad = dwpose_data[:, :, -1:].repeat(1, 1, pad_len, 1, 1)
                dwpose_data = torch.cat([dwpose_data, pad], dim=2)
            dwpose_data_flat = rearrange(dwpose_data, 'b c f h w -> b (f h w) c').contiguous()

            random_ref_dwpose_data = None
            if image_cond is not None:
                random_ref_dwpose = unianimate_poses.get("ref", None)
                if random_ref_dwpose is not None:
                    random_ref_dwpose_data = transformer.randomref_embedding_pose(
                        random_ref_dwpose.to(device)
                    ).unsqueeze(2).to(model["dtype"])

            unianim_data = {
                "dwpose": dwpose_data_flat,
                "random_ref": random_ref_dwpose_data.squeeze(0) if random_ref_dwpose_data is not None else None,
                "strength": unianimate_poses["strength"],
                "start_percent": unianimate_poses["start_percent"],
                "end_percent": unianimate_poses["end_percent"]
            }

        disable_enhance()

        # RoPE setup: the "comfy" path configures the model's own embedder,
        # while the default path precomputes complex-valued frequencies here.
        freqs = None
        transformer.rope_embedder.k = None
        transformer.rope_embedder.num_frames = None
        if rope_function == "comfy":
            transformer.rope_embedder.k = 0
            transformer.rope_embedder.num_frames = latent_video_length
        else:
            d = transformer.dim // transformer.num_heads
            freqs = torch.cat([
                rope_params(1024, d - 4 * (d // 6), L_test=latent_video_length, k=0),
                rope_params(1024, 2 * (d // 6)),
                rope_params(1024, 2 * (d // 6))
            ],
                dim=1)
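        # Note: the default path splits the per-head dim as
        # (d - 4*(d//6)) temporal + 2*(d//6) height + 2*(d//6) width,
        # matching the model's 3D RoPE layout.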

        if not isinstance(cfg, list):
            cfg = [cfg] * (steps + 1)

        log.info(f"Seq len: {seq_len}")

        pbar = ProgressBar(steps)

        if args.preview_method in [LatentPreviewMethod.Auto, LatentPreviewMethod.Latent2RGB]:
            from latent_preview import prepare_callback
        else:
            from ..latent_preview import prepare_callback
        callback = prepare_callback(patcher, steps)

        # Place the model: block swap, auto CPU offload, or plain .to(device).
        if not transformer.patched_linear:
            if block_swap_args is not None:
                transformer.use_non_blocking = block_swap_args.get("use_non_blocking", False)
                for name, param in transformer.named_parameters():
                    if "block" not in name:
                        param.data = param.data.to(device)
                    if "control_adapter" in name:
                        param.data = param.data.to(device)
                    elif block_swap_args["offload_txt_emb"] and "txt_emb" in name:
                        param.data = param.data.to(offload_device)
                    elif block_swap_args["offload_img_emb"] and "img_emb" in name:
                        param.data = param.data.to(offload_device)

                transformer.block_swap(
                    block_swap_args["blocks_to_swap"] - 1,
                    block_swap_args["offload_txt_emb"],
                    block_swap_args["offload_img_emb"],
                    vace_blocks_to_swap=block_swap_args.get("vace_blocks_to_swap", None),
                    prefetch_blocks=block_swap_args.get("prefetch_blocks", 0),
                    block_swap_debug=block_swap_args.get("block_swap_debug", False),
                )
            elif model["auto_cpu_offload"]:
                for module in transformer.modules():
                    if hasattr(module, "offload"):
                        module.offload()
                    if hasattr(module, "onload"):
                        module.onload()
                for block in transformer.blocks:
                    block.modulation = torch.nn.Parameter(block.modulation.to(device))
                transformer.head.modulation = torch.nn.Parameter(transformer.head.modulation.to(device))
            else:
                transformer.to(device)

        transformer.enable_teacache = transformer.enable_magcache = False
        if teacache_args is not None:  # legacy argument, treated as cache_args
            cache_args = teacache_args
        if cache_args is not None:
            transformer.cache_device = cache_args["cache_device"]
            if cache_args["cache_type"] == "TeaCache":
                log.info(f"TeaCache: Using cache device: {transformer.cache_device}")
                transformer.teacache_state.clear_all()
                transformer.enable_teacache = True
                transformer.rel_l1_thresh = cache_args["rel_l1_thresh"]
                transformer.teacache_start_step = cache_args["start_step"]
                transformer.teacache_end_step = len(init_timesteps) - 1 if cache_args["end_step"] == -1 else cache_args["end_step"]
                transformer.teacache_use_coefficients = cache_args["use_coefficients"]
                transformer.teacache_mode = cache_args["mode"]
            elif cache_args["cache_type"] == "MagCache":
                log.info(f"MagCache: Using cache device: {transformer.cache_device}")
                transformer.magcache_state.clear_all()
                transformer.enable_magcache = True
                transformer.magcache_start_step = cache_args["start_step"]
                transformer.magcache_end_step = len(init_timesteps) - 1 if cache_args["end_step"] == -1 else cache_args["end_step"]
                transformer.magcache_thresh = cache_args["magcache_thresh"]
                transformer.magcache_K = cache_args["magcache_K"]

        if slg_args is not None:
            transformer.slg_blocks = slg_args["blocks"]
            transformer.slg_start_percent = slg_args["start_percent"]
            transformer.slg_end_percent = slg_args["end_percent"]
        else:
            transformer.slg_blocks = None

        self.teacache_state = [None, None]
        self.teacache_state_source = [None, None]
        self.teacache_states_context = []

        if transformer.attention_mode == "radial_sage_attention":
            setup_radial_attention(transformer, transformer_options, latents, seq_len, latent_video_length)

        use_cfg_zero_star, use_fresca = False, False
        if experimental_args is not None:
            video_attention_split_steps = experimental_args.get("video_attention_split_steps", [])
            if video_attention_split_steps:
                transformer.video_attention_split_steps = [int(x.strip()) for x in video_attention_split_steps.split(",")]
            else:
                transformer.video_attention_split_steps = []
            use_zero_init = experimental_args.get("use_zero_init", True)
            use_cfg_zero_star = experimental_args.get("cfg_zero_star", False)
            zero_star_steps = experimental_args.get("zero_star_steps", 0)

            use_fresca = experimental_args.get("use_fresca", False)
            if use_fresca:
                fresca_scale_low = experimental_args.get("fresca_scale_low", 1.0)
                fresca_scale_high = experimental_args.get("fresca_scale_high", 1.25)
                fresca_freq_cutoff = experimental_args.get("fresca_freq_cutoff", 20)

        def predict_with_cfg(z, cfg_scale, positive_embeds, negative_embeds, timestep, idx, image_cond=None, clip_fea=None,
                             vace_data=None, unianim_data=None, teacache_state=None):
            with torch.autocast(device_type=mm.get_autocast_device(device), dtype=dtype, enabled=("fp8" in model["quantization"])):

                # CFG-zero-star zero-init: return an all-zero prediction for the first steps.
                if use_cfg_zero_star and (idx <= zero_star_steps) and use_zero_init:
                    return z * 0, None

                nonlocal patcher
                current_step_percentage = idx / len(init_timesteps)
                control_lora_enabled = False

                image_cond_input = image_cond

                base_params = {
                    'seq_len': seq_len,
                    'device': device,
                    'freqs': freqs,
                    't': timestep,
                    'current_step': idx,
                    'control_lora_enabled': control_lora_enabled,
                    'vace_data': vace_data,
                    'unianim_data': unianim_data,
                    'fps_embeds': fps_embeds,
                    "nag_params": text_embeds.get("nag_params", {}),
                    "nag_context": text_embeds.get("nag_prompt_embeds", None),
                }

                batch_size = 1

                if not math.isclose(cfg_scale, 1.0) and len(positive_embeds) > 1:
                    negative_embeds = negative_embeds * len(positive_embeds)

                # Conditional pass.
                noise_pred_cond, teacache_state_cond = transformer(
                    [z], context=positive_embeds, y=[image_cond_input] if image_cond_input is not None else None,
                    clip_fea=clip_fea, is_uncond=False, current_step_percentage=current_step_percentage,
                    pred_id=teacache_state[0] if teacache_state else None,
                    **base_params
                )
                noise_pred_cond = noise_pred_cond[0].to(intermediate_device)
                if math.isclose(cfg_scale, 1.0):
                    if use_fresca:
                        noise_pred_cond = fourier_filter(
                            noise_pred_cond,
                            scale_low=fresca_scale_low,
                            scale_high=fresca_scale_high,
                            freq_cutoff=fresca_freq_cutoff,
                        )
                    return noise_pred_cond, [teacache_state_cond]

                # Unconditional pass.
                noise_pred_uncond, teacache_state_uncond = transformer(
                    [z], context=negative_embeds, clip_fea=clip_fea_neg if clip_fea_neg is not None else clip_fea,
                    y=[image_cond_input] if image_cond_input is not None else None,
                    is_uncond=True, current_step_percentage=current_step_percentage,
                    pred_id=teacache_state[1] if teacache_state else None,
                    **base_params
                )
                noise_pred_uncond = noise_pred_uncond[0].to(intermediate_device)

                if use_cfg_zero_star:
                    alpha = optimized_scale(
                        noise_pred_cond.view(batch_size, -1),
                        noise_pred_uncond.view(batch_size, -1)
                    ).view(batch_size, 1, 1, 1)
                else:
                    alpha = 1.0

                if use_fresca:
                    filtered_cond = fourier_filter(
                        noise_pred_cond - noise_pred_uncond,
                        scale_low=fresca_scale_low,
                        scale_high=fresca_scale_high,
                        freq_cutoff=fresca_freq_cutoff,
                    )
                    noise_pred = noise_pred_uncond * alpha + cfg_scale * filtered_cond * alpha
                else:
                    noise_pred = noise_pred_uncond * alpha + cfg_scale * (noise_pred_cond - noise_pred_uncond * alpha)

                return noise_pred, [teacache_state_cond, teacache_state_uncond]
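
        # Guidance notes: with cfg_zero_star, `alpha` is the least-squares
        # projection coefficient from optimized_scale, giving
        #   noise_pred = alpha * uncond + cfg * (cond - alpha * uncond);
        # with alpha = 1.0 this reduces to standard classifier-free guidance.
        # FreSca instead applies frequency-split scaling to the guidance
        # difference (cond - uncond) before it is weighted by cfg.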

        log.info(f"Sampling {(latent_video_length - 1) * 4 + 1} frames at {latents.shape[3] * vae_upscale_factor}x{latents.shape[2] * vae_upscale_factor} with {steps} steps")

        intermediate_device = device

        mm.unload_all_models()
        mm.soft_empty_cache()
        gc.collect()
        try:
            torch.cuda.reset_peak_memory_stats(device)
        except Exception:
            pass
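
        # Main denoising loop: each row of step_matrix gives every frame its
        # timestep for this iteration, step_update_mask says which frames
        # actually step, and valid_interval bounds the window fed to the model.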
        for i, timestep_i in enumerate(tqdm(step_matrix)):
            update_mask_i = step_update_mask[i]
            valid_interval_i = valid_interval[i]
            valid_interval_start, valid_interval_end = valid_interval_i
            timestep = timestep_i[None, valid_interval_start:valid_interval_end].clone()
            latent_model_input = latents[:, valid_interval_start:valid_interval_end, :, :].clone()
            if addnoise_condition > 0 and valid_interval_start < prefix_video_latent_length:
                # Slightly re-noise the clean prefix frames and advertise a small
                # matching timestep to the model; this improves long-video consistency.
                noise_factor = 0.001 * addnoise_condition
                timestep_for_noised_condition = addnoise_condition
                latent_model_input[:, valid_interval_start:prefix_video_latent_length] = (
                    latent_model_input[:, valid_interval_start:prefix_video_latent_length] * (1.0 - noise_factor)
                    + torch.randn_like(latent_model_input[:, valid_interval_start:prefix_video_latent_length])
                    * noise_factor
                )
                timestep[:, valid_interval_start:prefix_video_latent_length] = timestep_for_noised_condition

            noise_pred, self.teacache_state = predict_with_cfg(
                latent_model_input.to(dtype),
                cfg[i],
                text_embeds["prompt_embeds"],
                text_embeds["negative_prompt_embeds"],
                timestep, i, image_cond, clip_fea, unianim_data=unianim_data, vace_data=vace_data,
                teacache_state=self.teacache_state)

            # Step only the frames flagged by the update mask, each with its own scheduler.
            for idx in range(valid_interval_start, valid_interval_end):
                if update_mask_i[idx].item():
                    latents[:, idx] = sample_schedulers[idx].step(
                        noise_pred[:, idx - valid_interval_start],
                        timestep_i[idx],
                        latents[:, idx],
                        return_dict=False,
                        generator=seed_g,
                    )[0]
                    sample_schedulers_counter[idx] += 1

            x0 = latents.unsqueeze(0)
            if callback is not None:
                callback_latent = (latent_model_input - noise_pred.to(timestep_i[idx].device) * timestep_i[idx] / 1000).detach().permute(1, 0, 2, 3)
                callback(i, callback_latent, None, steps)
            else:
                pbar.update(1)

        if teacache_args is not None:
            states = transformer.teacache_state.states
            state_names = {
                0: "conditional",
                1: "unconditional"
            }
            for pred_id, state in states.items():
                name = state_names.get(pred_id, f"prediction_{pred_id}")
                if 'skipped_steps' in state:
                    log.info(f"TeaCache skipped: {len(state['skipped_steps'])} {name} steps: {state['skipped_steps']}")
            transformer.teacache_state.clear_all()

        if force_offload:
            if not model["auto_cpu_offload"]:
                offload_transformer(transformer)

        try:
            print_memory(device)
            torch.cuda.reset_peak_memory_stats(device)
        except Exception:
            pass

        return ({
            "samples": x0.cpu(),
        },)


NODE_CLASS_MAPPINGS = {
    "WanVideoDiffusionForcingSampler": WanVideoDiffusionForcingSampler,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "WanVideoDiffusionForcingSampler": "WanVideo Diffusion Forcing Sampler",
}