# pipeline_svd_masked.py
import inspect
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
from diffusers.image_processor import PipelineImageInput
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import BaseOutput, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
# Import necessary helpers from the original SVD pipeline
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
_append_dims,
retrieve_timesteps,
_resize_with_antialiasing,
)
import torch.nn.functional as F
from einops import rearrange
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
EXAMPLE_DOC_STRING = """
Examples:
```py
        >>> import torch
        >>> from pipeline_svd_masked import StableVideoDiffusionPipelineWithMask
        >>> from diffusers.utils import load_image, export_to_video
>>> # Load your fine-tuned UNet, VAE, etc.
>>> pipe = StableVideoDiffusionPipelineWithMask.from_pretrained(
... "path/to/your/finetuned_model", torch_dtype=torch.float16, variant="fp16"
... )
>>> pipe.to("cuda")
>>> # Load the conditioning image and the mask
>>> image = load_image("path/to/your/conditioning_image.png").resize((1024, 576))
>>> mask = load_image("path/to/your/mask_image.png").resize((1024, 576))
>>> # Generate frames
>>> frames = pipe(
... image=image,
... mask_image=mask,
... num_frames=25,
... decode_chunk_size=8
... ).frames[0]
>>> export_to_video(frames, "generated_video.mp4", fps=7)
```
"""
@dataclass
class StableVideoDiffusionPipelineOutput(BaseOutput):
r"""
Output class for the custom Stable Video Diffusion pipeline.
Args:
        frames (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.Tensor`):
List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape
`(batch_size, num_frames, height, width, num_channels)`.
"""
frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]
class StableVideoDiffusionPipelineWithMask(DiffusionPipeline):
r"""
A custom pipeline based on Stable Video Diffusion that accepts an additional mask for conditioning.
    This pipeline is designed to work with a UNet fine-tuned to accept 12 input channels
    (4 for noise, 4 for the VAE-encoded conditioning image, 4 for the VAE-encoded mask) or,
    alternatively, 9 input channels (the mask is binarized and downsampled to a single
    latent-resolution channel instead of being VAE-encoded).
"""
model_cpu_offload_seq = "image_encoder->unet->vae"
_callback_tensor_inputs = ["latents"]
def __init__(
self,
vae: AutoencoderKLTemporalDecoder,
image_encoder: CLIPVisionModelWithProjection,
unet: UNetSpatioTemporalConditionModel,
scheduler: EulerDiscreteScheduler,
feature_extractor: CLIPImageProcessor,
):
super().__init__()
self.register_modules(
vae=vae,
image_encoder=image_encoder,
unet=unet,
scheduler=scheduler,
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)
def _encode_image(
self,
image: PipelineImageInput,
device: Union[str, torch.device],
num_videos_per_prompt: int,
) -> torch.Tensor:
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.video_processor.pil_to_numpy(image)
image = self.video_processor.numpy_to_pt(image)
image = image * 2.0 - 1.0
image = _resize_with_antialiasing(image, (224, 224))
image = (image + 1.0) / 2.0
image = self.feature_extractor(
images=image,
do_normalize=True,
do_center_crop=False,
do_resize=False,
do_rescale=False,
return_tensors="pt",
).pixel_values
image = image.to(device=device, dtype=dtype)
image_embeddings = self.image_encoder(image).image_embeds
image_embeddings = image_embeddings.unsqueeze(1)
        # The training recipe zeroes out the CLIP image embedding, so the same is done at inference.
        image_embeddings = torch.zeros_like(image_embeddings)
return image_embeddings
def _encode_vae_image(
self,
image: torch.Tensor,
device: Union[str, torch.device],
num_videos_per_prompt: int,
):
        image = image.to(device=device, dtype=self.vae.dtype)
image_latents = self.vae.encode(image).latent_dist.sample()
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
return image_latents
def _get_add_time_ids(
self,
fps: int,
motion_bucket_id: int,
noise_aug_strength: float,
dtype: torch.dtype,
batch_size: int,
num_videos_per_prompt: int,
):
add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
if expected_add_embed_dim != passed_add_embed_dim:
raise ValueError(
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created."
)
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
return add_time_ids
def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
        latents = latents.flatten(0, 1).to(dtype=self.vae.dtype)
latents = 1 / self.vae.config.scaling_factor * latents
frames = []
for i in range(0, latents.shape[0], decode_chunk_size):
num_frames_in = latents[i: i + decode_chunk_size].shape[0]
frame = self.vae.decode(latents[i: i + decode_chunk_size], num_frames=num_frames_in).sample
frames.append(frame)
frames = torch.cat(frames, dim=0)
frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
frames = frames.float()
return frames
def check_inputs(self, image, height, width):
if (
not isinstance(image, torch.Tensor)
and not isinstance(image, PIL.Image.Image)
and not isinstance(image, list)
):
            raise ValueError(f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, or `list` but is {type(image)}")
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
def prepare_latents(
self,
batch_size: int,
num_frames: int,
height: int,
width: int,
dtype: torch.dtype,
device: Union[str, torch.device],
generator: torch.Generator,
latents: Optional[torch.Tensor] = None,
initial_latents: Optional[torch.Tensor] = None,
denoising_strength: float = 1.0,
timestep: Optional[torch.Tensor] = None,
):
num_channels_latents = self.unet.config.out_channels
shape = (
batch_size,
num_frames,
num_channels_latents,
height // self.vae_scale_factor,
width // self.vae_scale_factor,
)
if initial_latents is not None:
# Noise is added to the initial latents
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
# Get the initial latents at the given timestep
latents = self.scheduler.add_noise(initial_latents, noise, timestep)
else:
# Standard pure noise generation
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
# Scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents
def _encode_video_vae(
self,
video_frames: torch.Tensor, # Expects (B, F, C, H, W)
device: Union[str, torch.device],
):
video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
batch_size, num_frames = video_frames.shape[:2]
# Reshape for VAE encoding
video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:]) # (B*F, C, H, W)
latents = self.vae.encode(video_frames_reshaped).latent_dist.sample() # (B*F, C_latent, H_latent, W_latent)
# Reshape back to video format
latents = latents.reshape(batch_size, num_frames, *latents.shape[1:]) # (B, F, C_latent, H_latent, W_latent)
return latents
@torch.no_grad()
def __call__(
self,
image: Union[List[PIL.Image.Image], torch.Tensor],
mask_image: Union[List[PIL.Image.Image], torch.Tensor],
alpha_matte_image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None,
denoising_strength: float = 0.7,
height: int = 576,
width: int = 1024,
num_frames: Optional[int] = None,
num_inference_steps: int = 30,
sigmas: Optional[List[float]] = None,
fps: int = 7,
motion_bucket_id: int = 127,
noise_aug_strength: float = 0.02,
decode_chunk_size: Optional[int] = None,
num_videos_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
mask_noise_strength: float = 0.0,
):
height = height or self.unet.config.sample_size * self.vae_scale_factor
width = width or self.unet.config.sample_size * self.vae_scale_factor
if num_frames is None:
if isinstance(image, list):
num_frames = len(image)
else:
num_frames = self.unet.config.num_frames
decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
self.check_inputs(image, height, width)
self.check_inputs(mask_image, height, width)
        if alpha_matte_image is not None:
self.check_inputs(alpha_matte_image, height, width)
batch_size = 1
device = self._execution_device
dtype = self.unet.dtype
        image_for_clip = image[0] if isinstance(image, list) else image
image_embeddings = self._encode_image(image_for_clip, device, num_videos_per_prompt)
fps = fps - 1
image_tensor = self.video_processor.preprocess(image, height=height, width=width).to(device).unsqueeze(0)
mask_tensor = self.video_processor.preprocess(mask_image, height=height, width=width).to(device).unsqueeze(0)
noise = randn_tensor(image_tensor.shape, generator=generator, device=device, dtype=dtype)
image_tensor = image_tensor + noise_aug_strength * noise
conditional_latents = self._encode_video_vae(image_tensor, device)
conditional_latents = conditional_latents / self.vae.config.scaling_factor
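        # Two mask-conditioning variants are supported, selected by the UNet's input channel count:
        # 12 channels -> the mask video is VAE-encoded like the conditioning frames (4 latent channels);
        # 9 channels  -> the mask is binarized and downsampled to latent resolution (1 extra channel).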
if self.unet.config.in_channels == 12:
mask_latents = self._encode_video_vae(mask_tensor, device)
mask_latents = mask_latents / self.vae.config.scaling_factor
elif self.unet.config.in_channels == 9:
mask_tensor_gray = mask_tensor.mean(dim=2, keepdim=True)
binarized_mask = (mask_tensor_gray > 0.0).to(dtype)
b, f, c, h, w = binarized_mask.shape
binarized_mask_reshaped = binarized_mask.reshape(b * f, c, h, w)
target_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
interpolated_mask = F.interpolate(
binarized_mask_reshaped,
size=target_size,
mode='nearest',
)
mask_latents = interpolated_mask.reshape(b, f, *interpolated_mask.shape[1:])
else:
raise ValueError(f"Unsupported number of UNet input channels: {self.unet.config.in_channels}.")
if mask_noise_strength > 0.0:
mask_noise = randn_tensor(mask_latents.shape, generator=generator, device=device, dtype=dtype)
mask_latents = mask_latents + mask_noise_strength * mask_noise
added_time_ids = self._get_add_time_ids(
fps, motion_bucket_id, noise_aug_strength, image_embeddings.dtype, batch_size, num_videos_per_prompt
)
added_time_ids = added_time_ids.to(device)
# --- MODIFIED FOR ALPHA MATTE REFINEMENT ---
timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)
# self.scheduler.set_timesteps(num_inference_steps, device=device)
# timesteps = self.scheduler.timesteps
initial_latents = None
if alpha_matte_image is not None:
alpha_matte_tensor = self.video_processor.preprocess(alpha_matte_image, height=height, width=width).to(
device).unsqueeze(0)
initial_latents = self._encode_video_vae(alpha_matte_tensor, device)
initial_latents = initial_latents / self.vae.config.scaling_factor
# Adjust the number of steps and the timesteps to start from
t_start = max(num_inference_steps - int(num_inference_steps * denoising_strength), 0)
timesteps = timesteps[t_start:]
# We need the first timestep to add the correct amount of noise
start_timestep = timesteps[0]
else:
start_timestep = timesteps[0] # Not used, but for clarity
latents = self.prepare_latents(
batch_size * num_videos_per_prompt,
num_frames,
height,
width,
dtype,
device,
generator,
latents,
initial_latents=initial_latents,
denoising_strength=denoising_strength,
timestep=start_timestep if initial_latents is not None else None,
)
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
with self.progress_bar(total=len(timesteps)) as progress_bar:
for i, t in enumerate(timesteps):
latent_model_input = self.scheduler.scale_model_input(latents, t)
latent_model_input = torch.cat([latent_model_input, conditional_latents, mask_latents], dim=2)
noise_pred = self.unet(
latent_model_input, t, encoder_hidden_states=image_embeddings, added_time_ids=added_time_ids,
return_dict=False
)[0]
latents = self.scheduler.step(noise_pred, t, latents).prev_sample
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
frames = self.decode_latents(latents, num_frames, decode_chunk_size)
frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
self.maybe_free_model_hooks()
if not return_dict:
return frames
return StableVideoDiffusionPipelineOutput(frames=frames)
class StableVideoDiffusionPipelineOnestepWithMask(DiffusionPipeline):
r"""
A custom pipeline based on Stable Video Diffusion that accepts an additional mask for conditioning.
    This pipeline is designed to work with a UNet fine-tuned to accept 12 input channels
    (4 for noise, 4 for the VAE-encoded conditioning image, 4 for the VAE-encoded mask) or,
    alternatively, 9 input channels (the mask is binarized and downsampled to a single
    latent-resolution channel instead of being VAE-encoded).
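
    Example (illustrative; the module name, checkpoint path, and frame paths are placeholders):
        ```py
        >>> import torch
        >>> from pipeline_svd_masked import StableVideoDiffusionPipelineOnestepWithMask
        >>> from diffusers.utils import load_image
        >>> pipe = StableVideoDiffusionPipelineOnestepWithMask.from_pretrained(
        ...     "path/to/your/onestep_model", torch_dtype=torch.float16, variant="fp16"
        ... ).to("cuda")
        >>> frames = [load_image(f"frames/{i:05d}.png") for i in range(14)]
        >>> masks = [load_image(f"masks/{i:05d}.png") for i in range(14)]
        >>> output = pipe(image=frames, mask_image=masks, decode_chunk_size=8).frames[0]
        ```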
"""
model_cpu_offload_seq = "image_encoder->unet->vae"
_callback_tensor_inputs = ["latents"]
def __init__(
self,
vae: AutoencoderKLTemporalDecoder,
image_encoder: CLIPVisionModelWithProjection,
unet: UNetSpatioTemporalConditionModel,
scheduler: EulerDiscreteScheduler,
feature_extractor: CLIPImageProcessor,
):
super().__init__()
self.register_modules(
vae=vae,
image_encoder=image_encoder,
unet=unet,
scheduler=scheduler,
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)
def _encode_image(
self,
image: PipelineImageInput,
device: Union[str, torch.device],
num_videos_per_prompt: int,
) -> torch.Tensor:
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.video_processor.pil_to_numpy(image)
image = self.video_processor.numpy_to_pt(image)
image = image * 2.0 - 1.0
image = _resize_with_antialiasing(image, (224, 224))
image = (image + 1.0) / 2.0
image = self.feature_extractor(
images=image,
do_normalize=True,
do_center_crop=False,
do_resize=False,
do_rescale=False,
return_tensors="pt",
).pixel_values
image = image.to(device=device, dtype=dtype)
image_embeddings = self.image_encoder(image).image_embeds
image_embeddings = image_embeddings.unsqueeze(1)
        # The training recipe zeroes out the CLIP image embedding, so the same is done at inference.
        image_embeddings = torch.zeros_like(image_embeddings)
return image_embeddings
def _encode_vae_image(
self,
image: torch.Tensor,
device: Union[str, torch.device],
num_videos_per_prompt: int,
):
        image = image.to(device=device, dtype=self.vae.dtype)
image_latents = self.vae.encode(image).latent_dist.sample()
image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
return image_latents
def _get_add_time_ids(
self,
fps: int,
motion_bucket_id: int,
noise_aug_strength: float,
dtype: torch.dtype,
batch_size: int,
num_videos_per_prompt: int,
):
add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
if expected_add_embed_dim != passed_add_embed_dim:
raise ValueError(
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created."
)
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
return add_time_ids
def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
        latents = latents.flatten(0, 1).to(dtype=self.vae.dtype)
latents = 1 / self.vae.config.scaling_factor * latents
frames = []
for i in range(0, latents.shape[0], decode_chunk_size):
num_frames_in = latents[i: i + decode_chunk_size].shape[0]
frame = self.vae.decode(latents[i: i + decode_chunk_size], num_frames=num_frames_in).sample
frames.append(frame)
frames = torch.cat(frames, dim=0)
frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
frames = frames.float()
return frames
def check_inputs(self, image, height, width):
if (
not isinstance(image, torch.Tensor)
and not isinstance(image, PIL.Image.Image)
and not isinstance(image, list)
):
            raise ValueError(f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, or `list` but is {type(image)}")
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
def prepare_latents(
self,
batch_size: int,
num_frames: int,
height: int,
width: int,
dtype: torch.dtype,
device: Union[str, torch.device],
generator: torch.Generator,
latents: Optional[torch.Tensor] = None,
):
# The number of channels for the initial noise is based on the UNet's out_channels
num_channels_latents = self.unet.config.out_channels
shape = (
batch_size,
num_frames,
num_channels_latents,
height // self.vae_scale_factor,
width // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(f"batch size {batch_size} must match the length of the generators {len(generator)}.")
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
latents = latents * self.scheduler.init_noise_sigma
return latents
def _encode_video_vae(
self,
video_frames: torch.Tensor, # Expects (B, F, C, H, W)
device: Union[str, torch.device],
):
video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
batch_size, num_frames = video_frames.shape[:2]
# Reshape for VAE encoding
video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:]) # (B*F, C, H, W)
latents = self.vae.encode(video_frames_reshaped).latent_dist.sample() # (B*F, C_latent, H_latent, W_latent)
# Reshape back to video format
latents = latents.reshape(batch_size, num_frames, *latents.shape[1:]) # (B, F, C_latent, H_latent, W_latent)
return latents
@torch.no_grad()
def __call__(
self,
image: Union[List[PIL.Image.Image], torch.Tensor],
mask_image: Union[List[PIL.Image.Image], torch.Tensor],
height: int = 576,
width: int = 1024,
num_frames: Optional[int] = None,
fps: int = 7,
motion_bucket_id: int = 127,
noise_aug_strength: float = 0.0,
decode_chunk_size: Optional[int] = None,
num_videos_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
mask_noise_strength: float = 0.0,
):
height = height or self.unet.config.sample_size * self.vae_scale_factor
width = width or self.unet.config.sample_size * self.vae_scale_factor
if num_frames is None:
if isinstance(image, list):
num_frames = len(image)
else:
num_frames = self.unet.config.num_frames
decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
self.check_inputs(image, height, width)
self.check_inputs(mask_image, height, width)
if isinstance(image, list) and isinstance(mask_image, list):
if len(image) != len(mask_image):
raise ValueError("`image` and `mask_image` must have the same number of frames.")
if num_frames != len(image):
logger.warning(
f"Mismatch between `num_frames` ({num_frames}) and number of input images ({len(image)}). Using {len(image)}.")
num_frames = len(image)
batch_size = 1
device = self._execution_device
dtype = self.unet.dtype
        image_for_clip = image[0] if isinstance(image, list) else image
image_embeddings = self._encode_image(image_for_clip, device, num_videos_per_prompt)
fps = fps - 1
image_tensor = self.video_processor.preprocess(image, height=height, width=width).to(device).unsqueeze(0)
mask_tensor = self.video_processor.preprocess(mask_image, height=height, width=width).to(
device).unsqueeze(0)
noise = randn_tensor(image_tensor.shape, generator=generator, device=device, dtype=dtype)
image_tensor = image_tensor + noise_aug_strength * noise
conditional_latents = self._encode_video_vae(image_tensor, device)
conditional_latents = conditional_latents / self.vae.config.scaling_factor
if self.unet.config.in_channels == 12:
mask_latents = self._encode_video_vae(mask_tensor, device)
mask_latents = mask_latents / self.vae.config.scaling_factor
elif self.unet.config.in_channels == 9:
mask_tensor_gray = mask_tensor.mean(dim=2, keepdim=True)
binarized_mask = (mask_tensor_gray > 0.0).to(dtype)
b, f, c, h, w = binarized_mask.shape
binarized_mask_reshaped = binarized_mask.reshape(b * f, c, h, w)
target_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
interpolated_mask = F.interpolate(
binarized_mask_reshaped,
size=target_size,
mode='nearest',
)
mask_latents = interpolated_mask.reshape(b, f, *interpolated_mask.shape[1:])
else:
raise ValueError(
f"Unsupported number of UNet input channels: {self.unet.config.in_channels}. "
"This pipeline only supports 9 (for interpolated mask) or 12 (for VAE mask)."
)
if mask_noise_strength > 0.0:
mask_noise = randn_tensor(mask_latents.shape, generator=generator, device=device, dtype=dtype)
mask_latents = mask_latents + mask_noise_strength * mask_noise
added_time_ids = self._get_add_time_ids(
fps, motion_bucket_id, noise_aug_strength, image_embeddings.dtype, batch_size, num_videos_per_prompt
)
added_time_ids = added_time_ids.to(device)
# **MODIFIED FOR SINGLE-STEP**: Prepare initial noise
num_channels_latents = self.unet.config.out_channels
shape = (
batch_size * num_videos_per_prompt,
num_frames,
num_channels_latents,
height // self.vae_scale_factor,
width // self.vae_scale_factor,
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
# **MODIFIED FOR SINGLE-STEP**: Set a fixed high timestep
timestep = torch.tensor([1.0], dtype=dtype, device=device) # Use a high sigma value
# **MODIFIED FOR SINGLE-STEP**: Single forward pass
latent_model_input = torch.cat([latents, conditional_latents, mask_latents], dim=2)
noise_pred = self.unet(
latent_model_input, timestep, encoder_hidden_states=image_embeddings, added_time_ids=added_time_ids,
return_dict=False
)[0]
# The model's prediction is the final denoised latent
denoised_latents = noise_pred
frames = self.decode_latents(denoised_latents, num_frames, decode_chunk_size)
frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
self.maybe_free_model_hooks()
if not return_dict:
return frames
return StableVideoDiffusionPipelineOutput(frames=frames)
class StableVideoDiffusionPipelineWithCrossAtnnMask(DiffusionPipeline):
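    r"""
    A custom pipeline based on Stable Video Diffusion that conditions motion through cross-attention:
    per-frame masks are projected by `mask_projector` into the UNet's cross-attention embedding space,
    while a static conditioning image supplies appearance through VAE latents concatenated to the noise.
    """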
model_cpu_offload_seq = "image_encoder->unet->vae"
_callback_tensor_inputs = ["latents"]
def __init__(
self,
vae: AutoencoderKLTemporalDecoder,
unet: UNetSpatioTemporalConditionModel,
scheduler: EulerDiscreteScheduler,
mask_projector: torch.nn.Module,
# CLIP models are not strictly needed for inference if embeddings are not used
image_encoder: CLIPVisionModelWithProjection = None,
feature_extractor: CLIPImageProcessor = None,
):
super().__init__()
self.register_modules(
vae=vae,
unet=unet,
scheduler=scheduler,
mask_projector=mask_projector,
image_encoder=image_encoder,
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
def _encode_image_vae(self, image: torch.Tensor, device: Union[str, torch.device]):
image = image.to(device=device, dtype=self.vae.dtype)
latent = self.vae.encode(image).latent_dist.sample()
return latent
def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int):
        latents = latents.flatten(0, 1).to(dtype=self.vae.dtype)
latents = 1 / self.vae.config.scaling_factor * latents
frames = []
for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i: i + decode_chunk_size].shape[0]
            frame = self.vae.decode(latents[i: i + decode_chunk_size], num_frames=num_frames_in).sample
frames.append(frame)
frames = torch.cat(frames, dim=0)
frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
frames = frames.float()
return frames
def _encode_video_vae(
self,
video_frames: torch.Tensor, # Expects (B, F, C, H, W)
device: Union[str, torch.device],
):
video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
batch_size, num_frames = video_frames.shape[:2]
# Reshape for VAE encoding
video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:]) # (B*F, C, H, W)
latents = self.vae.encode(video_frames_reshaped).latent_dist.sample() # (B*F, C_latent, H_latent, W_latent)
# Reshape back to video format
latents = latents.reshape(batch_size, num_frames, *latents.shape[1:]) # (B, F, C_latent, H_latent, W_latent)
return latents
@torch.no_grad()
def __call__(
self,
image: Union[PIL.Image.Image, torch.Tensor], # Static image for appearance
mask_image: List[PIL.Image.Image], # Video mask for motion
height: int = 576,
width: int = 1024,
num_frames: Optional[int] = None,
num_inference_steps: int = 25,
fps: int = 7,
motion_bucket_id: int = 127,
noise_aug_strength: float = 0.0, # Noise is added to latents now
decode_chunk_size: Optional[int] = 8,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
):
device = self._execution_device
dtype = self.unet.dtype
num_frames = num_frames if num_frames is not None else len(mask_image)
decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
# 1. PREPARE STATIC IMAGE CONDITION
image_tensor = self.video_processor.preprocess(image, height, width).to(device).unsqueeze(0)
conditional_latents = self._encode_video_vae(image_tensor, device)
conditional_latents = conditional_latents / self.vae.config.scaling_factor
# 2. PREPARE MASK MOTION CONDITION
mask_tensor = self.video_processor.preprocess(mask_image, height, width)
if mask_tensor.shape[1] > 1:
mask_tensor = mask_tensor.mean(dim=1, keepdim=True)
        # The mask tensor is already (T, C, H, W); just move it to the projector's device and dtype.
        mask_for_projection = mask_tensor.to(device, dtype)
encoder_hidden_states = self.mask_projector(mask_for_projection)
encoder_hidden_states = encoder_hidden_states.unsqueeze(1) # (T, 1, D)
# Add batch dimension for UNet
encoder_hidden_states = encoder_hidden_states.unsqueeze(0) # (1, T, 1, D)
# The UNet will handle flattening this to (B*T, 1, D) where B=1
# To be safe, we pass it pre-flattened.
encoder_hidden_states = rearrange(encoder_hidden_states, "b f s d -> (b f) s d")
# 3. PREPARE LATENTS
shape = (1, num_frames, self.unet.config.out_channels, height // self.vae_scale_factor,
width // self.vae_scale_factor)
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
if noise_aug_strength > 0:
latents += noise_aug_strength * randn_tensor(latents.shape, generator=generator, device=device,
dtype=dtype)
latents = latents * self.scheduler.init_noise_sigma
# 4. GET ADDED TIME IDS
# For pipeline, batch size is 1
added_time_ids = [fps - 1, motion_bucket_id, 0.0] # noise_aug_strength for add_time_ids is 0 for inference
added_time_ids = torch.tensor([added_time_ids], dtype=dtype, device=device)
# 5. DENOISING LOOP
self.scheduler.set_timesteps(num_inference_steps, device=device)
timesteps = self.scheduler.timesteps
with self.progress_bar(total=num_inference_steps) as progress_bar:
for t in timesteps:
latent_model_input = self.scheduler.scale_model_input(latents, t)
unet_input = torch.cat([latent_model_input, conditional_latents], dim=2)
noise_pred = self.unet(
unet_input, t, encoder_hidden_states=encoder_hidden_states, added_time_ids=added_time_ids
).sample
latents = self.scheduler.step(noise_pred, t, latents).prev_sample
progress_bar.update()
# 6. DECODE
frames = self.decode_latents(latents, num_frames, decode_chunk_size)
frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)
if not return_dict:
return (frames,)
return StableVideoDiffusionPipelineOutput(frames=frames)
# pipeline.py
import torch
import torch.nn.functional as F
from PIL import Image
from einops import rearrange
from torchvision import transforms
from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
class VideoInferencePipeline:
"""
A reusable pipeline for single-step video diffusion inference.
This class encapsulates the models and the core inference logic,
separating it from data loading and saving, which can vary between tasks.
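
    Example (illustrative; the base-model id, checkpoint path, and frame paths are placeholders):
        >>> from PIL import Image
        >>> pipe = VideoInferencePipeline(
        ...     base_model_path="stabilityai/stable-video-diffusion-img2vid",
        ...     unet_checkpoint_path="path/to/finetuned_checkpoint",
        ... )
        >>> cond = [Image.open(f"frames/{i:05d}.png").convert("RGB") for i in range(14)]
        >>> masks = [Image.open(f"masks/{i:05d}.png").convert("L") for i in range(14)]
        >>> out_frames = pipe.run(cond, masks, mask_cond_mode="vae")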
"""
def __init__(self, base_model_path: str, unet_checkpoint_path: str, device: str = "cuda",
weight_dtype: torch.dtype = torch.float16):
"""
Loads all necessary models into memory.
Args:
base_model_path (str): Path to the base Stable Video Diffusion model.
unet_checkpoint_path (str): Path to the fine-tuned UNet checkpoint.
device (str): The device to run models on ('cuda' or 'cpu').
weight_dtype (torch.dtype): The precision for model weights (float16 or bfloat16).
"""
print("--- Initializing Inference Pipeline and Loading Models ---")
self.device = torch.device(device if torch.cuda.is_available() else "cpu")
self.weight_dtype = weight_dtype
# Load models from pretrained paths
try:
self.feature_extractor = CLIPImageProcessor.from_pretrained(base_model_path, subfolder="feature_extractor")
self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(base_model_path,
subfolder="image_encoder",
variant="fp16")
self.vae = AutoencoderKLTemporalDecoder.from_pretrained(base_model_path, subfolder="vae", variant="fp16")
self.unet = UNetSpatioTemporalConditionModel.from_pretrained(unet_checkpoint_path, subfolder="unet")
except Exception as e:
raise IOError(f"Fatal error loading models: {e}")
# Move models to the specified device and set to evaluation mode
self.image_encoder.to(self.device, dtype=self.weight_dtype).eval()
self.vae.to(self.device, dtype=self.weight_dtype).eval()
self.unet.to(self.device, dtype=self.weight_dtype).eval()
print(f"--- Models Loaded Successfully on {self.device} ---")
def run(self, cond_frames, mask_frames, seed=42, mask_cond_mode="vae", fps=7, motion_bucket_id=127,
noise_aug_strength=0.0):
"""
Runs the core inference process on a sequence of conditioning and mask frames.
Args:
cond_frames (list[Image.Image]): List of PIL images for conditioning.
mask_frames (list[Image.Image]): List of PIL images for the masks.
seed (int): Random seed for generation.
mask_cond_mode (str): How the mask is conditioned ("vae" or "interpolate").
fps (int): Frames per second to condition the model with.
motion_bucket_id (int): Motion bucket ID for conditioning.
noise_aug_strength (float): Noise augmentation strength.
Returns:
list[Image.Image]: A list of the generated video frames as PIL Images.
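        Example (assumes `pipe` was built as in the class docstring; inputs are placeholders):
            >>> frames = pipe.run(cond_frames, mask_frames, seed=0, mask_cond_mode="interpolate")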
"""
# --- 1. Prepare Tensors ---
cond_video_tensor = self._pil_to_tensor(cond_frames).to(self.device)
mask_video_tensor = self._pil_to_tensor(mask_frames).to(self.device)
if mask_video_tensor.shape[2] != 3:
mask_video_tensor = mask_video_tensor.repeat(1, 1, 3, 1, 1)
with torch.no_grad():
# --- 2. Get CLIP Image Embeddings ---
first_frame_tensor = cond_video_tensor[:, 0, :, :, :]
pixel_values_for_clip = self._resize_with_antialiasing(first_frame_tensor, (224, 224))
pixel_values_for_clip = ((pixel_values_for_clip + 1.0) / 2.0).clamp(0, 1)
pixel_values = self.feature_extractor(images=pixel_values_for_clip, return_tensors="pt").pixel_values
image_embeddings = self.image_encoder(pixel_values.to(self.device, dtype=self.weight_dtype)).image_embeds
encoder_hidden_states = torch.zeros_like(image_embeddings).unsqueeze(1)
# --- 3. Prepare Latents ---
cond_latents = self._tensor_to_vae_latent(cond_video_tensor.to(self.weight_dtype))
cond_latents = cond_latents / self.vae.config.scaling_factor
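            # "vae": encode the mask video into the VAE latent space, like the conditioning frames.
            # "interpolate": skip the VAE and simply downsample the raw mask to the latent spatial size.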
if mask_cond_mode == "vae":
mask_latents = self._tensor_to_vae_latent(mask_video_tensor.to(self.weight_dtype))
mask_latents = mask_latents / self.vae.config.scaling_factor
elif mask_cond_mode == "interpolate":
target_shape = cond_latents.shape[-2:]
b, t, c, h, w = mask_video_tensor.shape
mask_video_reshaped = rearrange(mask_video_tensor, "b t c h w -> (b t) c h w")
interpolated_mask = F.interpolate(mask_video_reshaped, size=target_shape, mode='bilinear',
align_corners=False)
mask_latents = rearrange(interpolated_mask, "(b t) c h w -> b t c h w", b=b)
else:
raise ValueError(f"Unknown mask_cond_mode: {mask_cond_mode}")
# --- 4. Run UNet Single-Step Inference ---
generator = torch.Generator(device=self.device).manual_seed(seed)
noisy_latents = torch.randn(cond_latents.shape, generator=generator, device=self.device,
dtype=self.weight_dtype)
timesteps = torch.full((1,), 1.0, device=self.device, dtype=torch.long)
added_time_ids = self._get_add_time_ids(fps, motion_bucket_id, noise_aug_strength, batch_size=1)
unet_input = torch.cat([noisy_latents, cond_latents, mask_latents], dim=2)
pred_latents = self.unet(unet_input, timesteps, encoder_hidden_states, added_time_ids=added_time_ids).sample
# --- 5. Decode Latents to Video Frames ---
pred_latents = (1 / self.vae.config.scaling_factor) * pred_latents.squeeze(0)
frames = []
# Process in chunks to avoid VRAM issues, especially for long videos
for i in range(0, pred_latents.shape[0], 8):
chunk = pred_latents[i: i + 8]
decoded_chunk = self.vae.decode(chunk, num_frames=chunk.shape[0]).sample
frames.append(decoded_chunk)
video_tensor = torch.cat(frames, dim=0)
video_tensor = (video_tensor / 2.0 + 0.5).clamp(0, 1).mean(dim=1, keepdim=True).repeat(1, 3, 1, 1)
# Return a list of PIL images
return [transforms.ToPILImage()(frame) for frame in video_tensor]
def _pil_to_tensor(self, frames: list[Image.Image]):
"""Converts a list of PIL images to a normalized video tensor."""
video_tensor = torch.stack([transforms.ToTensor()(f) for f in frames]).unsqueeze(0)
return video_tensor * 2.0 - 1.0
def _tensor_to_vae_latent(self, t: torch.Tensor):
"""Encodes a video tensor into the VAE's latent space."""
video_length = t.shape[1]
t = rearrange(t, "b f c h w -> (b f) c h w")
latents = self.vae.encode(t).latent_dist.sample()
latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
return latents * self.vae.config.scaling_factor
def _get_add_time_ids(self, fps, motion_bucket_id, noise_aug_strength, batch_size):
"""Creates the additional time IDs for conditioning the UNet."""
add_time_ids_list = [fps, motion_bucket_id, noise_aug_strength]
passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids_list)
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
if expected_add_embed_dim != passed_add_embed_dim:
raise ValueError(
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created.")
add_time_ids = torch.tensor([add_time_ids_list], dtype=self.weight_dtype, device=self.device)
return add_time_ids.repeat(batch_size, 1)
def _resize_with_antialiasing(self, input_tensor, size, interpolation="bicubic", align_corners=True):
"""
Resizes a tensor with anti-aliasing for CLIP input, mirroring k-diffusion.
        This mirrors the `_resize_with_antialiasing` helper used in the original training scripts.
"""
h, w = input_tensor.shape[-2:]
factors = (h / size[0], w / size[1])
sigmas = (max((factors[0] - 1.0) / 2.0, 0.001), max((factors[1] - 1.0) / 2.0, 0.001))
ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
if (ks[0] % 2) == 0: ks = ks[0] + 1, ks[1]
if (ks[1] % 2) == 0: ks = ks[0], ks[1] + 1
def _compute_padding(kernel_size):
computed = [k - 1 for k in kernel_size]
out_padding = 2 * len(kernel_size) * [0]
for i in range(len(kernel_size)):
computed_tmp = computed[-(i + 1)]
pad_front = computed_tmp // 2
pad_rear = computed_tmp - pad_front
out_padding[2 * i + 0] = pad_front
out_padding[2 * i + 1] = pad_rear
return out_padding
def _filter2d(input_tensor, kernel):
b, c, h, w = input_tensor.shape
tmp_kernel = kernel[:, None, ...].to(device=input_tensor.device, dtype=input_tensor.dtype)
tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
height, width = tmp_kernel.shape[-2:]
padding_shape = _compute_padding([height, width])
input_tensor_padded = F.pad(input_tensor, padding_shape, mode="reflect")
tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
input_tensor_padded = input_tensor_padded.view(-1, tmp_kernel.size(0), input_tensor_padded.size(-2),
input_tensor_padded.size(-1))
output = F.conv2d(input_tensor_padded, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
return output.view(b, c, h, w)
def _gaussian(window_size, sigma):
if isinstance(sigma, float):
sigma = torch.tensor([[sigma]])
x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(
sigma.shape[0], -1)
if window_size % 2 == 0:
x = x + 0.5
gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
return gauss / gauss.sum(-1, keepdim=True)
def _gaussian_blur2d(input_tensor, kernel_size, sigma):
if isinstance(sigma, tuple):
sigma = torch.tensor([sigma], dtype=input_tensor.dtype)
else:
sigma = sigma.to(dtype=input_tensor.dtype)
ky, kx = int(kernel_size[0]), int(kernel_size[1])
bs = sigma.shape[0]
kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
out_x = _filter2d(input_tensor, kernel_x[..., None, :])
return _filter2d(out_x, kernel_y[..., None])
blurred_input = _gaussian_blur2d(input_tensor, ks, sigmas)
return F.interpolate(blurred_input, size=size, mode=interpolation, align_corners=align_corners)