pizb committed
Commit 06b9d96 · 1 Parent(s): d33e75e

missing file update

Files changed (1)
  1. pipeline_svd_mask.py +1042 -0
pipeline_svd_mask.py ADDED
@@ -0,0 +1,1042 @@
# pipeline_svd_mask.py

import inspect
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union

import numpy as np
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from diffusers.image_processor import PipelineImageInput
from diffusers.models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from diffusers.schedulers import EulerDiscreteScheduler
from diffusers.utils import BaseOutput, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
from diffusers.pipelines.pipeline_utils import DiffusionPipeline

# Import necessary helpers from the original SVD pipeline
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
    _append_dims,
    retrieve_timesteps,
    _resize_with_antialiasing,
)
import torch.nn.functional as F
from einops import rearrange


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from pipeline_svd_mask import StableVideoDiffusionPipelineWithMask
        >>> from diffusers.utils import load_image, export_to_video

        >>> # Load your fine-tuned UNet, VAE, etc.
        >>> pipe = StableVideoDiffusionPipelineWithMask.from_pretrained(
        ...     "path/to/your/finetuned_model", torch_dtype=torch.float16, variant="fp16"
        ... )
        >>> pipe.to("cuda")

        >>> # Load the conditioning image and the mask
        >>> image = load_image("path/to/your/conditioning_image.png").resize((1024, 576))
        >>> mask = load_image("path/to/your/mask_image.png").resize((1024, 576))

        >>> # Generate frames
        >>> frames = pipe(
        ...     image=image,
        ...     mask_image=mask,
        ...     num_frames=25,
        ...     decode_chunk_size=8,
        ... ).frames[0]

        >>> export_to_video(frames, "generated_video.mp4", fps=7)
        ```
"""


@dataclass
class StableVideoDiffusionPipelineOutput(BaseOutput):
    r"""
    Output class for the custom Stable Video Diffusion pipeline.

    Args:
        frames (`List[List[PIL.Image.Image]]`, `np.ndarray`, or `torch.Tensor`):
            List of denoised PIL images of length `batch_size`, or a numpy array or torch tensor of shape
            `(batch_size, num_frames, height, width, num_channels)`.
    """

    frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.Tensor]


class StableVideoDiffusionPipelineWithMask(DiffusionPipeline):
    r"""
    A custom pipeline based on Stable Video Diffusion that accepts an additional mask for conditioning.
    This pipeline is designed to work with a UNet fine-tuned to accept 12 input channels
    (4 for noise, 4 for the VAE-encoded condition image, 4 for the VAE-encoded mask).
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

    def _encode_image(
        self,
        image: PipelineImageInput,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
    ) -> torch.Tensor:
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.video_processor.pil_to_numpy(image)
            image = self.video_processor.numpy_to_pt(image)

        # Normalize to [-1, 1], resize with anti-aliasing, then map back to [0, 1] for CLIP
        image = image * 2.0 - 1.0
        image = _resize_with_antialiasing(image, (224, 224))
        image = (image + 1.0) / 2.0

        image = self.feature_extractor(
            images=image,
            do_normalize=True,
            do_center_crop=False,
            do_resize=False,
            do_rescale=False,
            return_tensors="pt",
        ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # As per the training script, the CLIP embedding is zeroed out
        image_embeddings = torch.zeros_like(image_embeddings)

        return image_embeddings

    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
    ):
        image = image.to(device=device, dtype=torch.float16)
        image_latents = self.vae.encode(image).latent_dist.sample()
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
        return image_latents

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
                f"but a vector of {passed_add_embed_dim} was created."
            )
        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
        return add_time_ids

    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
        latents = latents.flatten(0, 1).to(dtype=torch.float16)
        latents = 1 / self.vae.config.scaling_factor * latents
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            frame = self.vae.decode(latents[i : i + decode_chunk_size], num_frames=num_frames_in).sample
            frames.append(frame)
        frames = torch.cat(frames, dim=0)
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
        frames = frames.float()
        return frames

    def check_inputs(self, image, height, width):
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, or `list` but is {type(image)}"
            )
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
        initial_latents: Optional[torch.Tensor] = None,
        denoising_strength: float = 1.0,
        timestep: Optional[torch.Tensor] = None,
    ):
        num_channels_latents = self.unet.config.out_channels
        shape = (
            batch_size,
            num_frames,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )

        if initial_latents is not None:
            # Noise is added to the initial latents
            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            # Get the initial latents at the given timestep
            latents = self.scheduler.add_noise(initial_latents, noise, timestep)
        else:
            # Standard pure-noise generation
            if latents is None:
                latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            else:
                latents = latents.to(device)
            # Scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma

        return latents

    def _encode_video_vae(
        self,
        video_frames: torch.Tensor,  # Expects (B, F, C, H, W)
        device: Union[str, torch.device],
    ):
        video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
        batch_size, num_frames = video_frames.shape[:2]

        # Reshape for VAE encoding
        video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:])  # (B*F, C, H, W)
        latents = self.vae.encode(video_frames_reshaped).latent_dist.sample()  # (B*F, C_latent, H_latent, W_latent)

        # Reshape back to video format
        latents = latents.reshape(batch_size, num_frames, *latents.shape[1:])  # (B, F, C_latent, H_latent, W_latent)

        return latents

    @torch.no_grad()
    def __call__(
        self,
        image: Union[List[PIL.Image.Image], torch.Tensor],
        mask_image: Union[List[PIL.Image.Image], torch.Tensor],
        alpha_matte_image: Optional[Union[List[PIL.Image.Image], torch.Tensor]] = None,
        denoising_strength: float = 0.7,
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        num_inference_steps: int = 30,
        sigmas: Optional[List[float]] = None,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.02,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        mask_noise_strength: float = 0.0,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        if num_frames is None:
            if isinstance(image, list):
                num_frames = len(image)
            else:
                num_frames = self.unet.config.num_frames

        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        self.check_inputs(image, height, width)
        self.check_inputs(mask_image, height, width)
        if alpha_matte_image is not None:
            self.check_inputs(alpha_matte_image, height, width)

        batch_size = 1
        device = self._execution_device
        dtype = self.unet.dtype

        image_for_clip = image[0] if isinstance(image, list) else image
        image_embeddings = self._encode_image(image_for_clip, device, num_videos_per_prompt)

        # Following the original SVD convention, the model is conditioned on `fps - 1`
        fps = fps - 1

        image_tensor = self.video_processor.preprocess(image, height=height, width=width).to(device).unsqueeze(0)
        mask_tensor = self.video_processor.preprocess(mask_image, height=height, width=width).to(device).unsqueeze(0)

        noise = randn_tensor(image_tensor.shape, generator=generator, device=device, dtype=dtype)
        image_tensor = image_tensor + noise_aug_strength * noise

        conditional_latents = self._encode_video_vae(image_tensor, device)
        conditional_latents = conditional_latents / self.vae.config.scaling_factor

        if self.unet.config.in_channels == 12:
            mask_latents = self._encode_video_vae(mask_tensor, device)
            mask_latents = mask_latents / self.vae.config.scaling_factor
        elif self.unet.config.in_channels == 9:
            # Binarize the mask and downsample it to the latent resolution instead of VAE-encoding it
            mask_tensor_gray = mask_tensor.mean(dim=2, keepdim=True)
            binarized_mask = (mask_tensor_gray > 0.0).to(dtype)
            b, f, c, h, w = binarized_mask.shape
            binarized_mask_reshaped = binarized_mask.reshape(b * f, c, h, w)
            target_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
            interpolated_mask = F.interpolate(
                binarized_mask_reshaped,
                size=target_size,
                mode="nearest",
            )
            mask_latents = interpolated_mask.reshape(b, f, *interpolated_mask.shape[1:])
        else:
            raise ValueError(
                f"Unsupported number of UNet input channels: {self.unet.config.in_channels}. "
                "This pipeline only supports 9 (for interpolated mask) or 12 (for VAE mask)."
            )

        if mask_noise_strength > 0.0:
            mask_noise = randn_tensor(mask_latents.shape, generator=generator, device=device, dtype=dtype)
            mask_latents = mask_latents + mask_noise_strength * mask_noise

        added_time_ids = self._get_add_time_ids(
            fps, motion_bucket_id, noise_aug_strength, image_embeddings.dtype, batch_size, num_videos_per_prompt
        )
        added_time_ids = added_time_ids.to(device)

        # --- MODIFIED FOR ALPHA MATTE REFINEMENT ---
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, None, sigmas)

        initial_latents = None

        if alpha_matte_image is not None:
            alpha_matte_tensor = (
                self.video_processor.preprocess(alpha_matte_image, height=height, width=width).to(device).unsqueeze(0)
            )
            initial_latents = self._encode_video_vae(alpha_matte_tensor, device)
            initial_latents = initial_latents / self.vae.config.scaling_factor

            # Adjust the number of steps and the timesteps to start from
            t_start = max(num_inference_steps - int(num_inference_steps * denoising_strength), 0)
            timesteps = timesteps[t_start:]
            # We need the first timestep to add the correct amount of noise
            start_timestep = timesteps[0]
        else:
            start_timestep = timesteps[0]  # Not used, but kept for clarity

        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_frames,
            height,
            width,
            dtype,
            device,
            generator,
            latents,
            initial_latents=initial_latents,
            denoising_strength=denoising_strength,
            timestep=start_timestep if initial_latents is not None else None,
        )

        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)

        with self.progress_bar(total=len(timesteps)) as progress_bar:
            for i, t in enumerate(timesteps):
                latent_model_input = self.scheduler.scale_model_input(latents, t)
                # Concatenate noise, condition-image latents, and mask latents along the channel dim
                latent_model_input = torch.cat([latent_model_input, conditional_latents, mask_latents], dim=2)

                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=image_embeddings,
                    added_time_ids=added_time_ids,
                    return_dict=False,
                )[0]

                latents = self.scheduler.step(noise_pred, t, latents).prev_sample

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        frames = self.decode_latents(latents, num_frames, decode_chunk_size)
        frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames
        return StableVideoDiffusionPipelineOutput(frames=frames)

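# --------------------------------------------------------------------------
# Usage sketch (illustrative, not executed): refining a coarse alpha matte
# with partial denoising. All paths and frame lists below are hypothetical
# placeholders. When `alpha_matte_image` is given, the pipeline VAE-encodes
# the matte, noises it to the timestep implied by `denoising_strength`, and
# runs only the remaining steps, so lower strengths stay closer to the input.
#
#   pipe = StableVideoDiffusionPipelineWithMask.from_pretrained(
#       "path/to/your/finetuned_model", torch_dtype=torch.float16, variant="fp16"
#   ).to("cuda")
#   frames = pipe(
#       image=cond_frames,                      # list of PIL conditioning frames
#       mask_image=mask_frames,                 # list of PIL mask frames
#       alpha_matte_image=coarse_matte_frames,  # coarse matte to refine
#       denoising_strength=0.5,                 # 1.0 would regenerate from pure noise
#       num_inference_steps=30,
#   ).frames[0]
# --------------------------------------------------------------------------
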
class StableVideoDiffusionPipelineOnestepWithMask(DiffusionPipeline):
    r"""
    A single-step variant of the masked Stable Video Diffusion pipeline: the UNet is evaluated once at a
    fixed timestep and its prediction is decoded directly. It is designed to work with a UNet fine-tuned to
    accept 12 input channels (4 for noise, 4 for the VAE-encoded condition image, 4 for the VAE-encoded mask).
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        image_encoder: CLIPVisionModelWithProjection,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        feature_extractor: CLIPImageProcessor,
    ):
        super().__init__()

        self.register_modules(
            vae=vae,
            image_encoder=image_encoder,
            unet=unet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.video_processor = VideoProcessor(do_resize=True, vae_scale_factor=self.vae_scale_factor)

    def _encode_image(
        self,
        image: PipelineImageInput,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
    ) -> torch.Tensor:
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.video_processor.pil_to_numpy(image)
            image = self.video_processor.numpy_to_pt(image)

        image = image * 2.0 - 1.0
        image = _resize_with_antialiasing(image, (224, 224))
        image = (image + 1.0) / 2.0

        image = self.feature_extractor(
            images=image,
            do_normalize=True,
            do_center_crop=False,
            do_resize=False,
            do_rescale=False,
            return_tensors="pt",
        ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
        image_embeddings = image_embeddings.unsqueeze(1)

        # As per the training script, the CLIP embedding is zeroed out
        image_embeddings = torch.zeros_like(image_embeddings)

        return image_embeddings

    def _encode_vae_image(
        self,
        image: torch.Tensor,
        device: Union[str, torch.device],
        num_videos_per_prompt: int,
    ):
        image = image.to(device=device, dtype=torch.float16)
        image_latents = self.vae.encode(image).latent_dist.sample()
        image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
        return image_latents

    def _get_add_time_ids(
        self,
        fps: int,
        motion_bucket_id: int,
        noise_aug_strength: float,
        dtype: torch.dtype,
        batch_size: int,
        num_videos_per_prompt: int,
    ):
        add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
                f"but a vector of {passed_add_embed_dim} was created."
            )
        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
        add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
        return add_time_ids

    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int = 14):
        latents = latents.flatten(0, 1).to(dtype=torch.float16)
        latents = 1 / self.vae.config.scaling_factor * latents
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            frame = self.vae.decode(latents[i : i + decode_chunk_size], num_frames=num_frames_in).sample
            frames.append(frame)
        frames = torch.cat(frames, dim=0)
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
        frames = frames.float()
        return frames

    def check_inputs(self, image, height, width):
        if (
            not isinstance(image, torch.Tensor)
            and not isinstance(image, PIL.Image.Image)
            and not isinstance(image, list)
        ):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image`, or `list` but is {type(image)}"
            )
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    def prepare_latents(
        self,
        batch_size: int,
        num_frames: int,
        height: int,
        width: int,
        dtype: torch.dtype,
        device: Union[str, torch.device],
        generator: torch.Generator,
        latents: Optional[torch.Tensor] = None,
    ):
        # The number of channels for the initial noise is based on the UNet's out_channels
        num_channels_latents = self.unet.config.out_channels
        shape = (
            batch_size,
            num_frames,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(f"batch size {batch_size} must match the length of the generators {len(generator)}.")

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        latents = latents * self.scheduler.init_noise_sigma
        return latents

    def _encode_video_vae(
        self,
        video_frames: torch.Tensor,  # Expects (B, F, C, H, W)
        device: Union[str, torch.device],
    ):
        video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
        batch_size, num_frames = video_frames.shape[:2]

        # Reshape for VAE encoding
        video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:])  # (B*F, C, H, W)
        latents = self.vae.encode(video_frames_reshaped).latent_dist.sample()  # (B*F, C_latent, H_latent, W_latent)

        # Reshape back to video format
        latents = latents.reshape(batch_size, num_frames, *latents.shape[1:])  # (B, F, C_latent, H_latent, W_latent)

        return latents

    @torch.no_grad()
    def __call__(
        self,
        image: Union[List[PIL.Image.Image], torch.Tensor],
        mask_image: Union[List[PIL.Image.Image], torch.Tensor],
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.0,
        decode_chunk_size: Optional[int] = None,
        num_videos_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        mask_noise_strength: float = 0.0,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor

        if num_frames is None:
            if isinstance(image, list):
                num_frames = len(image)
            else:
                num_frames = self.unet.config.num_frames

        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        self.check_inputs(image, height, width)
        self.check_inputs(mask_image, height, width)
        if isinstance(image, list) and isinstance(mask_image, list):
            if len(image) != len(mask_image):
                raise ValueError("`image` and `mask_image` must have the same number of frames.")
            if num_frames != len(image):
                logger.warning(
                    f"Mismatch between `num_frames` ({num_frames}) and number of input images ({len(image)}). "
                    f"Using {len(image)}."
                )
                num_frames = len(image)

        batch_size = 1
        device = self._execution_device
        dtype = self.unet.dtype

        image_for_clip = image[0] if isinstance(image, list) else image
        image_embeddings = self._encode_image(image_for_clip, device, num_videos_per_prompt)

        # Following the original SVD convention, the model is conditioned on `fps - 1`
        fps = fps - 1

        image_tensor = self.video_processor.preprocess(image, height=height, width=width).to(device).unsqueeze(0)
        mask_tensor = self.video_processor.preprocess(mask_image, height=height, width=width).to(device).unsqueeze(0)

        noise = randn_tensor(image_tensor.shape, generator=generator, device=device, dtype=dtype)
        image_tensor = image_tensor + noise_aug_strength * noise

        conditional_latents = self._encode_video_vae(image_tensor, device)
        conditional_latents = conditional_latents / self.vae.config.scaling_factor

        if self.unet.config.in_channels == 12:
            mask_latents = self._encode_video_vae(mask_tensor, device)
            mask_latents = mask_latents / self.vae.config.scaling_factor
        elif self.unet.config.in_channels == 9:
            mask_tensor_gray = mask_tensor.mean(dim=2, keepdim=True)
            binarized_mask = (mask_tensor_gray > 0.0).to(dtype)
            b, f, c, h, w = binarized_mask.shape
            binarized_mask_reshaped = binarized_mask.reshape(b * f, c, h, w)
            target_size = (height // self.vae_scale_factor, width // self.vae_scale_factor)
            interpolated_mask = F.interpolate(
                binarized_mask_reshaped,
                size=target_size,
                mode="nearest",
            )
            mask_latents = interpolated_mask.reshape(b, f, *interpolated_mask.shape[1:])
        else:
            raise ValueError(
                f"Unsupported number of UNet input channels: {self.unet.config.in_channels}. "
                "This pipeline only supports 9 (for interpolated mask) or 12 (for VAE mask)."
            )

        if mask_noise_strength > 0.0:
            mask_noise = randn_tensor(mask_latents.shape, generator=generator, device=device, dtype=dtype)
            mask_latents = mask_latents + mask_noise_strength * mask_noise

        added_time_ids = self._get_add_time_ids(
            fps, motion_bucket_id, noise_aug_strength, image_embeddings.dtype, batch_size, num_videos_per_prompt
        )
        added_time_ids = added_time_ids.to(device)

        # **MODIFIED FOR SINGLE-STEP**: Prepare initial noise
        num_channels_latents = self.unet.config.out_channels
        shape = (
            batch_size * num_videos_per_prompt,
            num_frames,
            num_channels_latents,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

        # **MODIFIED FOR SINGLE-STEP**: Set a fixed high timestep
        timestep = torch.tensor([1.0], dtype=dtype, device=device)  # Use a high sigma value

        # **MODIFIED FOR SINGLE-STEP**: Single forward pass
        latent_model_input = torch.cat([latents, conditional_latents, mask_latents], dim=2)

        noise_pred = self.unet(
            latent_model_input,
            timestep,
            encoder_hidden_states=image_embeddings,
            added_time_ids=added_time_ids,
            return_dict=False,
        )[0]

        # The model's prediction is the final denoised latent
        denoised_latents = noise_pred

        frames = self.decode_latents(denoised_latents, num_frames, decode_chunk_size)
        frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames
        return StableVideoDiffusionPipelineOutput(frames=frames)

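# --------------------------------------------------------------------------
# Usage sketch (illustrative, not executed): single-step generation. The
# one-step variant skips the scheduler loop entirely; the UNet is queried
# once at a fixed timestep and its prediction is decoded directly, so it is
# only meaningful with a UNet distilled or fine-tuned for one-step
# prediction. The checkpoint path and frame lists are hypothetical.
#
#   pipe = StableVideoDiffusionPipelineOnestepWithMask.from_pretrained(
#       "path/to/your/onestep_model", torch_dtype=torch.float16
#   ).to("cuda")
#   frames = pipe(image=cond_frames, mask_image=mask_frames, decode_chunk_size=8).frames[0]
# --------------------------------------------------------------------------
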
class StableVideoDiffusionPipelineWithCrossAtnnMask(DiffusionPipeline):
    r"""
    A custom pipeline that conditions the UNet on a static appearance image (concatenated in latent space)
    and on a video mask injected through cross-attention via an external `mask_projector` module.
    """

    model_cpu_offload_seq = "image_encoder->unet->vae"
    _callback_tensor_inputs = ["latents"]

    def __init__(
        self,
        vae: AutoencoderKLTemporalDecoder,
        unet: UNetSpatioTemporalConditionModel,
        scheduler: EulerDiscreteScheduler,
        mask_projector: torch.nn.Module,
        # CLIP models are not strictly needed for inference if embeddings are not used
        image_encoder: CLIPVisionModelWithProjection = None,
        feature_extractor: CLIPImageProcessor = None,
    ):
        super().__init__()
        self.register_modules(
            vae=vae,
            unet=unet,
            scheduler=scheduler,
            mask_projector=mask_projector,
            image_encoder=image_encoder,
            feature_extractor=feature_extractor,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)

    def _encode_image_vae(self, image: torch.Tensor, device: Union[str, torch.device]):
        image = image.to(device=device, dtype=self.vae.dtype)
        latent = self.vae.encode(image).latent_dist.sample()
        return latent

    def decode_latents(self, latents: torch.Tensor, num_frames: int, decode_chunk_size: int):
        latents = latents.flatten(0, 1).to(dtype=torch.float16)
        latents = 1 / self.vae.config.scaling_factor * latents
        frames = []
        for i in range(0, latents.shape[0], decode_chunk_size):
            # Use the actual chunk length; the final chunk can be smaller than decode_chunk_size
            num_frames_in = latents[i : i + decode_chunk_size].shape[0]
            frame = self.vae.decode(latents[i : i + decode_chunk_size], num_frames=num_frames_in).sample
            frames.append(frame)

        frames = torch.cat(frames, dim=0)
        frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4)
        frames = frames.float()
        return frames

    def _encode_video_vae(
        self,
        video_frames: torch.Tensor,  # Expects (B, F, C, H, W)
        device: Union[str, torch.device],
    ):
        video_frames = video_frames.to(device=device, dtype=self.vae.dtype)
        batch_size, num_frames = video_frames.shape[:2]

        # Reshape for VAE encoding
        video_frames_reshaped = video_frames.reshape(batch_size * num_frames, *video_frames.shape[2:])  # (B*F, C, H, W)
        latents = self.vae.encode(video_frames_reshaped).latent_dist.sample()  # (B*F, C_latent, H_latent, W_latent)

        # Reshape back to video format
        latents = latents.reshape(batch_size, num_frames, *latents.shape[1:])  # (B, F, C_latent, H_latent, W_latent)

        return latents

    @torch.no_grad()
    def __call__(
        self,
        image: Union[PIL.Image.Image, torch.Tensor],  # Static image for appearance
        mask_image: List[PIL.Image.Image],  # Video mask for motion
        height: int = 576,
        width: int = 1024,
        num_frames: Optional[int] = None,
        num_inference_steps: int = 25,
        fps: int = 7,
        motion_bucket_id: int = 127,
        noise_aug_strength: float = 0.0,  # Noise is added to latents now
        decode_chunk_size: Optional[int] = 8,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
    ):
        device = self._execution_device
        dtype = self.unet.dtype
        num_frames = num_frames if num_frames is not None else len(mask_image)
        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames

        # 1. PREPARE STATIC IMAGE CONDITION
        image_tensor = self.video_processor.preprocess(image, height, width).to(device).unsqueeze(0)
        conditional_latents = self._encode_video_vae(image_tensor, device)
        conditional_latents = conditional_latents / self.vae.config.scaling_factor

        # 2. PREPARE MASK MOTION CONDITION
        mask_tensor = self.video_processor.preprocess(mask_image, height, width)
        if mask_tensor.shape[1] > 1:
            # Collapse RGB masks to a single channel
            mask_tensor = mask_tensor.mean(dim=1, keepdim=True)

        # The projector expects a stack of frames: (F, C, H, W)
        mask_for_projection = mask_tensor.to(device, dtype)
        encoder_hidden_states = self.mask_projector(mask_for_projection)
        encoder_hidden_states = encoder_hidden_states.unsqueeze(1)  # (F, 1, D)
        # Add a batch dimension for the UNet
        encoder_hidden_states = encoder_hidden_states.unsqueeze(0)  # (1, F, 1, D)
        # The UNet will handle flattening this to (B*F, 1, D) where B=1.
        # To be safe, we pass it pre-flattened.
        encoder_hidden_states = rearrange(encoder_hidden_states, "b f s d -> (b f) s d")

        # 3. PREPARE LATENTS
        shape = (
            1,
            num_frames,
            self.unet.config.out_channels,
            height // self.vae_scale_factor,
            width // self.vae_scale_factor,
        )
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        if noise_aug_strength > 0:
            latents += noise_aug_strength * randn_tensor(latents.shape, generator=generator, device=device, dtype=dtype)
        latents = latents * self.scheduler.init_noise_sigma

        # 4. GET ADDED TIME IDS
        # For this pipeline, batch size is 1
        added_time_ids = [fps - 1, motion_bucket_id, 0.0]  # noise_aug_strength for add_time_ids is 0 for inference
        added_time_ids = torch.tensor([added_time_ids], dtype=dtype, device=device)

        # 5. DENOISING LOOP
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for t in timesteps:
                latent_model_input = self.scheduler.scale_model_input(latents, t)
                unet_input = torch.cat([latent_model_input, conditional_latents], dim=2)

                noise_pred = self.unet(
                    unet_input, t, encoder_hidden_states=encoder_hidden_states, added_time_ids=added_time_ids
                ).sample

                latents = self.scheduler.step(noise_pred, t, latents).prev_sample
                progress_bar.update()

        # 6. DECODE
        frames = self.decode_latents(latents, num_frames, decode_chunk_size)
        frames = self.video_processor.postprocess_video(video=frames, output_type=output_type)

        if not return_dict:
            return (frames,)
        return StableVideoDiffusionPipelineOutput(frames=frames)

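# A minimal sketch of a `mask_projector` compatible with the pipeline above.
# The contract implied by `__call__` is: map a stack of single-channel mask
# frames (F, 1, H, W) to one embedding per frame (F, D), where D matches the
# UNet's cross-attention dimension. This toy conv-pool-linear projector is an
# assumption for illustration, not the module the checkpoint was trained with.
class ToyMaskProjector(torch.nn.Module):
    def __init__(self, embed_dim: int = 1024):
        super().__init__()
        self.features = torch.nn.Sequential(
            torch.nn.Conv2d(1, 64, kernel_size=3, stride=2, padding=1),
            torch.nn.SiLU(),
            torch.nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            torch.nn.SiLU(),
            torch.nn.AdaptiveAvgPool2d(1),  # (F, 128, 1, 1)
        )
        self.proj = torch.nn.Linear(128, embed_dim)

    def forward(self, masks: torch.Tensor) -> torch.Tensor:
        # masks: (F, 1, H, W) -> per-frame embeddings (F, embed_dim)
        feats = self.features(masks).flatten(1)
        return self.proj(feats)
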
# pipeline.py

import torch
import torch.nn.functional as F
from PIL import Image
from einops import rearrange
from torchvision import transforms
from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection


class VideoInferencePipeline:
    """
    A reusable pipeline for single-step video diffusion inference.

    This class encapsulates the models and the core inference logic,
    separating it from data loading and saving, which can vary between tasks.
    """

    def __init__(
        self,
        base_model_path: str,
        unet_checkpoint_path: str,
        device: str = "cuda",
        weight_dtype: torch.dtype = torch.float16,
    ):
        """
        Loads all necessary models into memory.

        Args:
            base_model_path (str): Path to the base Stable Video Diffusion model.
            unet_checkpoint_path (str): Path to the fine-tuned UNet checkpoint.
            device (str): The device to run models on ('cuda' or 'cpu').
            weight_dtype (torch.dtype): The precision for model weights (float16 or bfloat16).
        """
        print("--- Initializing Inference Pipeline and Loading Models ---")
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.weight_dtype = weight_dtype

        # Load models from pretrained paths
        try:
            self.feature_extractor = CLIPImageProcessor.from_pretrained(base_model_path, subfolder="feature_extractor")
            self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
                base_model_path, subfolder="image_encoder", variant="fp16"
            )
            self.vae = AutoencoderKLTemporalDecoder.from_pretrained(base_model_path, subfolder="vae", variant="fp16")
            self.unet = UNetSpatioTemporalConditionModel.from_pretrained(unet_checkpoint_path, subfolder="unet")
        except Exception as e:
            raise IOError(f"Fatal error loading models: {e}") from e

        # Move models to the specified device and set to evaluation mode
        self.image_encoder.to(self.device, dtype=self.weight_dtype).eval()
        self.vae.to(self.device, dtype=self.weight_dtype).eval()
        self.unet.to(self.device, dtype=self.weight_dtype).eval()

        print(f"--- Models Loaded Successfully on {self.device} ---")

    def run(
        self,
        cond_frames,
        mask_frames,
        seed=42,
        mask_cond_mode="vae",
        fps=7,
        motion_bucket_id=127,
        noise_aug_strength=0.0,
    ):
        """
        Runs the core inference process on a sequence of conditioning and mask frames.

        Args:
            cond_frames (list[Image.Image]): List of PIL images for conditioning.
            mask_frames (list[Image.Image]): List of PIL images for the masks.
            seed (int): Random seed for generation.
            mask_cond_mode (str): How the mask is conditioned ("vae" or "interpolate").
            fps (int): Frames per second to condition the model with.
            motion_bucket_id (int): Motion bucket ID for conditioning.
            noise_aug_strength (float): Noise augmentation strength.

        Returns:
            list[Image.Image]: A list of the generated video frames as PIL Images.
        """
        # --- 1. Prepare Tensors ---
        cond_video_tensor = self._pil_to_tensor(cond_frames).to(self.device)
        mask_video_tensor = self._pil_to_tensor(mask_frames).to(self.device)

        if mask_video_tensor.shape[2] != 3:
            mask_video_tensor = mask_video_tensor.repeat(1, 1, 3, 1, 1)

        with torch.no_grad():
            # --- 2. Get CLIP Image Embeddings ---
            first_frame_tensor = cond_video_tensor[:, 0, :, :, :]
            pixel_values_for_clip = self._resize_with_antialiasing(first_frame_tensor, (224, 224))
            pixel_values_for_clip = ((pixel_values_for_clip + 1.0) / 2.0).clamp(0, 1)
            pixel_values = self.feature_extractor(images=pixel_values_for_clip, return_tensors="pt").pixel_values
            image_embeddings = self.image_encoder(pixel_values.to(self.device, dtype=self.weight_dtype)).image_embeds
            # The UNet was trained with zeroed CLIP embeddings
            encoder_hidden_states = torch.zeros_like(image_embeddings).unsqueeze(1)

            # --- 3. Prepare Latents ---
            cond_latents = self._tensor_to_vae_latent(cond_video_tensor.to(self.weight_dtype))
            cond_latents = cond_latents / self.vae.config.scaling_factor

            if mask_cond_mode == "vae":
                mask_latents = self._tensor_to_vae_latent(mask_video_tensor.to(self.weight_dtype))
                mask_latents = mask_latents / self.vae.config.scaling_factor
            elif mask_cond_mode == "interpolate":
                target_shape = cond_latents.shape[-2:]
                b, t, c, h, w = mask_video_tensor.shape
                mask_video_reshaped = rearrange(mask_video_tensor, "b t c h w -> (b t) c h w")
                interpolated_mask = F.interpolate(
                    mask_video_reshaped, size=target_shape, mode="bilinear", align_corners=False
                )
                mask_latents = rearrange(interpolated_mask, "(b t) c h w -> b t c h w", b=b)
            else:
                raise ValueError(f"Unknown mask_cond_mode: {mask_cond_mode}")

            # --- 4. Run UNet Single-Step Inference ---
            generator = torch.Generator(device=self.device).manual_seed(seed)
            noisy_latents = torch.randn(
                cond_latents.shape, generator=generator, device=self.device, dtype=self.weight_dtype
            )
            # Fixed timestep for single-step inference
            timesteps = torch.full((1,), 1.0, device=self.device, dtype=torch.long)
            added_time_ids = self._get_add_time_ids(fps, motion_bucket_id, noise_aug_strength, batch_size=1)

            unet_input = torch.cat([noisy_latents, cond_latents, mask_latents], dim=2)
            pred_latents = self.unet(unet_input, timesteps, encoder_hidden_states, added_time_ids=added_time_ids).sample

            # --- 5. Decode Latents to Video Frames ---
            pred_latents = (1 / self.vae.config.scaling_factor) * pred_latents.squeeze(0)

            frames = []
            # Process in chunks to avoid VRAM issues, especially for long videos
            for i in range(0, pred_latents.shape[0], 8):
                chunk = pred_latents[i : i + 8]
                decoded_chunk = self.vae.decode(chunk, num_frames=chunk.shape[0]).sample
                frames.append(decoded_chunk)

            video_tensor = torch.cat(frames, dim=0)
            # Map to [0, 1] and collapse to grayscale replicated over 3 channels (matte output)
            video_tensor = (video_tensor / 2.0 + 0.5).clamp(0, 1).mean(dim=1, keepdim=True).repeat(1, 3, 1, 1)

        # Return a list of PIL images
        return [transforms.ToPILImage()(frame) for frame in video_tensor]

    def _pil_to_tensor(self, frames: list[Image.Image]):
        """Converts a list of PIL images to a normalized (B, F, C, H, W) video tensor in [-1, 1]."""
        video_tensor = torch.stack([transforms.ToTensor()(f) for f in frames]).unsqueeze(0)
        return video_tensor * 2.0 - 1.0

    def _tensor_to_vae_latent(self, t: torch.Tensor):
        """Encodes a video tensor into the VAE's latent space."""
        video_length = t.shape[1]
        t = rearrange(t, "b f c h w -> (b f) c h w")
        latents = self.vae.encode(t).latent_dist.sample()
        latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
        return latents * self.vae.config.scaling_factor

    def _get_add_time_ids(self, fps, motion_bucket_id, noise_aug_strength, batch_size):
        """Creates the additional time IDs for conditioning the UNet."""
        add_time_ids_list = [fps, motion_bucket_id, noise_aug_strength]
        passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids_list)
        expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
        if expected_add_embed_dim != passed_add_embed_dim:
            raise ValueError(
                f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
                f"but a vector of {passed_add_embed_dim} was created."
            )
        add_time_ids = torch.tensor([add_time_ids_list], dtype=self.weight_dtype, device=self.device)
        return add_time_ids.repeat(batch_size, 1)

    def _resize_with_antialiasing(self, input_tensor, size, interpolation="bicubic", align_corners=True):
        """
        Resizes a tensor with anti-aliasing for CLIP input, mirroring k-diffusion.
        This is a direct copy of the helper function from the original scripts.
        """
        h, w = input_tensor.shape[-2:]
        factors = (h / size[0], w / size[1])
        sigmas = (max((factors[0] - 1.0) / 2.0, 0.001), max((factors[1] - 1.0) / 2.0, 0.001))
        ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
        # Gaussian kernel sizes must be odd
        if (ks[0] % 2) == 0:
            ks = ks[0] + 1, ks[1]
        if (ks[1] % 2) == 0:
            ks = ks[0], ks[1] + 1

        def _compute_padding(kernel_size):
            computed = [k - 1 for k in kernel_size]
            out_padding = 2 * len(kernel_size) * [0]
            for i in range(len(kernel_size)):
                computed_tmp = computed[-(i + 1)]
                pad_front = computed_tmp // 2
                pad_rear = computed_tmp - pad_front
                out_padding[2 * i + 0] = pad_front
                out_padding[2 * i + 1] = pad_rear
            return out_padding

        def _filter2d(input_tensor, kernel):
            b, c, h, w = input_tensor.shape
            tmp_kernel = kernel[:, None, ...].to(device=input_tensor.device, dtype=input_tensor.dtype)
            tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
            height, width = tmp_kernel.shape[-2:]
            padding_shape = _compute_padding([height, width])
            input_tensor_padded = F.pad(input_tensor, padding_shape, mode="reflect")
            tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
            input_tensor_padded = input_tensor_padded.view(
                -1, tmp_kernel.size(0), input_tensor_padded.size(-2), input_tensor_padded.size(-1)
            )
            output = F.conv2d(input_tensor_padded, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
            return output.view(b, c, h, w)

        def _gaussian(window_size, sigma):
            if isinstance(sigma, float):
                sigma = torch.tensor([[sigma]])
            x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(
                sigma.shape[0], -1
            )
            if window_size % 2 == 0:
                x = x + 0.5
            gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
            return gauss / gauss.sum(-1, keepdim=True)

        def _gaussian_blur2d(input_tensor, kernel_size, sigma):
            if isinstance(sigma, tuple):
                sigma = torch.tensor([sigma], dtype=input_tensor.dtype)
            else:
                sigma = sigma.to(dtype=input_tensor.dtype)
            ky, kx = int(kernel_size[0]), int(kernel_size[1])
            bs = sigma.shape[0]
            kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
            kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
            out_x = _filter2d(input_tensor, kernel_x[..., None, :])
            return _filter2d(out_x, kernel_y[..., None])

        blurred_input = _gaussian_blur2d(input_tensor, ks, sigmas)
        return F.interpolate(blurred_input, size=size, mode=interpolation, align_corners=align_corners)
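
# --------------------------------------------------------------------------
# Usage sketch (illustrative, not executed) for VideoInferencePipeline. The
# model paths, frame locations, and frame count are hypothetical placeholders.
#
#   from PIL import Image
#
#   cond_frames = [Image.open(f"frames/{i:04d}.png").convert("RGB") for i in range(14)]
#   mask_frames = [Image.open(f"masks/{i:04d}.png").convert("L") for i in range(14)]
#
#   pipe = VideoInferencePipeline(
#       base_model_path="path/to/base_svd_model",
#       unet_checkpoint_path="path/to/finetuned_unet_checkpoint",
#   )
#   matte_frames = pipe.run(cond_frames, mask_frames, seed=42, mask_cond_mode="vae")
#   for i, frame in enumerate(matte_frames):
#       frame.save(f"out/matte_{i:04d}.png")
# --------------------------------------------------------------------------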