hkomp commited on
Commit
a13d12f
·
1 Parent(s): a00b35e

Add model and code

Browse files
Files changed (26) hide show
  1. diffusers_sv3d/__init__.py +2 -0
  2. diffusers_sv3d/__pycache__/__init__.cpython-311.pyc +0 -0
  3. diffusers_sv3d/models/__init__.py +1 -0
  4. diffusers_sv3d/models/__pycache__/__init__.cpython-311.pyc +0 -0
  5. diffusers_sv3d/models/unets/__init__.py +1 -0
  6. diffusers_sv3d/models/unets/__pycache__/__init__.cpython-311.pyc +0 -0
  7. diffusers_sv3d/models/unets/__pycache__/unet_spatio_temporal_condition.cpython-311.pyc +0 -0
  8. diffusers_sv3d/models/unets/unet_spatio_temporal_condition.py +483 -0
  9. diffusers_sv3d/pipelines/__init__.py +1 -0
  10. diffusers_sv3d/pipelines/__pycache__/__init__.cpython-311.pyc +0 -0
  11. diffusers_sv3d/pipelines/stable_video_diffusion/__init__.py +2 -0
  12. diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/__init__.cpython-311.pyc +0 -0
  13. diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_3d_diffusion.cpython-311.pyc +0 -0
  14. diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_3d_diffusion_rotate.cpython-311.pyc +0 -0
  15. diffusers_sv3d/pipelines/stable_video_diffusion/pipeline_stable_video_3d_diffusion.py +469 -0
  16. diffusers_sv3d/pipelines/stable_video_diffusion/pipeline_stable_video_3d_diffusion_rotate.py +371 -0
  17. pretrained_sv3d/feature_extractor/preprocessor_config.json +27 -0
  18. pretrained_sv3d/image_encoder/config.json +23 -0
  19. pretrained_sv3d/image_encoder/model.safetensors +3 -0
  20. pretrained_sv3d/model_index.json +3 -0
  21. pretrained_sv3d/scheduler/scheduler_config.json +22 -0
  22. pretrained_sv3d/unet/config.json +37 -0
  23. pretrained_sv3d/unet/diffusion_pytorch_model.safetensors +3 -0
  24. pretrained_sv3d/vae/config.json +38 -0
  25. pretrained_sv3d/vae/diffusion_pytorch_model.safetensors +3 -0
  26. train.py +79 -0
diffusers_sv3d/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .models import SV3DUNetSpatioTemporalConditionModel
2
+ from .pipelines import StableVideo3DDiffusionPipeline, StableVideo3DDiffusionPipelineRotate
diffusers_sv3d/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (378 Bytes). View file
 
diffusers_sv3d/models/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .unets import SV3DUNetSpatioTemporalConditionModel
diffusers_sv3d/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (251 Bytes). View file
 
diffusers_sv3d/models/unets/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .unet_spatio_temporal_condition import SV3DUNetSpatioTemporalConditionModel
diffusers_sv3d/models/unets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (282 Bytes). View file
 
diffusers_sv3d/models/unets/__pycache__/unet_spatio_temporal_condition.cpython-311.pyc ADDED
Binary file (24.2 kB). View file
 
diffusers_sv3d/models/unets/unet_spatio_temporal_condition.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+
3
+ from diffusers.models.unets.unet_spatio_temporal_condition import *
4
+
5
+
6
+ # Copied from diffusers.models.unets.unet_spatio_temporal_condition UNetSpatioTemporalConditionModel
7
+ class SV3DUNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
8
+ r"""
9
+ A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and
10
+ returns a sample shaped output.
11
+
12
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
13
+ for all models (such as downloading or saving).
14
+
15
+ Parameters:
16
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
17
+ Height and width of input/output sample.
18
+ in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
19
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
20
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
21
+ The tuple of downsample blocks to use.
22
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
23
+ The tuple of upsample blocks to use.
24
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
25
+ The tuple of output channels for each block.
26
+ addition_time_embed_dim: (`int`, defaults to 256):
27
+ Dimension to to encode the additional time ids.
28
+ projection_class_embeddings_input_dim (`int`, defaults to 768):
29
+ The dimension of the projection of encoded `added_time_ids`.
30
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
31
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
32
+ The dimension of the cross attention features.
33
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
34
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
35
+ [`~models.unets.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`],
36
+ [`~models.unets.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
37
+ [`~models.unets.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
38
+ num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
39
+ The number of attention heads.
40
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
41
+ """
42
+
43
+ _supports_gradient_checkpointing = True
44
+
45
+ @register_to_config
46
+ def __init__(
47
+ self,
48
+ sample_size: Optional[int] = None,
49
+ in_channels: int = 8,
50
+ out_channels: int = 4,
51
+ down_block_types: Tuple[str] = (
52
+ "CrossAttnDownBlockSpatioTemporal",
53
+ "CrossAttnDownBlockSpatioTemporal",
54
+ "CrossAttnDownBlockSpatioTemporal",
55
+ "DownBlockSpatioTemporal",
56
+ ),
57
+ up_block_types: Tuple[str] = (
58
+ "UpBlockSpatioTemporal",
59
+ "CrossAttnUpBlockSpatioTemporal",
60
+ "CrossAttnUpBlockSpatioTemporal",
61
+ "CrossAttnUpBlockSpatioTemporal",
62
+ ),
63
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
64
+ addition_time_embed_dim: int = 256,
65
+ projection_class_embeddings_input_dim: int = 768,
66
+ layers_per_block: Union[int, Tuple[int]] = 2,
67
+ cross_attention_dim: Union[int, Tuple[int]] = 1024,
68
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
69
+ num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
70
+ num_frames: int = 25,
71
+ ):
72
+ super().__init__()
73
+
74
+ self.sample_size = sample_size
75
+
76
+ # Check inputs
77
+ if len(down_block_types) != len(up_block_types):
78
+ raise ValueError(
79
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
80
+ )
81
+
82
+ if len(block_out_channels) != len(down_block_types):
83
+ raise ValueError(
84
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
85
+ )
86
+
87
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
88
+ raise ValueError(
89
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
90
+ )
91
+
92
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
93
+ raise ValueError(
94
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
95
+ )
96
+
97
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
98
+ raise ValueError(
99
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
100
+ )
101
+
102
+ # input
103
+ self.conv_in = nn.Conv2d(
104
+ in_channels,
105
+ block_out_channels[0],
106
+ kernel_size=3,
107
+ padding=1,
108
+ )
109
+
110
+ # time
111
+ time_embed_dim = block_out_channels[0] * 4
112
+
113
+ self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
114
+ timestep_input_dim = block_out_channels[0]
115
+
116
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
117
+
118
+ self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
119
+ self.add_angle_proj = Timesteps(2*addition_time_embed_dim, True, downscale_freq_shift=0) # encode camera angles
120
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
121
+
122
+ self.down_blocks = nn.ModuleList([])
123
+ self.up_blocks = nn.ModuleList([])
124
+
125
+ if isinstance(num_attention_heads, int):
126
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
127
+
128
+ if isinstance(cross_attention_dim, int):
129
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
130
+
131
+ if isinstance(layers_per_block, int):
132
+ layers_per_block = [layers_per_block] * len(down_block_types)
133
+
134
+ if isinstance(transformer_layers_per_block, int):
135
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
136
+
137
+ blocks_time_embed_dim = time_embed_dim
138
+
139
+ # down
140
+ output_channel = block_out_channels[0]
141
+ for i, down_block_type in enumerate(down_block_types):
142
+ input_channel = output_channel
143
+ output_channel = block_out_channels[i]
144
+ is_final_block = i == len(block_out_channels) - 1
145
+
146
+ down_block = get_down_block(
147
+ down_block_type,
148
+ num_layers=layers_per_block[i],
149
+ transformer_layers_per_block=transformer_layers_per_block[i],
150
+ in_channels=input_channel,
151
+ out_channels=output_channel,
152
+ temb_channels=blocks_time_embed_dim,
153
+ add_downsample=not is_final_block,
154
+ resnet_eps=1e-5,
155
+ cross_attention_dim=cross_attention_dim[i],
156
+ num_attention_heads=num_attention_heads[i],
157
+ resnet_act_fn="silu",
158
+ )
159
+ self.down_blocks.append(down_block)
160
+
161
+ # mid
162
+ self.mid_block = UNetMidBlockSpatioTemporal(
163
+ block_out_channels[-1],
164
+ temb_channels=blocks_time_embed_dim,
165
+ transformer_layers_per_block=transformer_layers_per_block[-1],
166
+ cross_attention_dim=cross_attention_dim[-1],
167
+ num_attention_heads=num_attention_heads[-1],
168
+ )
169
+
170
+ # count how many layers upsample the images
171
+ self.num_upsamplers = 0
172
+
173
+ # up
174
+ reversed_block_out_channels = list(reversed(block_out_channels))
175
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
176
+ reversed_layers_per_block = list(reversed(layers_per_block))
177
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
178
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
179
+
180
+ output_channel = reversed_block_out_channels[0]
181
+ for i, up_block_type in enumerate(up_block_types):
182
+ is_final_block = i == len(block_out_channels) - 1
183
+
184
+ prev_output_channel = output_channel
185
+ output_channel = reversed_block_out_channels[i]
186
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
187
+
188
+ # add upsample block for all BUT final layer
189
+ if not is_final_block:
190
+ add_upsample = True
191
+ self.num_upsamplers += 1
192
+ else:
193
+ add_upsample = False
194
+
195
+ up_block = get_up_block(
196
+ up_block_type,
197
+ num_layers=reversed_layers_per_block[i] + 1,
198
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
199
+ in_channels=input_channel,
200
+ out_channels=output_channel,
201
+ prev_output_channel=prev_output_channel,
202
+ temb_channels=blocks_time_embed_dim,
203
+ add_upsample=add_upsample,
204
+ resnet_eps=1e-5,
205
+ resolution_idx=i,
206
+ cross_attention_dim=reversed_cross_attention_dim[i],
207
+ num_attention_heads=reversed_num_attention_heads[i],
208
+ resnet_act_fn="silu",
209
+ )
210
+ self.up_blocks.append(up_block)
211
+ prev_output_channel = output_channel
212
+
213
+ # out
214
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
215
+ self.conv_act = nn.SiLU()
216
+
217
+ self.conv_out = nn.Conv2d(
218
+ block_out_channels[0],
219
+ out_channels,
220
+ kernel_size=3,
221
+ padding=1,
222
+ )
223
+
224
+ @property
225
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
226
+ r"""
227
+ Returns:
228
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
229
+ indexed by its weight name.
230
+ """
231
+ # set recursively
232
+ processors = {}
233
+
234
+ def fn_recursive_add_processors(
235
+ name: str,
236
+ module: torch.nn.Module,
237
+ processors: Dict[str, AttentionProcessor],
238
+ ):
239
+ if hasattr(module, "get_processor"):
240
+ processors[f"{name}.processor"] = module.get_processor()
241
+
242
+ for sub_name, child in module.named_children():
243
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
244
+
245
+ return processors
246
+
247
+ for name, module in self.named_children():
248
+ fn_recursive_add_processors(name, module, processors)
249
+
250
+ return processors
251
+
252
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
253
+ r"""
254
+ Sets the attention processor to use to compute attention.
255
+
256
+ Parameters:
257
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
258
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
259
+ for **all** `Attention` layers.
260
+
261
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
262
+ processor. This is strongly recommended when setting trainable attention processors.
263
+
264
+ """
265
+ count = len(self.attn_processors.keys())
266
+
267
+ if isinstance(processor, dict) and len(processor) != count:
268
+ raise ValueError(
269
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
270
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
271
+ )
272
+
273
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
274
+ if hasattr(module, "set_processor"):
275
+ if not isinstance(processor, dict):
276
+ module.set_processor(processor)
277
+ else:
278
+ module.set_processor(processor.pop(f"{name}.processor"))
279
+
280
+ for sub_name, child in module.named_children():
281
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
282
+
283
+ for name, module in self.named_children():
284
+ fn_recursive_attn_processor(name, module, processor)
285
+
286
+ def set_default_attn_processor(self):
287
+ """
288
+ Disables custom attention processors and sets the default attention implementation.
289
+ """
290
+ if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
291
+ processor = AttnProcessor()
292
+ else:
293
+ raise ValueError(
294
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
295
+ )
296
+
297
+ self.set_attn_processor(processor)
298
+
299
+ def _set_gradient_checkpointing(self, module, value=False):
300
+ if hasattr(module, "gradient_checkpointing"):
301
+ module.gradient_checkpointing = value
302
+
303
+ # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
304
+ def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
305
+ """
306
+ Sets the attention processor to use [feed forward
307
+ chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
308
+
309
+ Parameters:
310
+ chunk_size (`int`, *optional*):
311
+ The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
312
+ over each tensor of dim=`dim`.
313
+ dim (`int`, *optional*, defaults to `0`):
314
+ The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
315
+ or dim=1 (sequence length).
316
+ """
317
+ if dim not in [0, 1]:
318
+ raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
319
+
320
+ # By default chunk size is 1
321
+ chunk_size = chunk_size or 1
322
+
323
+ def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
324
+ if hasattr(module, "set_chunk_feed_forward"):
325
+ module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
326
+
327
+ for child in module.children():
328
+ fn_recursive_feed_forward(child, chunk_size, dim)
329
+
330
+ for module in self.children():
331
+ fn_recursive_feed_forward(module, chunk_size, dim)
332
+
333
+ def forward(
334
+ self,
335
+ sample: torch.Tensor,
336
+ timestep: Union[torch.Tensor, float, int],
337
+ encoder_hidden_states: torch.Tensor,
338
+ added_time_ids: Union[torch.Tensor, List[torch.Tensor]],
339
+ return_dict: bool = True,
340
+ ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
341
+ r"""
342
+ The [`UNetSpatioTemporalConditionModel`] forward method.
343
+
344
+ Args:
345
+ sample (`torch.Tensor`):
346
+ The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
347
+ timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
348
+ encoder_hidden_states (`torch.Tensor`):
349
+ The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
350
+ added_time_ids: (`torch.Tensor`):
351
+ The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
352
+ embeddings and added to the time embeddings.
353
+ return_dict (`bool`, *optional*, defaults to `True`):
354
+ Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead
355
+ of a plain tuple.
356
+ Returns:
357
+ [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
358
+ If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is
359
+ returned, otherwise a `tuple` is returned where the first element is the sample tensor.
360
+ """
361
+ # 1. time
362
+ timesteps = timestep
363
+ if not torch.is_tensor(timesteps):
364
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
365
+ # This would be a good case for the `match` statement (Python 3.10+)
366
+ is_mps = sample.device.type == "mps"
367
+ if isinstance(timestep, float):
368
+ dtype = torch.float32 if is_mps else torch.float64
369
+ else:
370
+ dtype = torch.int32 if is_mps else torch.int64
371
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
372
+ elif len(timesteps.shape) == 0:
373
+ timesteps = timesteps[None].to(sample.device)
374
+
375
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
376
+ batch_size, num_frames = sample.shape[:2]
377
+ timesteps = timesteps.expand(batch_size)
378
+
379
+ t_emb = self.time_proj(timesteps)
380
+
381
+ # `Timesteps` does not contain any weights and will always return f32 tensors
382
+ # but time_embedding might actually be running in fp16. so we need to cast here.
383
+ # there might be better ways to encapsulate this.
384
+ t_emb = t_emb.to(dtype=sample.dtype)
385
+
386
+ emb = self.time_embedding(t_emb)
387
+
388
+ if isinstance(added_time_ids, torch.Tensor):
389
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
390
+ time_embeds = time_embeds.reshape((batch_size, -1))
391
+ time_embeds = time_embeds.to(emb.dtype)
392
+ aug_emb = self.add_embedding(time_embeds)
393
+ emb = emb + aug_emb
394
+
395
+ # Repeat the embeddings num_video_frames times
396
+ # emb: [batch, channels] -> [batch * frames, channels]
397
+ emb = emb.repeat_interleave(num_frames, dim=0)
398
+ elif isinstance(added_time_ids, list):
399
+ # Repeat the embeddings num_video_frames times
400
+ # emb: [batch, channels] -> [batch * frames, channels]
401
+ emb = emb.repeat_interleave(num_frames, dim=0)
402
+
403
+ cond_aug = added_time_ids[0]
404
+ cond_aug_emb = self.add_time_proj(cond_aug.flatten())
405
+ time_embeds = cond_aug_emb
406
+ time_embeds = time_embeds.to(emb.dtype)
407
+ aug_emb = self.add_embedding(time_embeds)
408
+ emb = emb + aug_emb
409
+ else:
410
+ raise ValueError
411
+
412
+ # Flatten the batch and frames dimensions
413
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
414
+ sample = sample.flatten(0, 1)
415
+ # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
416
+
417
+ # Taken care of in the pipeline (to allow reference manipulations)
418
+ # encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
419
+
420
+ # 2. pre-process
421
+ sample = self.conv_in(sample)
422
+
423
+ image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
424
+
425
+ down_block_res_samples = (sample,)
426
+ for downsample_block in self.down_blocks:
427
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
428
+ sample, res_samples = downsample_block(
429
+ hidden_states=sample,
430
+ temb=emb,
431
+ encoder_hidden_states=encoder_hidden_states,
432
+ image_only_indicator=image_only_indicator,
433
+ )
434
+ else:
435
+ sample, res_samples = downsample_block(
436
+ hidden_states=sample,
437
+ temb=emb,
438
+ image_only_indicator=image_only_indicator,
439
+ )
440
+
441
+ down_block_res_samples += res_samples
442
+
443
+ # 4. mid
444
+ sample = self.mid_block(
445
+ hidden_states=sample,
446
+ temb=emb,
447
+ encoder_hidden_states=encoder_hidden_states,
448
+ image_only_indicator=image_only_indicator,
449
+ )
450
+
451
+ # 5. up
452
+ for i, upsample_block in enumerate(self.up_blocks):
453
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
454
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
455
+
456
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
457
+ sample = upsample_block(
458
+ hidden_states=sample,
459
+ temb=emb,
460
+ res_hidden_states_tuple=res_samples,
461
+ encoder_hidden_states=encoder_hidden_states,
462
+ image_only_indicator=image_only_indicator,
463
+ )
464
+ else:
465
+ sample = upsample_block(
466
+ hidden_states=sample,
467
+ temb=emb,
468
+ res_hidden_states_tuple=res_samples,
469
+ image_only_indicator=image_only_indicator,
470
+ )
471
+
472
+ # 6. post-process
473
+ sample = self.conv_norm_out(sample)
474
+ sample = self.conv_act(sample)
475
+ sample = self.conv_out(sample)
476
+
477
+ # 7. Reshape back to original shape
478
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
479
+
480
+ if not return_dict:
481
+ return (sample,)
482
+
483
+ return UNetSpatioTemporalConditionOutput(sample=sample)
diffusers_sv3d/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .stable_video_diffusion import StableVideo3DDiffusionPipeline, StableVideo3DDiffusionPipelineRotate
diffusers_sv3d/pipelines/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (318 Bytes). View file
 
diffusers_sv3d/pipelines/stable_video_diffusion/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .pipeline_stable_video_3d_diffusion import StableVideo3DDiffusionPipeline
2
+ from .pipeline_stable_video_3d_diffusion_rotate import StableVideo3DDiffusionPipelineRotate
diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (418 Bytes). View file
 
diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_3d_diffusion.cpython-311.pyc ADDED
Binary file (24.2 kB). View file
 
diffusers_sv3d/pipelines/stable_video_diffusion/__pycache__/pipeline_stable_video_3d_diffusion_rotate.cpython-311.pyc ADDED
Binary file (21.1 kB). View file
 
diffusers_sv3d/pipelines/stable_video_diffusion/pipeline_stable_video_3d_diffusion.py ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ import PIL.Image
4
+ import torch
5
+ from diffusers.models.attention_processor import AttnProcessor2_0
6
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
7
+ StableVideoDiffusionPipeline,
8
+ _append_dims,
9
+ randn_tensor,
10
+ retrieve_timesteps,
11
+ )
12
+
13
+ from self_attn_swap import ACTIVATE_LAYER_CANDIDATE_SV3D, SharedAttentionProcessorThree
14
+
15
+ # Constants
16
+ HEIGHT = 576
17
+ WIDTH = 576
18
+ NUM_FRAMES = 21
19
+ NOISE_AUG_STRENGTH = 1e-5
20
+ DECODE_CHUNK_SIZE = 2
21
+ NUM_VID = 1
22
+ BATCH_SIZE = 1
23
+ MIN_CFG = 1.0
24
+ MAX_CFG = 2.5
25
+
26
+
27
+ class StableVideo3DDiffusionPipeline(StableVideoDiffusionPipeline):
28
+ def __init__(self, vae, image_encoder, unet, scheduler, feature_extractor):
29
+ super().__init__(vae, image_encoder, unet, scheduler, feature_extractor)
30
+
31
+ def _get_add_time_ids(
32
+ self, dtype: torch.dtype, num_processes, do_classifier_free_guidance: bool
33
+ ) -> List[torch.Tensor]:
34
+ cond_aug = torch.tensor([NOISE_AUG_STRENGTH] * 21, dtype=dtype).repeat(BATCH_SIZE * num_processes, 1)
35
+
36
+ if do_classifier_free_guidance:
37
+ cond_aug = torch.cat([cond_aug, cond_aug])
38
+
39
+ add_time_ids = [cond_aug]
40
+
41
+ self.unet.to(dtype=torch.float16)
42
+ self.vae.to(dtype=torch.float16)
43
+
44
+ return add_time_ids
45
+
46
+ def prepare_video_latents(
47
+ self,
48
+ images: List[torch.Tensor],
49
+ timestep: torch.Tensor,
50
+ add_noise: bool = True,
51
+ refine_frames: Optional[int] = None,
52
+ original_latents: Optional[torch.Tensor] = None,
53
+ ) -> torch.Tensor:
54
+ """Prepare video latents by encoding frames and optionally adding noise."""
55
+ encoded_frames = [self._encode_vae_image(image, self.device, NUM_VID, False) for image in images]
56
+ encoded_frames = [frame.to(images[0].dtype) for frame in encoded_frames]
57
+
58
+ # TODO: check scaling factor?
59
+ encoded_frames = [self.vae.config.scaling_factor * frame for frame in encoded_frames]
60
+
61
+ if add_noise:
62
+ video_latents = [
63
+ self.scheduler.add_noise(
64
+ frame,
65
+ randn_tensor(encoded_frames[0].shape, self.generator, self.device, images[0].dtype),
66
+ timestep,
67
+ )
68
+ for frame in encoded_frames
69
+ ]
70
+ else:
71
+ video_latents = encoded_frames
72
+
73
+ if refine_frames is not None and original_latents is not None:
74
+ video_latents = encoded_frames
75
+
76
+ for i in range(len(video_latents)):
77
+ if i in refine_frames:
78
+ video_latents[i] = original_latents[i].unsqueeze(0)
79
+
80
+ return torch.stack(video_latents, dim=1)
81
+
82
+ def activate_layers(self, config: Dict[str, List[Union[float, int]]], swapping_type="linear") -> Dict[str, AttnProcessor2_0]:
83
+ """Activate swapping attention mechanism in specific UNet layers."""
84
+
85
+ # Setup default values first
86
+ default_attn_procs = {}
87
+
88
+ for layer in self.unet.attn_processors.keys():
89
+ default_attn_procs[layer] = AttnProcessor2_0()
90
+
91
+ self.unet.set_attn_processor(default_attn_procs)
92
+
93
+ spatial_attn = [layer for layer in ACTIVATE_LAYER_CANDIDATE_SV3D if ".transformer_blocks.0.attn1" in layer]
94
+ temporal_attn = [
95
+ layer for layer in ACTIVATE_LAYER_CANDIDATE_SV3D if ".temporal_transformer_blocks.0.attn1" in layer
96
+ ]
97
+
98
+ assert len(spatial_attn) == len(config["spatial_ratio"]) == len(config["spatial_strength"])
99
+ assert len(temporal_attn) == len(config["temporal_ratio"]) == len(config["temporal_strength"])
100
+
101
+ ratios = {}
102
+ for layer, ratio, strength in zip(spatial_attn, config["spatial_ratio"], config["spatial_strength"]):
103
+ ratios[layer] = {"ratio": ratio, "strength": strength}
104
+
105
+ for layer, ratio, strength in zip(temporal_attn, config["temporal_ratio"], config["temporal_strength"]):
106
+ ratios[layer] = {"ratio": ratio, "strength": strength}
107
+
108
+ attn_procs = {}
109
+
110
+ for layer in self.unet.attn_processors.keys():
111
+ if layer in ratios:
112
+ attn_procs[layer] = SharedAttentionProcessorThree(
113
+ unet_chunk_size=2, activate_step_indices=config["activate_steps"], ratio=ratios[layer], swapping_type=swapping_type
114
+ )
115
+ else:
116
+ attn_procs[layer] = AttnProcessor2_0()
117
+
118
+ self.unet.set_attn_processor(attn_procs)
119
+
120
+ return attn_procs
121
+
122
+ def _decode_vae_frames(self, image_latents: torch.Tensor) -> torch.Tensor:
123
+ frames = []
124
+ for i in range(21):
125
+ frame = self.vae.decode(image_latents[:, i], self.device).sample
126
+ frames.append(frame)
127
+ return torch.stack(frames, dim=2)
128
+
129
+ def _preprocess_reference_images(self, reference_images: List[PIL.Image.Image]) -> List[torch.Tensor]:
130
+ """Helper method to preprocess reference images consistently"""
131
+ processed_images = []
132
+ for image in reference_images:
133
+ ref_image = self.video_processor.preprocess(image, HEIGHT, WIDTH).to(self.device)
134
+ ref_noise = randn_tensor(ref_image.shape, self.generator, self.device, ref_image.dtype)
135
+ ref_image = ref_image + NOISE_AUG_STRENGTH * ref_noise
136
+ processed_images.append(ref_image)
137
+ return processed_images
138
+
139
+ def _preprocess_image(self, image: Union[PIL.Image.Image, torch.Tensor]) -> torch.Tensor:
140
+ """Preprocess a single image with noise augmentation"""
141
+ processed = self.video_processor.preprocess(image, HEIGHT, WIDTH).to(self.device)
142
+ noise = randn_tensor(processed.shape, self.generator, self.device, processed.dtype)
143
+ return processed + NOISE_AUG_STRENGTH * noise
144
+
145
+ def _denoise_loop(
146
+ self,
147
+ latents: torch.Tensor,
148
+ image_latents: torch.Tensor,
149
+ image_embeddings: torch.Tensor,
150
+ added_time_ids: List[torch.Tensor],
151
+ timesteps: torch.Tensor,
152
+ z0_reference_images: Optional[List[torch.Tensor]] = None,
153
+ z0_shape_images: Optional[List[torch.Tensor]] = None,
154
+ refinement: bool = False,
155
+ refine_frames: Optional[list] = None,
156
+ z0_mid_images: Optional[List[torch.Tensor]] = None,
157
+ output_type: str = "pil",
158
+ add_noise: bool = True,
159
+ ):
160
+ num_warmup_steps = len(timesteps) - self.num_inference_steps * self.scheduler.order
161
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
162
+
163
+ intermediate_steps = []
164
+
165
+ normal_latents = None
166
+
167
+ with torch.autocast(device_type=self.device.type, dtype=torch.float16):
168
+ with self.progress_bar(total=self.num_inference_steps) as progress_bar:
169
+ for i, t in enumerate(timesteps):
170
+ if i in self.replace_reference_steps:
171
+ latents[0] = self.prepare_video_latents(
172
+ z0_reference_images,
173
+ timestep=t.repeat(1),
174
+ add_noise=add_noise,
175
+ )
176
+
177
+ if refinement and z0_mid_images is not None:
178
+ latents[1] = self.prepare_video_latents(
179
+ z0_mid_images,
180
+ timestep=t.repeat(1),
181
+ add_noise=add_noise,
182
+ refine_frames=refine_frames,
183
+ original_latents=latents[1],
184
+ )
185
+
186
+ if refinement and z0_shape_images is not None:
187
+ latents[2] = self.prepare_video_latents(
188
+ z0_shape_images,
189
+ timestep=t.repeat(1),
190
+ add_noise=add_noise,
191
+ )
192
+
193
+ # expand the latents if we are doing cfg
194
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
195
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
196
+
197
+ # Concatenate image_latents over channels dimension
198
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
199
+
200
+ torch.cuda.empty_cache()
201
+
202
+
203
+ print(latent_model_input.shape, t, image_embeddings.shape, added_time_ids[0].shape)
204
+
205
+ # predict the noise residual
206
+ noise_pred = self.unet(
207
+ latent_model_input, # 2/4/6,21,8,72,72
208
+ t, # float
209
+ encoder_hidden_states=image_embeddings, # 42/84/126,1,1024
210
+ added_time_ids=added_time_ids, # 2/4/6,21
211
+ return_dict=False,
212
+ )[0] # 1/2/3,21,4,72,72
213
+
214
+ # perform guidance
215
+ if self.do_classifier_free_guidance:
216
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
217
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
218
+
219
+ # compute the previous noisy sample x_t -> x_t-1
220
+ step_output = self.scheduler.step(noise_pred, t, latents) # EulerDiscreteScheduler
221
+ latents = step_output.prev_sample
222
+ normal_latents = step_output.pred_original_sample
223
+
224
+ if self.return_intermediate_steps:
225
+ if needs_upcasting:
226
+ self.vae.to(dtype=torch.float16)
227
+ frames = self.decode_latents(normal_latents, NUM_FRAMES, DECODE_CHUNK_SIZE)
228
+ frames = self.video_processor.postprocess_video(frames, "pil")
229
+ intermediate_steps.append(frames)
230
+
231
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
232
+ progress_bar.update()
233
+
234
+ if not output_type == "latent":
235
+ # cast back to fp16 if needed
236
+ if needs_upcasting:
237
+ self.vae.to(dtype=torch.float16)
238
+ frames = self.decode_latents(latents, NUM_FRAMES, DECODE_CHUNK_SIZE)
239
+ frames = self.video_processor.postprocess_video(frames, output_type)
240
+ else:
241
+ frames = latents
242
+
243
+ self.maybe_free_model_hooks()
244
+
245
+ return frames, intermediate_steps
246
+
247
    @torch.no_grad()
    def __call__(
        self,
        input_image: PIL.Image.Image,
        reference_images: List[PIL.Image.Image],
        num_inference_steps: int = 25,
        replace_reference_steps: List[int] = list(),  # NOTE(review): mutable default argument — shared across calls; prefer None sentinel
        return_intermediate_steps: bool = False,
        seed: int = 42,
        same_starting_latents: bool = True,
        refinement: bool = False,
        refine_frames: Optional[list] = None,
        add_noise: bool = True,
    ):
        """Generate three synchronized orbit videos (reference / mid / shape).

        The three videos are denoised jointly in one batch; the reference
        video is periodically re-injected from ``reference_images`` at the
        steps in ``replace_reference_steps``. With ``refinement`` a second
        denoising pass is run after rolling the frame lists so that the frame
        at ``refine_frames[-1]`` becomes the new front view.

        Returns:
            (frames, new_front_image, intermediate_steps-or-None), where
            ``frames`` is a list of three per-video frame lists.
        """
        # 0. Set seed (the generator is stored on self and reused by the
        # preprocessing helpers).
        self.generator = torch.manual_seed(seed)

        # 1. Check inputs. Raise error if not correct.
        self.check_inputs(input_image, HEIGHT, WIDTH)

        # 2. Define call parameters.
        self.num_inference_steps = num_inference_steps
        self.return_intermediate_steps = return_intermediate_steps
        self.replace_reference_steps = replace_reference_steps
        self._guidance_scale = MAX_CFG

        # 3. Encode input image (CLIP). Each _encode_image returns the
        # (unconditional, conditional) pair stacked along dim 0.
        image_embeddings_combined = [
            self._encode_image(reference_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
            self._encode_image(input_image, self.device, NUM_VID, self.do_classifier_free_guidance),
            self._encode_image(input_image, self.device, NUM_VID, self.do_classifier_free_guidance),
        ]

        all_embeddings = torch.cat(image_embeddings_combined, dim=0)  # uc, c, uc, c, (uc, c)
        # Regroup so all unconditional embeddings precede all conditional ones.
        embeddings_order = torch.tensor([0, 2, 4, 1, 3, 5])
        reordered_embeddings = all_embeddings[embeddings_order]  # uc, uc, (uc), c, c, (c)
        image_embeddings = reordered_embeddings.repeat_interleave(NUM_FRAMES, dim=0)

        # 4. Encode using VAE.
        image = self._preprocess_image(input_image)

        ref_image = self._preprocess_image(reference_images[-1])
        z0_reference_images = self._preprocess_reference_images(reference_images)

        # The SD VAE is upcast to fp32 for numerically stable encode/decode.
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
        if needs_upcasting:
            self.vae.to(dtype=torch.float32)

        image_latents = self._encode_vae_image(image, self.device, NUM_VID, self.do_classifier_free_guidance)
        image_latents = image_latents.to(image_embeddings.dtype)

        ref_image_latents = self._encode_vae_image(ref_image, self.device, NUM_VID, self.do_classifier_free_guidance)
        ref_image_latents = ref_image_latents.to(image_embeddings.dtype)

        # Broadcast each conditioning latent across all NUM_FRAMES frames.
        image_latents_full = [
            ref_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
            image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
            image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
        ]

        image_latents = torch.cat(image_latents_full, dim=0)
        # Same uncond-first regrouping as for the CLIP embeddings above.
        image_latents_order = torch.tensor([0, 2, 4, 1, 3, 5])
        image_latents = image_latents[image_latents_order]

        if needs_upcasting:  # cast back to fp16 if needed
            self.vae.to(dtype=torch.float16)

        # Three videos are denoised jointly: reference, mid, shape.
        num_processes = 3

        # 5. Get Added Time IDs.
        added_time_ids = self._get_add_time_ids(
            image_embeddings.dtype,
            num_processes,
            self.do_classifier_free_guidance,
        )  # list of tensor [2, 21] or [4, 21] or [6, 21] -> just 4x the same

        added_time_ids = [a.to(self.device) for a in added_time_ids]

        timesteps, self.num_inference_steps = retrieve_timesteps(self.scheduler, self.num_inference_steps, self.device)

        # 7. Prepare latent variables.
        num_channels_latents = self.unet.config.in_channels  # 8
        latents = self.prepare_latents(
            BATCH_SIZE * num_processes,
            NUM_FRAMES,
            num_channels_latents,
            HEIGHT,
            WIDTH,
            image_embeddings.dtype,
            self.device,
            self.generator,
        )  # 2/3,21,4,72,72

        if same_starting_latents:
            # All three videos start from identical noise (copies of latents[2]).
            latents[0] = latents[1] = latents[2]

        # 8. Prepare guidance scale: triangular schedule rising from MIN_CFG at
        # the front frame to MAX_CFG at the middle frame and back.
        guidance_scale = torch.cat(
            [
                torch.linspace(MIN_CFG, MAX_CFG, NUM_FRAMES // 2 + 1)[1:].unsqueeze(0),
                torch.linspace(MAX_CFG, MIN_CFG, NUM_FRAMES - NUM_FRAMES // 2 + 1)[1:].unsqueeze(0),
            ],
            dim=-1,
        )

        guidance_scale = guidance_scale.to(self.device, latents.dtype)
        guidance_scale = guidance_scale.repeat(BATCH_SIZE, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)  # [1,21,1,1,1]

        self._guidance_scale = guidance_scale

        # 9. Denoising loop (first pass).
        frames, intemediate_steps = self._denoise_loop(
            latents=latents,
            image_latents=image_latents,
            image_embeddings=image_embeddings,
            added_time_ids=added_time_ids,
            timesteps=timesteps,
            z0_reference_images=z0_reference_images,
            output_type="pil",
            add_noise=add_noise,
        )
        new_front_image = None
        if refinement:
            assert refine_frames is not None
            # Roll all frame lists so refine_frames[-1] becomes the new front view.
            current_front_frame_idx = refine_frames[-1]
            shift = NUM_FRAMES - current_front_frame_idx

            mid_images = frames[1]
            shape_images = frames[2]

            new_front_image = mid_images[current_front_frame_idx]

            # roll the lists
            reference_images = reference_images[shift:] + reference_images[:shift]
            shape_images = shape_images[shift:] + shape_images[:shift]
            mid_images = mid_images[shift:] + mid_images[:shift]

            latents = self.prepare_latents(
                BATCH_SIZE * num_processes,
                NUM_FRAMES,
                num_channels_latents,
                HEIGHT,
                WIDTH,
                image_embeddings.dtype,
                self.device,
                self.generator,
            )
            if same_starting_latents:
                latents[0] = latents[1] = latents[2]

            timesteps, self.num_inference_steps = retrieve_timesteps(
                self.scheduler, self.num_inference_steps, self.device
            )

            # Re-encode the (rolled) front frame of each video for conditioning.
            ref_image = self._preprocess_image(z0_reference_images[-1])
            ref_image_latents = self._encode_vae_image(
                ref_image, self.device, NUM_VID, self.do_classifier_free_guidance
            )
            ref_image_latents = ref_image_latents.to(image_embeddings.dtype)

            mid_image = self._preprocess_image(mid_images[-1])
            mid_image_latents = self._encode_vae_image(
                mid_image, self.device, NUM_VID, self.do_classifier_free_guidance
            )
            mid_image_latents = mid_image_latents.to(image_embeddings.dtype)

            shape_image = self._preprocess_image(shape_images[-1])
            shape_image_latents = self._encode_vae_image(
                shape_image, self.device, NUM_VID, self.do_classifier_free_guidance
            )
            shape_image_latents = shape_image_latents.to(image_embeddings.dtype)

            image_latents_full = [
                ref_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
                mid_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
                shape_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
            ]

            image_latents = torch.cat(image_latents_full, dim=0)
            image_latents = image_latents[image_latents_order]

            # CLIP embeddings on the new front frame.
            image_embeddings_combined = [
                self._encode_image(reference_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
                self._encode_image(mid_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
                self._encode_image(shape_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
            ]
            all_embeddings = torch.cat(image_embeddings_combined, dim=0)  # uc, c, uc, c, (uc, c)
            embeddings_order = torch.tensor([0, 2, 4, 1, 3, 5])
            reordered_embeddings = all_embeddings[embeddings_order]  # uc, uc, (uc), c, c, (c)
            image_embeddings = reordered_embeddings.repeat_interleave(NUM_FRAMES, dim=0)

            z0_mid_images = self._preprocess_reference_images(mid_images)
            z0_shape_images = self._preprocess_reference_images(shape_images)

            # Second denoising pass with mid/shape re-injection enabled.
            frames, intemediate_steps = self._denoise_loop(
                latents=latents,
                image_latents=image_latents,
                image_embeddings=image_embeddings,
                added_time_ids=added_time_ids,
                timesteps=timesteps,
                z0_reference_images=z0_reference_images,
                z0_shape_images=z0_shape_images,
                refinement=refinement,
                refine_frames=refine_frames,
                z0_mid_images=z0_mid_images,
                add_noise=add_noise,
            )

            # Roll back frames to original order.
            frames = [
                frames[0][(-shift):] + frames[0][:-shift],
                frames[1][(-shift):] + frames[1][:-shift],
                frames[2][(-shift):] + frames[2][:-shift],
            ]

        if return_intermediate_steps:
            return frames, new_front_image, intemediate_steps

        return frames, new_front_image, None
diffusers_sv3d/pipelines/stable_video_diffusion/pipeline_stable_video_3d_diffusion_rotate.py ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ import PIL.Image
4
+ import torch
5
+ from diffusers.models.attention_processor import AttnProcessor2_0
6
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
7
+ StableVideoDiffusionPipeline,
8
+ _append_dims,
9
+ randn_tensor,
10
+ retrieve_timesteps,
11
+ )
12
+
13
+ from self_attn_swap import ACTIVATE_LAYER_CANDIDATE_SV3D, SharedAttentionProcessorThree
14
+
15
# Constants
HEIGHT = 576  # output frame height in pixels
WIDTH = 576  # output frame width in pixels
NUM_FRAMES = 21  # frames per generated orbit video
NOISE_AUG_STRENGTH = 1e-5  # scale of the noise added to conditioning images
DECODE_CHUNK_SIZE = 2  # frames decoded per VAE chunk (memory/speed trade-off)
NUM_VID = 1  # videos per prompt
# NOTE(review): module-level RNG is shared mutable state — every call (and
# every import) advances the same generator, so runs are order-dependent.
GENERATOR = torch.manual_seed(42)
OUTPUT_TYPE = "pil"
BATCH_SIZE = 1
MIN_CFG = 1.0  # classifier-free guidance scale at the front/back frame
MAX_CFG = 2.5  # classifier-free guidance scale at the middle frame
27
+
28
+
29
class StableVideo3DDiffusionPipelineRotate(StableVideoDiffusionPipeline):
    """SV3D pipeline variant that jointly denoises a reference, a "mid" and an
    optional "shape" orbit video, periodically re-injecting clean frames so the
    mid video rotates while staying consistent with the reference."""

    def __init__(self, vae, image_encoder, unet, scheduler, feature_extractor):
        super().__init__(vae, image_encoder, unet, scheduler, feature_extractor)

    def _get_add_time_ids(
        self, dtype: torch.dtype, num_processes, do_classifier_free_guidance: bool
    ) -> List[torch.Tensor]:
        """Build the per-frame conditioning-augmentation tensor.

        Returns a single-element list holding the NOISE_AUG_STRENGTH values,
        doubled along dim 0 for classifier-free guidance.
        """
        cond_aug = torch.tensor([NOISE_AUG_STRENGTH] * 21, dtype=dtype).repeat(BATCH_SIZE * num_processes, 1)

        if do_classifier_free_guidance:
            cond_aug = torch.cat([cond_aug, cond_aug])

        add_time_ids = [cond_aug]

        # NOTE(review): casting the UNet/VAE to fp16 is a surprising side
        # effect for a "_get_*" helper — consider moving it to setup.
        self.unet.to(dtype=torch.float16)
        self.vae.to(dtype=torch.float16)

        return add_time_ids

    def prepare_video_latents(
        self,
        images: List[torch.Tensor],
        timestep: torch.Tensor,
        add_noise: bool = True,
        active_size: Optional[int] = None,
        original_latents: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Prepare video latents by encoding frames and optionally adding noise.

        When ``active_size`` and ``original_latents`` are given, the trailing
        window of frames (indices NUM_FRAMES-active_size-1 .. NUM_FRAMES-2) is
        kept from ``original_latents`` instead of being re-encoded.
        """
        encoded_frames = [self._encode_vae_image(image, self.device, NUM_VID, False) for image in images]
        encoded_frames = [frame.to(images[0].dtype) for frame in encoded_frames]

        # TODO: check scaling factor?
        encoded_frames = [self.vae.config.scaling_factor * frame for frame in encoded_frames]

        # Noise each frame to the requested timestep so it matches the
        # in-flight denoising state.
        if add_noise:
            video_latents = [
                self.scheduler.add_noise(
                    frame,
                    randn_tensor(encoded_frames[0].shape, GENERATOR, self.device, images[0].dtype),
                    timestep,
                )
                for frame in encoded_frames
            ]
        else:
            video_latents = encoded_frames

        if active_size is not None and original_latents is not None:
            for i in range(len(video_latents)):
                if NUM_FRAMES - active_size - 1 <= i < NUM_FRAMES - 1:
                    video_latents[i] = original_latents[i].unsqueeze(0)

        return torch.stack(video_latents, dim=1)

    def activate_layers(self, config: Dict[str, List[Union[float, int]]]) -> Dict[str, AttnProcessor2_0]:
        """Activate swapping attention mechanism in specific UNet layers.

        ``config`` must provide per-layer ratio/strength lists matching the
        number of spatial and temporal attn1 candidates, plus the step indices
        at which swapping is active.
        """
        spatial_attn = [layer for layer in ACTIVATE_LAYER_CANDIDATE_SV3D if ".transformer_blocks.0.attn1" in layer]
        temporal_attn = [
            layer for layer in ACTIVATE_LAYER_CANDIDATE_SV3D if ".temporal_transformer_blocks.0.attn1" in layer
        ]

        assert len(spatial_attn) == len(config["spatial_ratio"]) == len(config["spatial_strength"])
        assert len(temporal_attn) == len(config["temporal_ratio"]) == len(config["temporal_strength"])

        ratios = {}
        for layer, ratio, strength in zip(spatial_attn, config["spatial_ratio"], config["spatial_strength"]):
            ratios[layer] = {"ratio": ratio, "strength": strength}

        for layer, ratio, strength in zip(temporal_attn, config["temporal_ratio"], config["temporal_strength"]):
            ratios[layer] = {"ratio": ratio, "strength": strength}

        attn_procs = {}

        # Selected layers get the shared (swapping) processor; everything else
        # keeps the stock SDPA processor.
        for layer in self.unet.attn_processors.keys():
            if layer in ratios:
                attn_procs[layer] = SharedAttentionProcessorThree(
                    unet_chunk_size=2, activate_step_indices=config["activate_steps"], ratio=ratios[layer]
                )
            else:
                attn_procs[layer] = AttnProcessor2_0()

        self.unet.set_attn_processor(attn_procs)

        return attn_procs

    def _decode_vae_frames(self, image_latents: torch.Tensor) -> torch.Tensor:
        """Decode per-frame latents; returns frames stacked along dim 2."""
        frames = []
        for i in range(21):
            # NOTE(review): `self.device` lands in the `return_dict` slot of
            # AutoencoderKL.decode (truthy -> default True) — works by
            # accident; should be `self.vae.decode(image_latents[:, i])`.
            frame = self.vae.decode(image_latents[:, i], self.device).sample
            frames.append(frame)
        return torch.stack(frames, dim=2)

    def _preprocess_reference_images(self, reference_images: List[PIL.Image.Image]) -> List[torch.Tensor]:
        """Helper method to preprocess reference images consistently"""
        processed_images = []
        for image in reference_images:
            ref_image = self.video_processor.preprocess(image, HEIGHT, WIDTH).to(self.device)
            ref_noise = randn_tensor(ref_image.shape, GENERATOR, self.device, ref_image.dtype)
            ref_image = ref_image + NOISE_AUG_STRENGTH * ref_noise
            processed_images.append(ref_image)
        return processed_images

    def _preprocess_image(self, image: Union[PIL.Image.Image, torch.Tensor]) -> torch.Tensor:
        """Preprocess a single image with noise augmentation"""
        processed = self.video_processor.preprocess(image, HEIGHT, WIDTH).to(self.device)
        noise = randn_tensor(processed.shape, GENERATOR, self.device, processed.dtype)
        return processed + NOISE_AUG_STRENGTH * noise

    def _denoise_loop(
        self,
        latents: torch.Tensor,
        image_latents: torch.Tensor,
        image_embeddings: torch.Tensor,
        added_time_ids: List[torch.Tensor],
        timesteps: torch.Tensor,
        mids_active_size: int,
        z0_mid_images: List[torch.Tensor],
        z0_reference_images: Optional[List[torch.Tensor]] = None,
        z0_shape_images: Optional[List[torch.Tensor]] = None,
    ):
        """Denoising loop with clean-frame re-injection at the steps in
        ``self.replace_reference_steps``.

        The mid video keeps a trailing "active" window of ``mids_active_size``
        frames from its in-flight latents after step 5, so those frames keep
        denoising freely while the rest are pinned to the clean encodings.
        """
        num_warmup_steps = len(timesteps) - self.num_inference_steps * self.scheduler.order
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

        intermediate_steps = []

        normal_latents = None  # scheduler's predicted x0 at the current step

        with torch.autocast(device_type=self.device.type, dtype=torch.float16):
            with self.progress_bar(total=self.num_inference_steps) as progress_bar:
                for i, t in enumerate(timesteps):
                    # NOTE(review): nesting reconstructed from a
                    # whitespace-mangled source; the mid/shape re-injections
                    # are assumed to share the replace_reference_steps gate —
                    # confirm against the original file.
                    if i in self.replace_reference_steps:
                        latents[0] = self.prepare_video_latents(
                            z0_reference_images,
                            timestep=t.repeat(1),
                            add_noise=True,
                        )

                        latents[1] = self.prepare_video_latents(
                            z0_mid_images,
                            timestep=t.repeat(1),
                            add_noise=True,
                            active_size=mids_active_size if i > 5 else None,
                            original_latents=latents[1],
                        )

                        if z0_shape_images is not None:
                            latents[2] = self.prepare_video_latents(
                                z0_shape_images,
                                timestep=t.repeat(1),
                                add_noise=True,
                            )

                    # expand the latents if we are doing cfg
                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                    # Concatenate image_latents over channels dimension
                    latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)

                    torch.cuda.empty_cache()

                    # predict the noise residual
                    noise_pred = self.unet(
                        latent_model_input,  # 2/4/6,21,8,72,72
                        t,  # float
                        encoder_hidden_states=image_embeddings,  # 42/84/126,1,1024
                        added_time_ids=added_time_ids,  # 2/4/6,21
                        return_dict=False,
                    )[0]  # 1/2/3,21,4,72,72

                    # perform guidance
                    if self.do_classifier_free_guidance:
                        noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                        noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                    # compute the previous noisy sample x_t -> x_t-1
                    step_output = self.scheduler.step(noise_pred, t, latents)  # EulerDiscreteScheduler
                    latents = step_output.prev_sample
                    normal_latents = step_output.pred_original_sample

                    if self.return_intermediate_steps:
                        if needs_upcasting:
                            self.vae.to(dtype=torch.float16)
                        frames = self.decode_latents(normal_latents, NUM_FRAMES, DECODE_CHUNK_SIZE)
                        frames = self.video_processor.postprocess_video(frames, OUTPUT_TYPE)
                        intermediate_steps.append(frames)

                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                        progress_bar.update()

        if not OUTPUT_TYPE == "latent":
            # cast back to fp16 if needed
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)
            frames = self.decode_latents(latents, NUM_FRAMES, DECODE_CHUNK_SIZE)
            frames = self.video_processor.postprocess_video(frames, OUTPUT_TYPE)
        else:
            frames = latents

        self.maybe_free_model_hooks()

        return frames, intermediate_steps

    @torch.no_grad()
    def __call__(
        self,
        mid_images: List[PIL.Image.Image],
        reference_images: List[PIL.Image.Image],
        shape_images: Optional[List[PIL.Image.Image]] = None,
        num_inference_steps: int = 25,
        replace_reference_steps: List[int] = list(),  # NOTE(review): mutable default argument — shared across calls
        return_intermediate_steps: bool = False,
        mids_active_size: int = 5,
    ):
        """Rotate-refine pass: jointly denoise reference + mid (+ optional
        shape) videos, returning the decoded frame lists (and per-step
        intermediates when requested)."""
        # 1. Check inputs. Raise error if not correct
        self.check_inputs(mid_images[-1], HEIGHT, WIDTH)

        # 2. Define call parameters
        self.num_inference_steps = num_inference_steps
        self.return_intermediate_steps = return_intermediate_steps
        self.replace_reference_steps = replace_reference_steps
        self._guidance_scale = MAX_CFG

        # 3. Encode input image (CLIP)
        image_embeddings_combined = [
            self._encode_image(reference_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
            self._encode_image(mid_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance),
        ]
        if shape_images is not None:
            image_embeddings_combined.append(
                self._encode_image(shape_images[-1], self.device, NUM_VID, self.do_classifier_free_guidance)
            )
        all_embeddings = torch.cat(image_embeddings_combined, dim=0)  # uc, c, uc, c, (uc, c)
        # NOTE(review): truthiness check — an empty shape_images list takes
        # the 2-video branch, unlike the `is not None` checks elsewhere.
        embeddings_order = torch.tensor([0, 2, 4, 1, 3, 5]) if shape_images else torch.tensor([0, 2, 1, 3])
        reordered_embeddings = all_embeddings[embeddings_order]  # uc, uc, (uc), c, c, (c)
        image_embeddings = reordered_embeddings.repeat_interleave(NUM_FRAMES, dim=0)

        # 4. Encode using VAE
        image = self._preprocess_image(mid_images[-1])
        ref_image = self._preprocess_image(reference_images[-1])

        z0_reference_images = self._preprocess_reference_images(reference_images)
        z0_mid_images = self._preprocess_reference_images(mid_images)

        if shape_images is not None:
            shape_image = self._preprocess_image(shape_images[-1])
            z0_shape_images = self._preprocess_reference_images(
                shape_images,
            )
        else:
            shape_image = None
            z0_shape_images = None

        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
        if needs_upcasting:
            self.vae.to(dtype=torch.float32)

        image_latents = self._encode_vae_image(image, self.device, NUM_VID, self.do_classifier_free_guidance)
        image_latents = image_latents.to(image_embeddings.dtype)

        ref_image_latents = self._encode_vae_image(ref_image, self.device, NUM_VID, self.do_classifier_free_guidance)
        ref_image_latents = ref_image_latents.to(image_embeddings.dtype)

        if shape_images is not None:
            shape_image_latents = self._encode_vae_image(
                shape_image, self.device, NUM_VID, self.do_classifier_free_guidance
            )
            shape_image_latents = shape_image_latents.to(image_embeddings.dtype)

        # Broadcast each conditioning latent across all NUM_FRAMES frames.
        image_latents_full = [
            ref_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
            image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1),
        ]

        if shape_images is not None:
            shape_image_latents = shape_image_latents.unsqueeze(1).repeat(1, NUM_FRAMES, 1, 1, 1)
            image_latents_full.append(shape_image_latents)

        image_latents = torch.cat(image_latents_full, dim=0)
        image_latents_order = torch.tensor([0, 2, 4, 1, 3, 5]) if shape_images else torch.tensor([0, 2, 1, 3])
        image_latents = image_latents[image_latents_order]

        if needs_upcasting:  # cast back to fp16 if needed
            self.vae.to(dtype=torch.float16)

        num_processes = 2 if shape_images is None else 3

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            image_embeddings.dtype,
            num_processes,
            self.do_classifier_free_guidance,
        )  # list of tensor [2, 21] or [4, 21] or [6, 21] -> just 4x the same

        added_time_ids = [a.to(self.device) for a in added_time_ids]

        timesteps, self.num_inference_steps = retrieve_timesteps(self.scheduler, self.num_inference_steps, self.device)

        # 7. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels  # 8
        latents = self.prepare_latents(
            BATCH_SIZE * num_processes,
            NUM_FRAMES,
            num_channels_latents,
            HEIGHT,
            WIDTH,
            image_embeddings.dtype,
            self.device,
            GENERATOR,
        )  # 2/3,21,4,72,72

        # 8. Prepare guidance scale: triangular schedule MIN->MAX->MIN over
        # the frame axis.
        guidance_scale = torch.cat(
            [
                torch.linspace(MIN_CFG, MAX_CFG, NUM_FRAMES // 2 + 1)[1:].unsqueeze(0),
                torch.linspace(MAX_CFG, MIN_CFG, NUM_FRAMES - NUM_FRAMES // 2 + 1)[1:].unsqueeze(0),
            ],
            dim=-1,
        )

        guidance_scale = guidance_scale.to(self.device, latents.dtype)
        guidance_scale = guidance_scale.repeat(BATCH_SIZE, 1)
        guidance_scale = _append_dims(guidance_scale, latents.ndim)  # [1,21,1,1,1]

        self._guidance_scale = guidance_scale

        # 9. Denoising loop
        frames, intemediate_steps = self._denoise_loop(
            latents=latents,
            image_latents=image_latents,
            image_embeddings=image_embeddings,
            added_time_ids=added_time_ids,
            timesteps=timesteps,
            mids_active_size=mids_active_size,
            z0_mid_images=z0_mid_images,
            z0_reference_images=z0_reference_images,
            z0_shape_images=z0_shape_images,
        )

        if return_intermediate_steps:
            return frames, intemediate_steps

        return frames
pretrained_sv3d/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": {
3
+ "height": 224,
4
+ "width": 224
5
+ },
6
+ "do_center_crop": true,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_processor_type": "CLIPImageProcessor",
17
+ "image_std": [
18
+ 0.26862954,
19
+ 0.26130258,
20
+ 0.27577711
21
+ ],
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "shortest_edge": 224
26
+ }
27
+ }
pretrained_sv3d/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "stabilityai/stable-video-diffusion-img2vid-xt",
3
+ "architectures": [
4
+ "CLIPVisionModelWithProjection"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "dropout": 0.0,
8
+ "hidden_act": "gelu",
9
+ "hidden_size": 1280,
10
+ "image_size": 224,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layer_norm_eps": 1e-05,
15
+ "model_type": "clip_vision_model",
16
+ "num_attention_heads": 16,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 32,
19
+ "patch_size": 14,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.45.2"
23
+ }
pretrained_sv3d/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed1e5af7b4042ca30ec29999a4a5cfcac90b7fb610fd05ace834f2dcbb763eab
3
+ size 2528371296
pretrained_sv3d/model_index.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37fe3c7758e588c386817b6e681f2aaa7bc8c212d628b7c36f758e0a6d972e29
3
+ size 492
pretrained_sv3d/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "EulerDiscreteScheduler",
3
+ "_diffusers_version": "0.30.3",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "final_sigmas_type": "zero",
9
+ "interpolation_type": "linear",
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "v_prediction",
12
+ "rescale_betas_zero_snr": false,
13
+ "set_alpha_to_one": false,
14
+ "sigma_max": 700.0,
15
+ "sigma_min": 0.002,
16
+ "skip_prk_steps": true,
17
+ "steps_offset": 1,
18
+ "timestep_spacing": "leading",
19
+ "timestep_type": "continuous",
20
+ "trained_betas": null,
21
+ "use_karras_sigmas": true
22
+ }
pretrained_sv3d/unet/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "SV3DUNetSpatioTemporalConditionModel",
3
+ "_diffusers_version": "0.30.3",
4
+ "addition_time_embed_dim": 256,
5
+ "block_out_channels": [
6
+ 320,
7
+ 640,
8
+ 1280,
9
+ 1280
10
+ ],
11
+ "cross_attention_dim": 1024,
12
+ "down_block_types": [
13
+ "CrossAttnDownBlockSpatioTemporal",
14
+ "CrossAttnDownBlockSpatioTemporal",
15
+ "CrossAttnDownBlockSpatioTemporal",
16
+ "DownBlockSpatioTemporal"
17
+ ],
18
+ "in_channels": 8,
19
+ "layers_per_block": 2,
20
+ "num_attention_heads": [
21
+ 5,
22
+ 10,
23
+ 20,
24
+ 20
25
+ ],
26
+ "num_frames": 25,
27
+ "out_channels": 4,
28
+ "projection_class_embeddings_input_dim": 256,
29
+ "sample_size": 72,
30
+ "transformer_layers_per_block": 1,
31
+ "up_block_types": [
32
+ "UpBlockSpatioTemporal",
33
+ "CrossAttnUpBlockSpatioTemporal",
34
+ "CrossAttnUpBlockSpatioTemporal",
35
+ "CrossAttnUpBlockSpatioTemporal"
36
+ ]
37
+ }
pretrained_sv3d/unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d35a0c7e024ebc55feeecf55baa039700f3d2b2d396e58d7cd0e6bbb18eedd
3
+ size 6096060984
pretrained_sv3d/vae/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.30.3",
4
+ "_name_or_path": "chenguolin/stable-diffusion-v1-5",
5
+ "act_fn": "silu",
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 512,
10
+ 512
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D"
17
+ ],
18
+ "force_upcast": true,
19
+ "in_channels": 3,
20
+ "latent_channels": 4,
21
+ "latents_mean": null,
22
+ "latents_std": null,
23
+ "layers_per_block": 2,
24
+ "mid_block_add_attention": true,
25
+ "norm_num_groups": 32,
26
+ "out_channels": 3,
27
+ "sample_size": 512,
28
+ "scaling_factor": 0.18215,
29
+ "shift_factor": null,
30
+ "up_block_types": [
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D",
34
+ "UpDecoderBlock2D"
35
+ ],
36
+ "use_post_quant_conv": true,
37
+ "use_quant_conv": true
38
+ }
pretrained_sv3d/vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4d2b5932bb4151e54e694fd31ccf51fca908223c9485bd56cd0e1d83ad94c49
3
+ size 334643268
train.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from torch.optim import AdamW
5
+ import torch.nn.functional as F
6
+
7
+ from diffusers_sv3d.pipelines.stable_video_diffusion.pipeline_stable_video_3d_diffusion import (
8
+ StableVideo3DDiffusionPipeline,
9
+ )
10
+
11
# Configuration
BATCH_SIZE = 1
LR = 1e-5
NUM_EPOCHS = 10
SAVE_DIR = "checkpoints"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Allow overriding the pretrained-weights location via the SV3D_PATH env var;
# falls back to the original hard-coded development path.
SV3D_PATH = os.path.abspath(
    os.environ.get(
        "SV3D_PATH", "/home/hubert/projects/sv3d-pbr/sv3d_diffusers/pretrained_sv3d"
    )
)
18
+
19
+
20
def train():
    """Smoke-test fine-tuning loop for the SV3D UNet.

    Loads the pretrained pipeline in fp16, freezes the whole UNet except one
    conv layer, and runs one forward pass per epoch on random stand-in
    tensors. The loss/backward/step calls are left disabled (TODO) until real
    data is wired in and the UNet's return type is confirmed.
    """
    # Ensure the checkpoint directory exists before any saving is attempted.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Load the pretrained pipeline in half precision and move it to the
    # target device.
    pipeline = StableVideo3DDiffusionPipeline.from_pretrained(
        SV3D_PATH,
        revision="fp16",
        torch_dtype=torch.float16,
    )
    pipeline.to(DEVICE)

    # Freeze every UNet parameter first ...
    for param in pipeline.unet.parameters():
        param.requires_grad = False

    # ... then unfreeze exactly one conv layer for lightweight fine-tuning.
    for name, param in pipeline.unet.named_parameters():
        if "down_blocks.2.resnets.0.spatial_res_block.conv1" in name:
            param.requires_grad = True
            print(f"Unfreezing: {name}")

    # Report how much of the network is actually trainable.
    trainable_params = sum(p.numel() for p in pipeline.unet.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in pipeline.unet.parameters())
    print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({trainable_params/total_params:.2%})")

    # Optimize only the unfrozen parameters.
    optimizer = AdamW([p for p in pipeline.unet.parameters() if p.requires_grad], lr=LR)

    # Training loop (forward-only smoke test for now).
    for epoch in range(NUM_EPOCHS):
        pipeline.unet.train()
        optimizer.zero_grad()

        # Random stand-in tensors. Shapes assume 6 views x 21 frames,
        # 72x72 latents with 8 input channels, and 1024-dim conditioning
        # -- TODO confirm against the UNet config before real training.
        latents = torch.randn((6, 21, 8, 72, 72), dtype=torch.float16, device=DEVICE)
        t = 0.123
        encoder_hidden_states = torch.randn((126, 1, 1024), dtype=torch.float16, device=DEVICE)
        added_time_ids = torch.randn((6, 21), dtype=torch.float16, device=DEVICE)
        target_noise = torch.randn((6, 21, 8, 72, 72), dtype=torch.float16, device=DEVICE)

        noise_pred = pipeline.unet(
            latents,
            t,
            encoder_hidden_states=encoder_hidden_states,
            added_time_ids=[added_time_ids],
        )

        # NOTE(review): diffusers UNets usually return an output object, so
        # `.shape` here presumably relies on the custom SV3D UNet returning a
        # plain tensor -- verify before enabling the loss below.
        print(noise_pred.shape)
        # TODO: enable once real data and the UNet return type are confirmed:
        # loss = F.mse_loss(noise_pred, target_noise)
        # loss.backward()
        # optimizer.step()


if __name__ == "__main__":
    train()