xixircc committed 16 days ago

Commit

d9fac04

verified ·

1 Parent(s): bd29c5e

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +8 -0
blaze_face_short_range.tflite +3 -0
face-parsing/.gitattributes +28 -0
face-parsing/README.md +165 -0
face-parsing/config.json +111 -0
face-parsing/demo.png +3 -0
face-parsing/model.safetensors +3 -0
face-parsing/onnx/model.onnx +3 -0
face-parsing/onnx/model_quantized.onnx +3 -0
face-parsing/preprocessor_config.json +23 -0
face-parsing/quantize_config.json +33 -0
models/unet_3d.py +727 -0
models/unet_3d_blocks.py +1121 -0
pretrained_weights/sd-image-variations-diffusers/.gitattributes +32 -0
pretrained_weights/sd-image-variations-diffusers/README.md +226 -0
pretrained_weights/sd-image-variations-diffusers/alias-montage.jpg +3 -0
pretrained_weights/sd-image-variations-diffusers/default-montage.jpg +3 -0
pretrained_weights/sd-image-variations-diffusers/earring.jpg +3 -0
pretrained_weights/sd-image-variations-diffusers/feature_extractor/preprocessor_config.json +28 -0
pretrained_weights/sd-image-variations-diffusers/image_encoder/config.json +23 -0
pretrained_weights/sd-image-variations-diffusers/image_encoder/pytorch_model.bin +3 -0
pretrained_weights/sd-image-variations-diffusers/inputs.jpg +0 -0
pretrained_weights/sd-image-variations-diffusers/model_index.json +29 -0
pretrained_weights/sd-image-variations-diffusers/safety_checker/config.json +181 -0
pretrained_weights/sd-image-variations-diffusers/scheduler/scheduler_config.json +13 -0
pretrained_weights/sd-image-variations-diffusers/unet/config.json +40 -0
pretrained_weights/sd-image-variations-diffusers/unet/diffusion_pytorch_model.bin +3 -0
pretrained_weights/sd-image-variations-diffusers/v1-montage.jpg +3 -0
pretrained_weights/sd-image-variations-diffusers/v2-montage.jpg +3 -0
pretrained_weights/sd-image-variations-diffusers/vae/config.json +30 -0
pretrained_weights/sd-image-variations-diffusers/vae/diffusion_pytorch_model.bin +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/.gitattributes +36 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/LICENSE.md +58 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/README.md +99 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/comparison.png +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/feature_extractor/preprocessor_config.json +28 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/image_encoder/config.json +23 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/model_index.json +25 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/output_tile.gif +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/scheduler/scheduler_config.json +20 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/svd_xt.safetensors +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/svd_xt_image_decoder.safetensors +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/unet/config.json +38 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/unet/diffusion_pytorch_model.fp16.safetensors +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/unet/diffusion_pytorch_model.safetensors +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/vae/config.json +24 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/vae/diffusion_pytorch_model.fp16.safetensors +3 -0
pretrained_weights/stable-video-diffusion-img2vid-xt/vae/diffusion_pytorch_model.safetensors +3 -0
pretrained_weights/xnemo_denoising_unet.pth +3 -0
pretrained_weights/xnemo_motion_encoder.pth +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+face-parsing/demo.png filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/sd-image-variations-diffusers/alias-montage.jpg filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/sd-image-variations-diffusers/default-montage.jpg filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/sd-image-variations-diffusers/earring.jpg filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/sd-image-variations-diffusers/v1-montage.jpg filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/sd-image-variations-diffusers/v2-montage.jpg filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/stable-video-diffusion-img2vid-xt/comparison.png filter=lfs diff=lfs merge=lfs -text
+pretrained_weights/stable-video-diffusion-img2vid-xt/output_tile.gif filter=lfs diff=lfs merge=lfs -text

blaze_face_short_range.tflite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4578f35940bf5a1a655214a1cce5cab13eba73c1297cd78e1a04c2380b0152f
+size 229746

face-parsing/.gitattributes ADDED Viewed

	@@ -0,0 +1,28 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text

face-parsing/README.md ADDED Viewed

	@@ -0,0 +1,165 @@

+---
+language: en
+library_name: transformers
+tags:
+  - vision
+  - image-segmentation
+  - nvidia/mit-b5
+  - transformers.js
+  - onnx
+datasets:
+  - celebamaskhq
+---
+# Face Parsing
+![example image and output](demo.png)
+[Semantic segmentation](https://huggingface.co/docs/transformers/tasks/semantic_segmentation) model fine-tuned from [nvidia/mit-b5](https://huggingface.co/nvidia/mit-b5) with [CelebAMask-HQ](https://github.com/switchablenorms/CelebAMask-HQ) for face parsing. For additional options, see the Transformers [Segformer docs](https://huggingface.co/docs/transformers/model_doc/segformer).
+> ONNX model for web inference contributed by [Xenova](https://huggingface.co/Xenova).
+## Usage in Python
+An exhaustive list of labels can be extracted from [config.json](https://huggingface.co/jonathandinu/face-parsing/blob/65972ac96180b397f86fda0980bbe68e6ee01b8f/config.json#L30).
+| id  | label      | note              |
+| :-: | :--------- | :---------------- |
+|  0  | background |                   |
+|  1  | skin       |                   |
+|  2  | nose       |                   |
+|  3  | eye_g      | eyeglasses        |
+|  4  | l_eye      | left eye          |
+|  5  | r_eye      | right eye         |
+|  6  | l_brow     | left eyebrow      |
+|  7  | r_brow     | right eyebrow     |
+|  8  | l_ear      | left ear          |
+|  9  | r_ear      | right ear         |
+| 10  | mouth      | area between lips |
+| 11  | u_lip      | upper lip         |
+| 12  | l_lip      | lower lip         |
+| 13  | hair       |                   |
+| 14  | hat        |                   |
+| 15  | ear_r      | earring           |
+| 16  | neck_l     | necklace          |
+| 17  | neck       |                   |
+| 18  | cloth      | clothing          |
+```python
+import torch
+from torch import nn
+from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
+from PIL import Image
+import matplotlib.pyplot as plt
+import requests
+# convenience expression for automatically determining device
+device = (
+    "cuda"
+    # Device for NVIDIA or AMD GPUs
+    if torch.cuda.is_available()
+    else "mps"
+    # Device for Apple Silicon (Metal Performance Shaders)
+    if torch.backends.mps.is_available()
+    else "cpu"
+)
+# load models
+image_processor = SegformerImageProcessor.from_pretrained("jonathandinu/face-parsing")
+model = SegformerForSemanticSegmentation.from_pretrained("jonathandinu/face-parsing")
+model.to(device)
+# expects a PIL.Image or torch.Tensor
+url = "https://images.unsplash.com/photo-1539571696357-5a69c17a67c6"
+image = Image.open(requests.get(url, stream=True).raw)
+# run inference on image
+inputs = image_processor(images=image, return_tensors="pt").to(device)
+outputs = model(**inputs)
+logits = outputs.logits  # shape (batch_size, num_labels, ~height/4, ~width/4)
+# resize output to match input image dimensions
+upsampled_logits = nn.functional.interpolate(logits,
+                size=image.size[::-1], # H x W
+                mode='bilinear',
+                align_corners=False)
+# get label masks
+labels = upsampled_logits.argmax(dim=1)[0]
+# move to CPU to visualize in matplotlib
+labels_viz = labels.cpu().numpy()
+plt.imshow(labels_viz)
+plt.show()
+```
+## Usage in the browser (Transformers.js)
+```js
+import {
+  pipeline,
+  env,
+} from "https://cdn.jsdelivr.net/npm/@xenova/transformers@2.14.0";
+// important to prevent errors since the model files are likely remote on HF hub
+env.allowLocalModels = false;
+// instantiate image segmentation pipeline with pretrained face parsing model
+model = await pipeline("image-segmentation", "jonathandinu/face-parsing");
+// async inference since it could take a few seconds
+const output = await model(url);
+// each label is a separate mask object
+// [
+//   { score: null, label: 'background', mask: transformers.js RawImage { ... }}
+//   { score: null, label: 'hair', mask: transformers.js RawImage { ... }}
+//    ...
+// ]
+for (const m of output) {
+  print(`Found ${m.label}`);
+  m.mask.save(`${m.label}.png`);
+}
+```
+### p5.js
+Since [p5.js](https://p5js.org/) uses an animation loop abstraction, we need to take care when loading the model and making predictions.
+```js
+// ...
+// asynchronously load transformers.js and instantiate model
+async function preload() {
+  // load transformers.js library with a dynamic import
+  const { pipeline, env } = await import(
+    "https://cdn.jsdelivr.net/npm/@xenova/transformers@2.14.0"
+  );
+  // important to prevent errors since the model files are remote on HF hub
+  env.allowLocalModels = false;
+  // instantiate image segmentation pipeline with pretrained face parsing model
+  // NOTE(review): `model` is assigned without a declaration here; presumably it is
+  // declared as a sketch-level variable in the elided part of the example — confirm.
+  model = await pipeline("image-segmentation", "jonathandinu/face-parsing");
+  // NOTE(review): `print` is assumed to be the p5.js logging helper, not window.print — confirm.
+  print("face-parsing model loaded");
+}
+// ...
+```
+[full p5.js example](https://editor.p5js.org/jonathan.ai/sketches/wZn15Dvgh)
+### Model Description
+- **Developed by:** [Jonathan Dinu](https://twitter.com/jonathandinu)
+- **Model type:** Transformer-based semantic segmentation image model
+- **License:** non-commercial research and educational purposes
+- **Resources for more information:** Transformers docs on [Segformer](https://huggingface.co/docs/transformers/model_doc/segformer) and/or the [original research paper](https://arxiv.org/abs/2105.15203).
+## Limitations and Bias
+### Bias
+While the capabilities of computer vision models are impressive, they can also reinforce or exacerbate social biases. The [CelebAMask-HQ](https://github.com/switchablenorms/CelebAMask-HQ) dataset used for fine-tuning is large but not necessarily perfectly diverse or representative. Also, its images are of... just celebrities.

face-parsing/config.json ADDED Viewed

	@@ -0,0 +1,111 @@

+{
+  "_name_or_path": "jonathandinu/face-parsing",
+  "architectures": [
+    "SegformerForSemanticSegmentation"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "classifier_dropout_prob": 0.1,
+  "decoder_hidden_size": 768,
+  "depths": [
+    3,
+    6,
+    40,
+    3
+  ],
+  "downsampling_rates": [
+    1,
+    4,
+    8,
+    16
+  ],
+  "drop_path_rate": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_sizes": [
+    64,
+    128,
+    320,
+    512
+  ],
+  "id2label": {
+    "0": "background",
+    "1": "skin",
+    "2": "nose",
+    "3": "eye_g",
+    "4": "l_eye",
+    "5": "r_eye",
+    "6": "l_brow",
+    "7": "r_brow",
+    "8": "l_ear",
+    "9": "r_ear",
+    "10": "mouth",
+    "11": "u_lip",
+    "12": "l_lip",
+    "13": "hair",
+    "14": "hat",
+    "15": "ear_r",
+    "16": "neck_l",
+    "17": "neck",
+    "18": "cloth"
+  },
+  "image_size": 224,
+  "initializer_range": 0.02,
+  "label2id": {
+    "background": 0,
+    "skin": 1,
+    "nose": 2,
+    "eye_g": 3,
+    "l_eye": 4,
+    "r_eye": 5,
+    "l_brow": 6,
+    "r_brow": 7,
+    "l_ear": 8,
+    "r_ear": 9,
+    "mouth": 10,
+    "u_lip": 11,
+    "l_lip": 12,
+    "hair": 13,
+    "hat": 14,
+    "ear_r": 15,
+    "neck_l": 16,
+    "neck": 17,
+    "cloth": 18
+  },
+  "layer_norm_eps": 1e-06,
+  "mlp_ratios": [
+    4,
+    4,
+    4,
+    4
+  ],
+  "model_type": "segformer",
+  "num_attention_heads": [
+    1,
+    2,
+    5,
+    8
+  ],
+  "num_channels": 3,
+  "num_encoder_blocks": 4,
+  "patch_sizes": [
+    7,
+    3,
+    3,
+    3
+  ],
+  "reshape_last_stage": true,
+  "semantic_loss_ignore_index": 255,
+  "sr_ratios": [
+    8,
+    4,
+    2,
+    1
+  ],
+  "strides": [
+    4,
+    2,
+    2,
+    2
+  ],
+  "transformers_version": "4.37.0.dev0"
+}

face-parsing/demo.png ADDED Viewed

Git LFS Details

SHA256: 31c74d29ab9e45f3401f404f7bfc09e2cf9f5825611f07dc20b25d00eb1cac8a
Pointer size: 131 Bytes
Size of remote file: 645 kB

face-parsing/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2bec795a8c243db71bd95be538fd62559003566466c71237e45c99b920f4b62
+size 338580732

face-parsing/onnx/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d4e67af60ff78184745ebf74cc15163c0adc27d45cdeba31e3a03d1096fb8c3
+size 340316611

face-parsing/onnx/model_quantized.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5bab9bfb3cb979f3098ac3b934b1641dbf87f835e0b03c2ca6d88dcf18c83d27
+size 89439678

face-parsing/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "do_normalize": true,
+  "do_reduce_labels": false,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.485,
+    0.456,
+    0.406
+  ],
+  "image_processor_type": "SegformerFeatureExtractor",
+  "image_std": [
+    0.229,
+    0.224,
+    0.225
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 512,
+    "width": 512
+  }
+}

face-parsing/quantize_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+    "per_channel": true,
+    "reduce_range": true,
+    "per_model_config": {
+        "model": {
+            "op_types": [
+                "Unsqueeze",
+                "Shape",
+                "Transpose",
+                "Sqrt",
+                "Gather",
+                "Slice",
+                "Erf",
+                "Div",
+                "Reshape",
+                "Add",
+                "Cast",
+                "Sub",
+                "Concat",
+                "ReduceMean",
+                "Mul",
+                "Conv",
+                "Constant",
+                "Resize",
+                "Softmax",
+                "Pow",
+                "Relu",
+                "MatMul"
+            ],
+            "weight_type": "QUInt8"
+        }
+    }
+}

models/unet_3d.py ADDED Viewed

	@@ -0,0 +1,727 @@

+# *************************************************************************
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: Apache-2.0
+#
+# This file has been modified by ByteDance Ltd. and/or its affiliates.
+#
+# Original file was released under Aniportrait, with the full license text
+# available at https://github.com/Zejun-Yang/AniPortrait/blob/main/LICENSE.
+#
+# This modified file is released under the same license.
+# *************************************************************************
+from collections import OrderedDict
+from dataclasses import dataclass
+import pdb
+from os import PathLike
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.attention_processor import AttentionProcessor
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, BaseOutput, logging
+from safetensors.torch import load_file
+from .resnet import InflatedConv3d, InflatedGroupNorm
+from .unet_3d_blocks import UNetMidBlock3DCrossAttn, get_down_block, get_up_block
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+    """Dataclass output with a single ``sample`` tensor field (presumably what ``UNet3DConditionModel`` returns from its forward pass — confirm)."""
+    # NOTE(review): shape and meaning of `sample` are not visible from this definition — confirm against the forward pass.
+    sample: torch.FloatTensor
+class UNet3DConditionModel(ModelMixin, ConfigMixin):
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ),
+        mid_block_type: str = "UNetMidBlock3DCrossAttn",
+        up_block_types: Tuple[str] = (
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+        ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        use_inflated_groupnorm=False,
+        # Additional
+        use_motion_module=False,
+        use_temporal_module=False,
+        motion_module_resolutions=(1, 2, 4, 8),
+        motion_module_mid_block=False,
+        motion_module_decoder_only=False,
+        motion_module_type=None,
+        temporal_module_type=None,
+        motion_module_kwargs={},
+        temporal_module_kwargs={},
+        unet_use_cross_frame_attention=None,
+        unet_use_temporal_attention=None,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+        # input
+        self.conv_in = InflatedConv3d(
+            in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)
+        )
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            res = 2**i
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                unet_use_temporal_attention=unet_use_temporal_attention,
+                use_inflated_groupnorm=use_inflated_groupnorm,
+                use_motion_module=use_motion_module
+                and (res in motion_module_resolutions)
+                and (not motion_module_decoder_only),
+                use_temporal_module=use_temporal_module
+                and (res in motion_module_resolutions)
+                and (not motion_module_decoder_only),
+                motion_module_type=motion_module_type,
+                temporal_module_type=temporal_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+                temporal_module_kwargs=temporal_module_kwargs
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        if mid_block_type == "UNetMidBlock3DCrossAttn":
+            self.mid_block = UNetMidBlock3DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                unet_use_temporal_attention=unet_use_temporal_attention,
+                use_inflated_groupnorm=use_inflated_groupnorm,
+                use_motion_module=use_motion_module and motion_module_mid_block,
+                use_temporal_module=use_temporal_module and motion_module_mid_block,
+                motion_module_type=motion_module_type,
+                temporal_module_type=temporal_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+                temporal_module_kwargs=temporal_module_kwargs,
+            )
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        # count how many layers upsample the videos
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            res = 2 ** (3 - i)
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                unet_use_temporal_attention=unet_use_temporal_attention,
+                use_inflated_groupnorm=use_inflated_groupnorm,
+                use_motion_module=use_motion_module
+                and (res in motion_module_resolutions),
+                use_temporal_module=use_temporal_module
+                and (res in motion_module_resolutions),
+                motion_module_type=motion_module_type,
+                temporal_module_type=temporal_module_type,
+                motion_module_kwargs=motion_module_kwargs,
+                temporal_module_kwargs=temporal_module_kwargs,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        # out
+        if use_inflated_groupnorm:
+            self.conv_norm_out = InflatedGroupNorm(
+                num_channels=block_out_channels[0],
+                num_groups=norm_num_groups,
+                eps=norm_eps,
+            )
+        else:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0],
+                num_groups=norm_num_groups,
+                eps=norm_eps,
+            )
+        self.conv_act = nn.SiLU()
+        self.conv_out = InflatedConv3d(
+            block_out_channels[0], out_channels, kernel_size=3, padding=1
+        )
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(
+            name: str,
+            module: torch.nn.Module,
+            processors: Dict[str, AttentionProcessor],
+        ):
+            if hasattr(module, "set_processor"):
+                processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                if "temporal_transformer" not in sub_name:
+                    fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            if "temporal_transformer" not in name:
+                fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_slicable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_slicable_dims(module)
+        num_slicable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_slicable_layers * [1]
+        slice_size = (
+            num_slicable_layers * [slice_size]
+            if not isinstance(slice_size, list)
+            else slice_size
+        )
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(
+            module: torch.nn.Module, slice_size: List[int]
+        ):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def set_use_cross_frame_attention(self, value):
+        def fn_recursive_set_use_cf_att(module: torch.nn.Module, value):
+            if hasattr(module, "set_use_cross_frame_attention"):
+                module.set_use_cross_frame_attention(value)
+            for child in module.children():
+                fn_recursive_set_use_cf_att(child, value)
+        for module in self.children():
+            fn_recursive_set_use_cf_att(module, value)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                if "temporal_transformer" not in sub_name:
+                    fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            if "temporal_transformer" not in name:
+                fn_recursive_attn_processor(name, module, processor)
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        pose_cond_fea = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        skip_mm: bool = False,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        r"""
+        Run the UNet: time (and optional class) embedding, down blocks, mid block, up blocks, output conv.
+        Args:
+            sample (`torch.FloatTensor`): noisy input tensor. The first dimension is used as the batch size
+                and the last two are checked against the upsampling factor (presumably
+                (batch, channel, frames, height, width) for this 3D model -- TODO confirm).
+            timestep (`torch.Tensor` or `float` or `int`): a scalar or per-batch timesteps; scalars are
+                wrapped into a tensor and expanded to the batch size.
+            encoder_hidden_states (`torch.Tensor`): conditioning passed to every down, mid and up block.
+            class_labels (`torch.Tensor`, *optional*): required when the model has a class embedding.
+            pose_cond_fea (indexable, *optional*): extra features added to `sample`; element 0 is added
+                right after `conv_in`, element `k` after the k-th down block.
+            attention_mask (`torch.Tensor`, *optional*): converted to an additive bias
+                (`(1 - mask) * -10000.0`) and passed only to blocks that have cross attention.
+            down_block_additional_residuals (tuple of `torch.Tensor`, *optional*): added element-wise to
+                the collected down-block residuals.
+            mid_block_additional_residual (`torch.Tensor`, *optional*): added to the mid-block output.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a `UNet3DConditionOutput` instead of a plain tuple.
+            skip_mm (`bool`, *optional*, defaults to `False`): forwarded unchanged to every block.
+        Returns:
+            `UNet3DConditionOutput` if `return_dict` is True, otherwise a `tuple` whose first element is
+            the sample tensor.
+        Raises:
+            ValueError: if the model has a class embedding but `class_labels` is None.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # prepare attention_mask
+        if attention_mask is not None:
+            # turn the mask into an additive bias: 0 where mask == 1, -10000 where mask == 0
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # This would be a good case for the `match` statement (Python 3.10+)
+            # 64-bit dtypes are avoided when the sample lives on an "mps" device
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            # 0-d tensor: add a leading dimension so it can be expanded below
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError(
+                    "class_labels should be provided when num_class_embeds > 0"
+                )
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+        # pre-process
+        sample = self.conv_in(sample)
+        if pose_cond_fea is not None:
+            # element 0 of the pose features is injected right after the input convolution
+            sample = sample + pose_cond_fea[0]
+        # down
+        down_block_res_samples = (sample,)
+        # index of the next pose feature to inject (element 0 was consumed above)
+        block_count = 1
+        for downsample_block in self.down_blocks:
+            if (
+                hasattr(downsample_block, "has_cross_attention")
+                and downsample_block.has_cross_attention
+            ):
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    skip_mm=skip_mm,
+                )
+            else:
+                # blocks without cross attention still receive encoder_hidden_states, but no attention_mask
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    skip_mm=skip_mm,
+                )
+            if pose_cond_fea is not None:
+                # the pose feature is added to the running sample only; res_samples were
+                # produced by the block before this addition
+                sample = sample + pose_cond_fea[block_count]
+                block_count += 1
+            down_block_res_samples += res_samples
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+            # NOTE(review): zip() stops at the shorter sequence, so a length mismatch
+            # silently drops trailing residuals -- confirm callers always pass matching lengths
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = (
+                    down_block_res_sample + down_block_additional_residual
+                )
+                new_down_block_res_samples += (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # mid
+        sample = self.mid_block(
+            sample,
+            emb,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            skip_mm=skip_mm,
+        )
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+        # up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            # take as many residuals off the end of the stack as this block has resnets
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if (
+                hasattr(upsample_block, "has_cross_attention")
+                and upsample_block.has_cross_attention
+            ):
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    skip_mm=skip_mm,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    encoder_hidden_states=encoder_hidden_states,
+                    skip_mm=skip_mm,
+                )
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if not return_dict:
+            return (sample,)
+        return UNet3DConditionOutput(sample=sample)
+    @classmethod
+    def from_pretrained_2d(
+        cls,
+        pretrained_model_path: PathLike,
+        motion_module_path: PathLike,
+        subfolder=None,
+        unet_additional_kwargs=None,
+        mm_zero_proj_out=False,
+    ):
+        pretrained_model_path = Path(pretrained_model_path)
+        motion_module_path = Path(motion_module_path)
+        if subfolder is not None:
+            pretrained_model_path = pretrained_model_path.joinpath(subfolder)
+        logger.info(
+            f"loaded temporal unet's pretrained weights from {pretrained_model_path} ..."
+        )
+        config_file = pretrained_model_path / "config.json"
+        if not (config_file.exists() and config_file.is_file()):
+            raise RuntimeError(f"{config_file} does not exist or is not a file")
+        unet_config = cls.load_config(config_file)
+        unet_config["_class_name"] = cls.__name__
+        unet_config["down_block_types"] = [
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ]
+        unet_config["up_block_types"] = [
+            "UpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+            "CrossAttnUpBlock3D",
+        ]
+        unet_config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
+        model = cls.from_config(unet_config, **unet_additional_kwargs)
+        # load the vanilla weights
+        if pretrained_model_path.joinpath(SAFETENSORS_WEIGHTS_NAME).exists():
+            logger.debug(
+                f"loading safeTensors weights from {pretrained_model_path} ..."
+            )
+            state_dict = load_file(
+                pretrained_model_path.joinpath(SAFETENSORS_WEIGHTS_NAME), device="cpu"
+            )
+        elif pretrained_model_path.joinpath(WEIGHTS_NAME).exists():
+            logger.debug(f"loading weights from {pretrained_model_path} ...")
+            state_dict = torch.load(
+                pretrained_model_path.joinpath(WEIGHTS_NAME),
+                map_location="cpu",
+                weights_only=True,
+            )
+        else:
+            raise FileNotFoundError(f"no weights file found in {pretrained_model_path}")
+        # load the motion module weights
+        if motion_module_path.exists() and motion_module_path.is_file():
+            if motion_module_path.suffix.lower() in [".pth", ".pt", ".ckpt"]:
+                logger.info(f"Load motion module params from {motion_module_path}")
+                motion_state_dict = torch.load(
+                    motion_module_path, map_location="cpu", weights_only=True
+                )
+            elif motion_module_path.suffix.lower() == ".safetensors":
+                motion_state_dict = load_file(motion_module_path, device="cpu")
+            else:
+                raise RuntimeError(
+                    f"unknown file format for motion module weights: {motion_module_path.suffix}"
+                )
+            motion_state_dict = {
+                k.replace('motion_modules.', 'temporal_modules.'): v for k, v in motion_state_dict.items() if not "pos_encoder" in k
+            }
+            if mm_zero_proj_out:
+                logger.info(f"Zero initialize proj_out layers in motion module...")
+                new_motion_state_dict = OrderedDict()
+                for k in motion_state_dict:
+                    if "proj_out" in k:
+                        continue
+                    new_motion_state_dict[k] = motion_state_dict[k]
+                motion_state_dict = new_motion_state_dict
+            # merge the state dicts
+            state_dict.update(motion_state_dict)
+        # load the weights into the model
+        m, u = model.load_state_dict(state_dict, strict=False)
+        logger.debug(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
+        params = [
+            p.numel() if "temporal_modules" in n else 0
+            for n, p in model.named_parameters()
+        ]
+        mm_params = [
+            p.numel() if "motion_modules" in n else 0
+            for n, p in model.named_parameters()
+        ]
+        logger.info(
+            f"Loaded {sum(mm_params) / 1e6}M-parameter motion module, Loaded {sum(params) / 1e6}M-parameter temporal module"
+        )
+        return model

models/unet_3d_blocks.py ADDED Viewed

	@@ -0,0 +1,1121 @@

+# *************************************************************************
+# Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
+# SPDX-License-Identifier: Apache-2.0
+#
+# This file has been modified by ByteDance Ltd. and/or its affiliates.
+#
+# Original file was released under Aniportrait, with the full license text
+# available at https://github.com/Zejun-Yang/AniPortrait/blob/main/LICENSE.
+#
+# This modified file is released under the same license.
+# *************************************************************************
+import pdb
+from typing import Dict, Optional
+import torch
+from torch import nn
+from src.models.motion_module import get_motion_module
+# from .motion_module import get_motion_module
+from src.models.resnet import Downsample3D, ResnetBlock3D, Upsample3D
+from .transformer_3d import Transformer3DModel
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    unet_use_cross_frame_attention=None,
+    unet_use_temporal_attention=None,
+    use_inflated_groupnorm=None,
+    use_motion_module=None,
+    motion_module_type=None,
+    motion_module_kwargs=None,
+    use_temporal_module=None,
+    temporal_module_type=None,
+    temporal_module_kwargs=None,
+):
+    down_block_type = (
+        down_block_type[7:]
+        if down_block_type.startswith("UNetRes")
+        else down_block_type
+    )
+    if down_block_type == "DownBlock3D":
+        return DownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_inflated_groupnorm=use_inflated_groupnorm,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+            use_temporal_module=use_temporal_module,
+            temporal_module_type=temporal_module_type,
+            temporal_module_kwargs=temporal_module_kwargs,
+        )
+    elif down_block_type == "CrossAttnDownBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock3D"
+            )
+        return CrossAttnDownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+            unet_use_temporal_attention=unet_use_temporal_attention,
+            use_inflated_groupnorm=use_inflated_groupnorm,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+            use_temporal_module=use_temporal_module,
+            temporal_module_type=temporal_module_type,
+            temporal_module_kwargs=temporal_module_kwargs,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    unet_use_cross_frame_attention=None,
+    unet_use_temporal_attention=None,
+    use_inflated_groupnorm=None,
+    use_motion_module=None,
+    motion_module_type=None,
+    motion_module_kwargs=None,
+    use_temporal_module=None,
+    temporal_module_type=None,
+    temporal_module_kwargs=None,
+):
+    up_block_type = (
+        up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    )
+    if up_block_type == "UpBlock3D":
+        return UpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_inflated_groupnorm=use_inflated_groupnorm,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+            use_temporal_module=use_temporal_module,
+            temporal_module_type=temporal_module_type,
+            temporal_module_kwargs=temporal_module_kwargs,
+        )
+    elif up_block_type == "CrossAttnUpBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock3D"
+            )
+        return CrossAttnUpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+            unet_use_temporal_attention=unet_use_temporal_attention,
+            use_inflated_groupnorm=use_inflated_groupnorm,
+            use_motion_module=use_motion_module,
+            motion_module_type=motion_module_type,
+            motion_module_kwargs=motion_module_kwargs,
+            use_temporal_module=use_temporal_module,
+            temporal_module_type=temporal_module_type,
+            temporal_module_kwargs=temporal_module_kwargs,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+class UNetMidBlock3DCrossAttn(nn.Module):
+    """
+    Middle block of the 3D UNet.
+    Layout: one leading `ResnetBlock3D`, followed by `num_layers` repetitions of
+    (`Transformer3DModel` attention -> optional motion module -> optional temporal module
+    -> `ResnetBlock3D`). Channel count is `in_channels` throughout.
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        upcast_attention=False,
+        unet_use_cross_frame_attention=None,
+        unet_use_temporal_attention=None,
+        use_inflated_groupnorm=None,
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+        use_temporal_module=None,
+        temporal_module_type=None,
+        temporal_module_kwargs=None,
+        **transformer_kwargs,
+    ):
+        """
+        Build the resnets, attention layers and the optional motion/temporal modules.
+        `dual_cross_attention=True` is not supported and raises `NotImplementedError`.
+        Any extra keyword arguments (`**transformer_kwargs`) are forwarded to each
+        `Transformer3DModel`.
+        """
+        super().__init__()
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        # fall back to a group count derived from the channel count when none is given
+        resnet_groups = (
+            resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        )
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock3D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+                use_inflated_groupnorm=use_inflated_groupnorm,
+            )
+        ]
+        attentions = []
+        motion_modules = []
+        for _ in range(num_layers):
+            if dual_cross_attention:
+                raise NotImplementedError
+            # first two positional arguments: attn_num_head_channels and the per-head
+            # channel count (in_channels // attn_num_head_channels)
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                    unet_use_temporal_attention=unet_use_temporal_attention,
+                    **transformer_kwargs
+                )
+            )
+            # a None placeholder keeps the list aligned with `attentions` when motion modules are disabled
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=in_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                )
+                if use_motion_module
+                else None
+            )
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    use_inflated_groupnorm=use_inflated_groupnorm,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+        # temporal modules are built with the same factory as motion modules, but from
+        # the temporal_module_* settings; one (possibly None) entry per layer
+        self.temporal_modules = nn.ModuleList(
+            [
+                (
+                    get_motion_module(
+                        in_channels=in_channels,
+                        motion_module_type=temporal_module_type,
+                        motion_module_kwargs=temporal_module_kwargs,
+                    )
+                    if use_temporal_module
+                    else None
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        skip_mm=False,
+    ):
+        """
+        Apply the leading resnet, then for each layer: attention, motion module,
+        temporal module, resnet.
+        Args:
+            hidden_states: input features.
+            temb: time embedding passed to the resnets and the motion/temporal modules.
+            encoder_hidden_states: conditioning for the attention layers. If a list is
+                given it is unpacked as `[encoder_hidden_states, motion_hidden_states]`;
+                otherwise the same value is used for the attention layers and the motion modules.
+            attention_mask: accepted for interface compatibility; not used in this method.
+            skip_mm: when true, both the motion modules and the temporal modules are skipped.
+        Returns:
+            The transformed hidden states.
+        """
+        if isinstance(encoder_hidden_states, list):
+            encoder_hidden_states, motion_hidden_states = encoder_hidden_states
+        else:
+            motion_hidden_states = encoder_hidden_states
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet, motion_module, temporal_module in zip(
+            self.attentions, self.resnets[1:], self.motion_modules, self.temporal_modules
+        ):
+            if self.training and self.gradient_checkpointing:
+                # wrap a module so torch.utils.checkpoint can call it with positional inputs only
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                # return_dict=False makes the attention layer return a tuple; [0] selects its first element
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+                if (motion_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(motion_module),
+                        hidden_states,
+                        temb,
+                        motion_hidden_states,
+                    )
+                if (temporal_module is not None) and not skip_mm:
+                    # the temporal module gets no encoder hidden states (third input is None)
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(temporal_module),
+                        hidden_states.requires_grad_(),
+                        temb,
+                        None,
+                    )
+                # hidden_states = (
+                #     temporal_module(hidden_states, temb, encoder_hidden_states=None)
+                #     if (temporal_module is not None) and not skip_mm
+                #     else hidden_states
+                # )
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb
+                )
+            else:
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                ).sample
+                hidden_states = (
+                    motion_module(
+                        hidden_states, temb, encoder_hidden_states=motion_hidden_states
+                    )
+                    if (motion_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                # NOTE(review): `debug=True` is hard-coded here but is not passed in the
+                # checkpointed branch above -- looks like a leftover debugging flag; confirm
+                # what the temporal module does with it before relying on this path.
+                hidden_states = (
+                    temporal_module(hidden_states, temb, encoder_hidden_states=None, debug=True)
+                    if (temporal_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                hidden_states = resnet(hidden_states, temb)
+        return hidden_states
+class CrossAttnDownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        unet_use_cross_frame_attention=None,
+        unet_use_temporal_attention=None,
+        use_inflated_groupnorm=None,
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+        use_temporal_module=None,
+        temporal_module_type=None,
+        temporal_module_kwargs=None,
+        **transformer_kwargs,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    use_inflated_groupnorm=use_inflated_groupnorm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                    unet_use_temporal_attention=unet_use_temporal_attention,
+                    **transformer_kwargs,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                )
+                if use_motion_module
+                else None
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+        self.temporal_modules = nn.ModuleList(
+            [
+                (
+                    get_motion_module(
+                        in_channels=out_channels,
+                        motion_module_type=temporal_module_type,
+                        motion_module_kwargs=temporal_module_kwargs,
+                    )
+                    if use_temporal_module
+                    else None
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample3D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        temb=None,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        skip_mm=False
+    ):
+        if isinstance(encoder_hidden_states, list):
+            encoder_hidden_states, motion_hidden_states = encoder_hidden_states
+        else:
+            motion_hidden_states = encoder_hidden_states
+        output_states = ()
+        for i, (resnet, attn, motion_module, temporal_module) in enumerate(
+            zip(self.resnets, self.attentions, self.motion_modules, self.temporal_modules)
+        ):
+            # self.gradient_checkpointing = False
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb
+                )
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+                # add motion module
+                if (motion_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(motion_module),
+                        hidden_states,
+                        temb,
+                        motion_hidden_states,
+                    )
+                if (temporal_module is not None) and not skip_mm:
+                    # hidden_states = torch.utils.checkpoint.checkpoint(
+                    #     create_custom_forward(temporal_module),
+                    #     hidden_states.requires_grad_(),
+                    #     temb,
+                    #     None,
+                    # )
+                    hidden_states = (
+                        temporal_module(hidden_states, temb, encoder_hidden_states=None)
+                        if (temporal_module is not None) and not skip_mm
+                        else hidden_states
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                ).sample
+                # add motion module
+                hidden_states = (
+                    motion_module(
+                        hidden_states, temb, encoder_hidden_states=motion_hidden_states
+                    )
+                    if (motion_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                hidden_states = (
+                    temporal_module(hidden_states, temb, encoder_hidden_states=None, debug=True)
+                    if (temporal_module is not None) and not skip_mm
+                    else hidden_states
+                )
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class DownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+        use_inflated_groupnorm=None,
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+        use_temporal_module=None,
+        temporal_module_type=None,
+        temporal_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+        # use_motion_module = False
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    use_inflated_groupnorm=use_inflated_groupnorm,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                )
+                if use_motion_module
+                else None
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+        self.temporal_modules = nn.ModuleList(
+            [
+                (
+                    get_motion_module(
+                        in_channels=out_channels,
+                        motion_module_type=temporal_module_type,
+                        motion_module_kwargs=temporal_module_kwargs,
+                    )
+                    if use_temporal_module
+                    else None
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample3D(
+                        out_channels,
+                        use_conv=True,
+                        out_channels=out_channels,
+                        padding=downsample_padding,
+                        name="op",
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, skip_mm=False):
+        output_states = ()
+        if isinstance(encoder_hidden_states, list):
+            encoder_hidden_states, motion_hidden_states = encoder_hidden_states
+        else:
+            motion_hidden_states = encoder_hidden_states
+        for resnet, motion_module, temporal_module in zip(
+            self.resnets, self.motion_modules, self.temporal_modules
+        ):
+            # print(f"DownBlock3D {self.gradient_checkpointing = }")
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb
+                )
+                if (motion_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(motion_module),
+                        hidden_states,
+                        temb,
+                        motion_hidden_states,
+                    )
+                if (temporal_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(temporal_module),
+                        hidden_states.requires_grad_(),
+                        temb,
+                        None,
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                # add motion module
+                hidden_states = (
+                    motion_module(
+                        hidden_states, temb, encoder_hidden_states=motion_hidden_states
+                    )
+                    if (motion_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                hidden_states = (
+                    temporal_module(
+                        hidden_states, temb, encoder_hidden_states=None, debug=True
+                    )
+                    if (temporal_module is not None) and not skip_mm
+                    else hidden_states
+                )
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class CrossAttnUpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        unet_use_cross_frame_attention=None,
+        unet_use_temporal_attention=None,
+        use_motion_module=None,
+        use_inflated_groupnorm=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+        use_temporal_module=None,
+        temporal_module_type=None,
+        temporal_module_kwargs=None,
+        **transformer_kwargs,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        motion_modules = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    use_inflated_groupnorm=use_inflated_groupnorm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
+                    unet_use_temporal_attention=unet_use_temporal_attention,
+                    **transformer_kwargs,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                )
+                if use_motion_module
+                else None
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+        self.temporal_modules = nn.ModuleList(
+            [
+                (
+                    get_motion_module(
+                        in_channels=out_channels,
+                        motion_module_type=temporal_module_type,
+                        motion_module_kwargs=temporal_module_kwargs,
+                    )
+                    if use_temporal_module
+                    else None
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+        upsample_size=None,
+        attention_mask=None,
+        skip_mm=False,
+    ):
+        if isinstance(encoder_hidden_states, list):
+            encoder_hidden_states, motion_hidden_states = encoder_hidden_states
+        else:
+            motion_hidden_states = encoder_hidden_states
+        for i, (resnet, attn, motion_module, temporal_module) in enumerate(
+            zip(self.resnets, self.attentions, self.motion_modules, self.temporal_modules)
+        ):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb
+                )
+                # hidden_states = attn(
+                #     hidden_states,
+                #     encoder_hidden_states=encoder_hidden_states,
+                # ).sample
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+                if (motion_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(motion_module),
+                        hidden_states,
+                        temb,
+                        motion_hidden_states,
+                    )
+                if (temporal_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(temporal_module),
+                        hidden_states.requires_grad_(),
+                        temb,
+                        None,
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                ).sample
+                # add motion module
+                hidden_states = (
+                    motion_module(
+                        hidden_states, temb, encoder_hidden_states=motion_hidden_states
+                    )
+                    if (motion_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                # add temporal_module
+                hidden_states = (
+                    temporal_module(hidden_states, temb, encoder_hidden_states=None, debug=True)
+                    if (temporal_module is not None) and not skip_mm
+                    else hidden_states
+                )
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states
+class UpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        use_inflated_groupnorm=None,
+        use_motion_module=None,
+        motion_module_type=None,
+        motion_module_kwargs=None,
+        use_temporal_module=None,
+        temporal_module_type=None,
+        temporal_module_kwargs=None,
+    ):
+        super().__init__()
+        resnets = []
+        motion_modules = []
+        # use_motion_module = False
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                    use_inflated_groupnorm=use_inflated_groupnorm,
+                )
+            )
+            motion_modules.append(
+                get_motion_module(
+                    in_channels=out_channels,
+                    motion_module_type=motion_module_type,
+                    motion_module_kwargs=motion_module_kwargs,
+                )
+                if use_motion_module
+                else None
+            )
+        self.resnets = nn.ModuleList(resnets)
+        self.motion_modules = nn.ModuleList(motion_modules)
+        self.temporal_modules = nn.ModuleList(
+            [
+                (
+                    get_motion_module(
+                        in_channels=out_channels,
+                        motion_module_type=temporal_module_type,
+                        motion_module_kwargs=temporal_module_kwargs,
+                    )
+                    if use_temporal_module
+                    else None
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)]
+            )
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        upsample_size=None,
+        encoder_hidden_states=None,
+        skip_mm=False,
+    ):
+        if isinstance(encoder_hidden_states, list):
+            encoder_hidden_states, motion_hidden_states = encoder_hidden_states
+        else:
+            motion_hidden_states = encoder_hidden_states
+        for resnet, motion_module, temporal_module in zip(self.resnets, self.motion_modules, self.temporal_modules):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            # print(f"UpBlock3D {self.gradient_checkpointing = }")
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet), hidden_states, temb
+                )
+                if (motion_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(motion_module),
+                        hidden_states,
+                        temb,
+                        motion_hidden_states,
+                    )
+                if (temporal_module is not None) and not skip_mm:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(temporal_module),
+                        hidden_states.requires_grad_(),
+                        temb,
+                        None,
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = (
+                    motion_module(
+                        hidden_states, temb, encoder_hidden_states=motion_hidden_states
+                    )
+                    if (motion_module is not None) and not skip_mm
+                    else hidden_states
+                )
+                hidden_states = (
+                    temporal_module(hidden_states, temb, encoder_hidden_states=None, debug=True)
+                    if (temporal_module is not None) and not skip_mm
+                    else hidden_states
+                )
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states

pretrained_weights/sd-image-variations-diffusers/.gitattributes ADDED Viewed

	@@ -0,0 +1,32 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

pretrained_weights/sd-image-variations-diffusers/README.md ADDED Viewed

	@@ -0,0 +1,226 @@

+---
+thumbnail: "https://repository-images.githubusercontent.com/523487884/fdb03a69-8353-4387-b5fc-0d85f888a63f"
+datasets:
+- ChristophSchuhmann/improved_aesthetics_6plus
+license: creativeml-openrail-m
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- image-to-image
+---
+# Stable Diffusion Image Variations Model Card
+📣 V2 model released, and blurriness issues fixed! 📣
+🧨🎉 Image Variations is now natively supported in 🤗 Diffusers! 🎉🧨
+![](https://raw.githubusercontent.com/justinpinkney/stable-diffusion/main/assets/im-vars-thin.jpg)
+## Version 2
+This version of Stable Diffusion has been fine tuned from [CompVis/stable-diffusion-v1-4-original](https://huggingface.co/CompVis/stable-diffusion-v-1-4-original) to accept CLIP image embeddings rather than text embeddings. This allows the creation of "image variations" similar to DALLE-2 using Stable Diffusion. This version of the weights has been ported to Hugging Face Diffusers; using it with the Diffusers library requires the [Lambda Diffusers repo](https://github.com/LambdaLabsML/lambda-diffusers).
+This model was trained in two stages and for longer than the original variations model, and gives better image quality and better CLIP-rated similarity compared to the original version.
+See training details and v1 vs v2 comparison below.
+## Example
+Make sure you are using a version of Diffusers >=0.8.0 (for older versions, see the old instructions at the bottom of this model card).
+```python
+from diffusers import StableDiffusionImageVariationPipeline
+from PIL import Image
+device = "cuda:0"
+sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+  "lambdalabs/sd-image-variations-diffusers",
+  revision="v2.0",
+  )
+sd_pipe = sd_pipe.to(device)
+im = Image.open("path/to/image.jpg")
+tform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Resize(
+        (224, 224),
+        interpolation=transforms.InterpolationMode.BICUBIC,
+        antialias=False,
+        ),
+    transforms.Normalize(
+      [0.48145466, 0.4578275, 0.40821073],
+      [0.26862954, 0.26130258, 0.27577711]),
+])
+inp = tform(im).to(device).unsqueeze(0)
+out = sd_pipe(inp, guidance_scale=3)
+out["images"][0].save("result.jpg")
+```
+### The importance of resizing correctly... (or not)
+Note that due to a bit of an oversight during training, the model expects resized images without anti-aliasing. This turns out to make a big difference, and it is important to do the resizing the same way during inference. When passing a PIL image to the Diffusers pipeline, anti-aliasing will be applied during resize, so it's better to input a tensor which you have prepared manually according to the transform in the example above!
+Here are examples of images generated without (top) and with (bottom) anti-aliasing during resize. (Input is [this image](https://github.com/SHI-Labs/Versatile-Diffusion/blob/master/assets/ghibli.jpg))
+![](alias-montage.jpg)
+![](default-montage.jpg)
+### V1 vs V2
+Here's an example of V1 vs V2. Version two was trained more carefully and for longer; see the details below. V2-top vs V1-bottom
+![](v2-montage.jpg)
+![](v1-montage.jpg)
+Input images:
+![](inputs.jpg)
+One important thing to note is that due to the longer training V2 appears to have memorised some common images from the training data, e.g. now the previous example of the Girl with a Pearl Earring almost perfectly reproduces the original rather than creating variations. You can always use v1 by specifying `revision="v1.0"`.
+v2 output for Girl with a Pearl Earring as input (guidance scale=3)
+![](earring.jpg)
+# Training
+**Training Procedure**
+This model is fine tuned from Stable Diffusion v1-3 where the text encoder has been replaced with an image encoder. The training procedure is the same as for Stable Diffusion except for the fact that images are encoded through a ViT-L/14 image-encoder including the final projection layer to the CLIP shared embedding space. The model was trained on LAION improved aesthetics 6plus.
+- **Hardware:** 8 x A100-40GB GPUs (provided by [Lambda GPU Cloud](https://lambdalabs.com/service/gpu-cloud))
+- **Optimizer:** AdamW
+- **Stage 1** - Fine tune only CrossAttention layer weights from Stable Diffusion v1.4 model
+  - **Steps**: 46,000
+  - **Batch:** batch size=4, GPUs=8, Gradient Accumulations=4. Total batch size=128
+  - **Learning rate:** warmup to 1e-5 for 10,000 steps and then kept constant
+- **Stage 2** - Resume from Stage 1 training the whole unet
+  - **Steps**: 50,000
+  - **Batch:** batch size=4, GPUs=8, Gradient Accumulations=5. Total batch size=160
+  - **Learning rate:** warmup to 1e-5 for 5,000 steps and then kept constant
+Training was done using a [modified version of the original Stable Diffusion training code](https://github.com/justinpinkney/stable-diffusion).
+# Uses
+_The following section is adapted from the [Stable Diffusion model card](https://huggingface.co/CompVis/stable-diffusion-v1-4)_
+## Direct Use
+The model is intended for research purposes only. Possible research areas and
+tasks include
+- Safe deployment of models which have the potential to generate harmful content.
+- Probing and understanding the limitations and biases of generative models.
+- Generation of artworks and use in design and other artistic processes.
+- Applications in educational or creative tools.
+- Research on generative models.
+Excluded uses are described below.
+ ### Misuse, Malicious Use, and Out-of-Scope Use
+The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
+#### Out-of-Scope Use
+The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
+#### Misuse and Malicious Use
+Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
+- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
+- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
+- Impersonating individuals without their consent.
+- Sexual content without consent of the people who might see it.
+- Mis- and disinformation
+- Representations of egregious violence and gore
+- Sharing of copyrighted or licensed material in violation of its terms of use.
+- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
+## Limitations and Bias
+### Limitations
+- The model does not achieve perfect photorealism
+- The model cannot render legible text
+- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
+- Faces and people in general may not be generated properly.
+- The model was trained mainly with English captions and will not work as well in other languages.
+- The autoencoding part of the model is lossy
+- The model was trained on a large-scale dataset
+  [LAION-5B](https://laion.ai/blog/laion-5b/) which contains adult material
+  and is not fit for product use without additional safety mechanisms and
+  considerations.
+- No additional measures were used to deduplicate the dataset. As a result, we observe some degree of memorization for images that are duplicated in the training data.
+  The training data can be searched at [https://rom1504.github.io/clip-retrieval/](https://rom1504.github.io/clip-retrieval/) to possibly assist in the detection of memorized images.
+### Bias
+While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
+Stable Diffusion v1 was trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
+which consists of images that are primarily limited to English descriptions.
+Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
+This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
+ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
+### Safety Module
+The intended use of this model is with the [Safety Checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) in Diffusers.
+This checker works by checking model outputs against known hard-coded NSFW concepts.
+The concepts are intentionally hidden to reduce the likelihood of reverse-engineering this filter.
+Specifically, the checker compares the class probability of harmful concepts in the embedding space of the `CLIPModel` *after generation* of the images.
+The concepts are passed into the model with the generated image and compared to a hand-engineered weight for each NSFW concept.
+## Old instructions
+If you are using a diffusers version <0.8.0 there is no `StableDiffusionImageVariationPipeline`,
+in this case you need to use an older revision (`2ddbd90b14bc5892c19925b15185e561bc8e5d0a`) in conjunction with the lambda-diffusers repo:
+First clone [Lambda Diffusers](https://github.com/LambdaLabsML/lambda-diffusers) and install any requirements (in a virtual environment in the example below):
+```bash
+git clone https://github.com/LambdaLabsML/lambda-diffusers.git
+cd lambda-diffusers
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+Then run the following python code:
+```python
+from pathlib import Path
+from lambda_diffusers import StableDiffusionImageEmbedPipeline
+from PIL import Image
+import torch
+device = "cuda" if torch.cuda.is_available() else "cpu"
+pipe = StableDiffusionImageEmbedPipeline.from_pretrained(
+"lambdalabs/sd-image-variations-diffusers",
+revision="2ddbd90b14bc5892c19925b15185e561bc8e5d0a",
+)
+pipe = pipe.to(device)
+im = Image.open("your/input/image/here.jpg")
+num_samples = 4
+image = pipe(num_samples*[im], guidance_scale=3.0)
+image = image["sample"]
+base_path = Path("outputs/im2im")
+base_path.mkdir(exist_ok=True, parents=True)
+for idx, im in enumerate(image):
+    im.save(base_path/f"{idx:06}.jpg")
+```
+*This model card was written by: Justin Pinkney and is based on the [Stable Diffusion model card](https://huggingface.co/CompVis/stable-diffusion-v1-4).*

pretrained_weights/sd-image-variations-diffusers/alias-montage.jpg ADDED Viewed

Git LFS Details

SHA256: 785972e472ca53fdc631cbc5cc6e735448c513638adce9049d1963e401a05c7a
Pointer size: 131 Bytes
Size of remote file: 151 kB

pretrained_weights/sd-image-variations-diffusers/default-montage.jpg ADDED Viewed

Git LFS Details

SHA256: bd42b0ee127f0f4df5912eca1f4d479150c9020b2c6136c19c633fa983294aa7
Pointer size: 131 Bytes
Size of remote file: 148 kB

pretrained_weights/sd-image-variations-diffusers/earring.jpg ADDED Viewed

Git LFS Details

SHA256: 87b8a0583e481839a98d27068370979b36a6e2bc95aa79ffeeb89cd324d47bb6
Pointer size: 131 Bytes
Size of remote file: 113 kB

pretrained_weights/sd-image-variations-diffusers/feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

pretrained_weights/sd-image-variations-diffusers/image_encoder/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
+  "architectures": [
+    "CLIPVisionModelWithProjection"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1"
+}

pretrained_weights/sd-image-variations-diffusers/image_encoder/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
+size 1215993967

pretrained_weights/sd-image-variations-diffusers/inputs.jpg ADDED Viewed

pretrained_weights/sd-image-variations-diffusers/model_index.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "_class_name": "StableDiffusionImageVariationPipeline",
+  "_diffusers_version": "0.9.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "image_encoder": [
+    "transformers",
+    "CLIPVisionModelWithProjection"
+  ],
+  "requires_safety_checker": true,
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "PNDMScheduler"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

pretrained_weights/sd-image-variations-diffusers/safety_checker/config.json ADDED Viewed

	@@ -0,0 +1,181 @@

+{
+  "_commit_hash": "ca6f97f838ae1b5bf764f31363a21f388f4d8f3e",
+  "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/safety_checker",
+  "architectures": [
+    "StableDiffusionSafetyChecker"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 768,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.25.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": {
+    "hidden_size": 768,
+    "intermediate_size": 3072,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.25.1",
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "intermediate_size": 4096,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14
+  }
+}

pretrained_weights/sd-image-variations-diffusers/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.9.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "trained_betas": null
+}

pretrained_weights/sd-image-variations-diffusers/unet/config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.9.0",
+  "act_fn": "silu",
+  "attention_head_dim": 8,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dual_cross_attention": false,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "sample_size": 64,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "use_linear_projection": false
+}

pretrained_weights/sd-image-variations-diffusers/unet/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ee23e3368e4e7c0e4ef636ed61923609c97fcaa583f8bb416e3e0986d4a0cfc6
+size 3438354725

pretrained_weights/sd-image-variations-diffusers/v1-montage.jpg ADDED Viewed

Git LFS Details

SHA256: 607396d9a79e0649a898e46f5edfc2a37775eb02ee2d7977c9435db3f9f9db2f
Pointer size: 131 Bytes
Size of remote file: 613 kB

pretrained_weights/sd-image-variations-diffusers/v2-montage.jpg ADDED Viewed

Git LFS Details

SHA256: cb189ff32754768525afdc8c7f3fc99b9ab6747a8edd330297148b0332f48e71
Pointer size: 131 Bytes
Size of remote file: 570 kB

pretrained_weights/sd-image-variations-diffusers/vae/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.9.0",
+  "_name_or_path": "stabilityai/sd-vae-ft-mse",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 256,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

pretrained_weights/sd-image-variations-diffusers/vae/diffusion_pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+size 334707217

pretrained_weights/stable-video-diffusion-img2vid-xt/.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+output_tile.gif filter=lfs diff=lfs merge=lfs -text

pretrained_weights/stable-video-diffusion-img2vid-xt/LICENSE.md ADDED Viewed

	@@ -0,0 +1,58 @@

+STABILITY AI COMMUNITY LICENSE AGREEMENT
+Last Updated: July 5, 2024
+1. INTRODUCTION
+This Agreement applies to any individual person or entity (“You”, “Your” or “Licensee”) that uses or distributes any portion or element of the Stability AI Materials  or Derivative Works thereof for any Research & Non-Commercial or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.
+This Agreement is intended to allow research, non-commercial, and limited commercial uses of the Models free of charge. In order to ensure that certain limited commercial uses of the Models continue to be allowed, this Agreement  preserves free access to the Models for people or organizations  generating annual revenue of less than US $1,000,000 (or local currency equivalent).
+By clicking “I Accept”  or by using or distributing or using any portion or element of the Stability Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then “You” includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity’s behalf.
+2. RESEARCH & NON-COMMERCIAL USE LICENSE
+Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose. “Research Purpose” means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. “Non-Commercial Purpose” means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.
+3. COMMERCIAL USE LICENSE
+Subject to the terms of this Agreement (including the remainder of this Section III), Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Commercial Purpose. “Commercial Purpose” means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for commercial advantage or monetary compensation to You or others, including but not limited to, (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, and (ii) for Your business’s or organization’s internal operations.
+If You are using or distributing the Stability AI Materials for a Commercial Purpose, You must register with Stability AI at (https://stability.ai/community-license). If at any time You or Your Affiliate(s), either individually or in aggregate, generate more than USD $1,000,000 in annual revenue (or the equivalent thereof in Your local currency), regardless of whether that revenue is generated directly or indirectly from the Stability AI Materials or Derivative Works, any licenses granted to You under this Agreement shall terminate as of such date. You must request a license from Stability AI at (https://stability.ai/enterprise) , which Stability AI may grant to You in its sole discretion. If you receive Stability AI Materials, or any Derivative Works thereof, from a Licensee as part of an integrated end user product, then Section III of this Agreement will not apply to you.
+4. GENERAL TERMS
+Your Research, Non-Commercial, and Commercial License(s) under this Agreement are subject to the following terms.
+a.  Distribution & Attribution. If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Community License, Copyright ©  Stability AI Ltd. All Rights Reserved”, and (iii) prominently display “Powered by Stability AI” on a related website, user interface, blogpost, about page, or product documentation.  If You create a Derivative Work, You may add your own attribution notice(s) to the “Notice” text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the “Notice” text file that You changed the Stability AI Materials and how it was modified.
+b.  Use Restrictions. Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI’s AUP, which is hereby incorporated by reference. Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works).
+c.  Intellectual Property.
+(i) Trademark License.  No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein.
+(ii)  Ownership of Derivative Works.  As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI’s ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI.
+(iii)  Ownership of Outputs. As between You and Stability AI, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law.
+(iv)  Disputes.  If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement.
+(v)  Feedback.  From time to time, You may provide Stability AI with verbal and/or written suggestions, comments or other feedback related to Stability AI’s existing or prospective technology, products or services (collectively, “Feedback”). You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided “AS IS” and You make no warranties whatsoever about any Feedback.
+d.  Disclaimer Of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE STABILITY AI MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE STABILITY AI MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE STABILITY AI MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS.
+e.  Limitation Of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING.
+f.  Term And Termination. The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Stability AI Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Stability AI Materials or Derivative Works. Section IV(d), (e), and (g) shall survive the termination of this Agreement.
+g.  Governing Law.  This Agreement will be governed by and constructed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.
+5. DEFINITIONS
+“Affiliate(s)” means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, “control” means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
+"Agreement" means this Stability AI Community License Agreement.
+“AUP” means the Stability AI Acceptable Use Policy available at (https://stability.ai/use-policy), as may be updated from time to time.
+"Derivative Work(s)” means (a) any derivative work of the Stability AI Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output, including “fine tune” and “low-rank adaptation” models derived from a Model or a Model’s output, but do not include the output of any Model.
+“Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software or Models.
+“Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing listed on Stability’s Core Models Webpage available at (https://stability.ai/core-models), as may be updated from time to time.
+"Stability AI" or "we" means Stability AI Ltd. and its Affiliates.
+"Software" means Stability AI’s proprietary software made available under this Agreement now or in the future.
+“Stability AI Materials” means, collectively, Stability’s proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
+“Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.

pretrained_weights/stable-video-diffusion-img2vid-xt/README.md ADDED Viewed

	@@ -0,0 +1,99 @@

+---
+pipeline_tag: image-to-video
+license: other
+license_name: stable-video-diffusion-community
+license_link: LICENSE.md
+---
+# Stable Video Diffusion Image-to-Video Model Card
+<!-- Provide a quick summary of what the model is/does. -->
+![row01](output_tile.gif)
+Stable Video Diffusion (SVD) Image-to-Video is a diffusion model that takes in a still image as a conditioning frame, and generates a video from it.
+Please note: For commercial use, please refer to https://stability.ai/license.
+## Model Details
+### Model Description
+Stable Video Diffusion (SVD) Image-to-Video is a latent diffusion model trained to generate short video clips from a conditioning image.
+This model was trained to generate 25 frames at resolution 576x1024 given a context frame of the same size, finetuned from [SVD Image-to-Video [14 frames]](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid).
+We also finetune the widely used [f8-decoder](https://huggingface.co/docs/diffusers/api/models/autoencoderkl#loading-from-the-original-format) for temporal consistency.
+For convenience, we additionally provide the model with the
+standard frame-wise decoder [here](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/svd_xt_image_decoder.safetensors).
+- **Developed by:** Stability AI
+- **Funded by:** Stability AI
+- **Model type:** Generative image-to-video model
+- **Finetuned from model:** SVD Image-to-Video [14 frames]
+### Model Sources
+For research purposes, we recommend our `generative-models` GitHub repository (https://github.com/Stability-AI/generative-models),
+which implements the most popular diffusion frameworks (both training and inference).
+- **Repository:** https://github.com/Stability-AI/generative-models
+- **Paper:** https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets
+## Evaluation
+![comparison](comparison.png)
+The chart above evaluates user preference for SVD-Image-to-Video over [GEN-2](https://research.runwayml.com/gen2) and [PikaLabs](https://www.pika.art/).
+SVD-Image-to-Video is preferred by human voters in terms of video quality. For details on the user study, we refer to the [research paper](https://stability.ai/research/stable-video-diffusion-scaling-latent-video-diffusion-models-to-large-datasets).
+## Uses
+### Direct Use
+The model is intended for both non-commercial and commercial usage. You can use this model for non-commercial or research purposes under this [license](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt/blob/main/LICENSE.md). Possible research areas and tasks include:
+- Research on generative models.
+- Safe deployment of models which have the potential to generate harmful content.
+- Probing and understanding the limitations and biases of generative models.
+- Generation of artworks and use in design and other artistic processes.
+- Applications in educational or creative tools.
+For commercial use, please refer to https://stability.ai/license.
+Excluded uses are described below.
+### Out-of-Scope Use
+The model was not trained to produce factual or true representations of people or events,
+and therefore using the model to generate such content is out-of-scope for the abilities of this model.
+The model should not be used in any way that violates Stability AI's [Acceptable Use Policy](https://stability.ai/use-policy).
+## Limitations and Bias
+### Limitations
+- The generated videos are rather short (<= 4sec), and the model does not achieve perfect photorealism.
+- The model may generate videos without motion, or very slow camera pans.
+- The model cannot be controlled through text.
+- The model cannot render legible text.
+- Faces and people in general may not be generated properly.
+- The autoencoding part of the model is lossy.
+### Recommendations
+The model is intended for both non-commercial and commercial usage.
+## How to Get Started with the Model
+Check out https://github.com/Stability-AI/generative-models
+# Appendix:
+All considered potential data sources were included for final training, with none held out as the proposed data filtering methods described in the SVD paper handle the quality control/filtering of the dataset. With regard to safety/NSFW filtering, sources considered were either deemed safe or filtered with the in-house NSFW filters.
+No explicit human labor is involved in training data preparation. However, human evaluation for model outputs and quality was extensively used to evaluate model quality and performance. The evaluations were performed with third-party contractor platforms (Amazon Sagemaker, Amazon Mechanical Turk, Prolific) with fluent English-speaking contractors from various countries, primarily from the USA, UK, and Canada. Each worker was paid $12/hr for the time invested in the evaluation.
+No other third party was involved in the development of this model; the model was fully developed in-house at Stability AI.
+Training the SVD checkpoints required a total of approximately 200,000 A100 80GB hours. The majority of the training occurred on 48 × 8 A100s, while some stages took more/less than that. The resulting CO2 emission is ~19,000 kg CO2 eq., and energy consumed is ~64,000 kWh.
+The released checkpoints (SVD/SVD-XT) are image-to-video models that generate short videos/animations closely following the given input image. Since the model relies on an existing supplied image, the potential risks of disclosing specific material or novel unsafe content are minimal. This was also evaluated by third-party independent red-teaming services, which agree with our conclusion to a high degree of confidence (>90% in various areas of safety red-teaming). The external evaluations were also performed for trustworthiness, leading to >95% confidence in real, trustworthy videos.
+With the default settings at the time of release, SVD takes ~100s for generation, and SVD-XT takes ~180s on an A100 80GB card. Several optimizations to trade off quality / memory / speed can be done to perform faster inference or inference on lower VRAM cards.
+The information related to the model and its development process and usage protocols can be found in the GitHub repo, associated research paper, and HuggingFace model page/cards.
+The released model inference & demo code has image-level watermarking enabled by default, which can be used to detect the outputs. This is done via the imWatermark Python library.
+The model can be used to generate videos from static initial images. However, we prohibit unlawful, obscene, or misleading uses of the model consistent with the terms of our license and Acceptable Use Policy. For the open-weights release, our training data filtering mitigations alleviate this risk to some extent. These restrictions are explicitly enforced on user-facing interfaces at stablevideo.com, where a warning is issued. We do not take any responsibility for third-party interfaces. Submitting initial images that bypass input filters to tease out offensive or inappropriate content listed above is also prohibited. Safety filtering checks at stablevideo.com run on model inputs and outputs independently. More details on our user-facing interfaces can be found here: https://www.stablevideo.com/faq. Beyond the Acceptable Use Policy and other mitigations and conditions described here, the model is not subject to additional model behavior interventions of the type described in the Foundation Model Transparency Index.
+For stablevideo.com, we store preference data in the form of upvotes/downvotes on user-generated videos, and we have a pairwise ranker that runs while a user generates videos. This usage data is solely used for improving Stability AI’s future image/video models and services. No other third-party entities are given access to the usage data beyond Stability AI and maintainers of stablevideo.com.
+For usage statistics of SVD, we refer interested users to HuggingFace model download/usage statistics as a primary indicator. Third-party applications also have reported model usage statistics. We might also consider releasing aggregate usage statistics of stablevideo.com on reaching some milestones.

pretrained_weights/stable-video-diffusion-img2vid-xt/comparison.png ADDED Viewed

Git LFS Details

SHA256: 517263334c2011dd28f819b831ccc32a8dd676895429693477b936dc88600d15
Pointer size: 131 Bytes
Size of remote file: 147 kB

pretrained_weights/stable-video-diffusion-img2vid-xt/feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/image_encoder/config.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-xt/snapshots/9703ded20c957c340781ee710b75660826deb487/image_encoder",
+  "architectures": [
+    "CLIPVisionModelWithProjection"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "gelu",
+  "hidden_size": 1280,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 5120,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 32,
+  "patch_size": 14,
+  "projection_dim": 1024,
+  "torch_dtype": "float16",
+  "transformers_version": "4.34.0.dev0"
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/model_index.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_class_name": "StableVideoDiffusionPipeline",
+  "_diffusers_version": "0.24.0.dev0",
+  "_name_or_path": "diffusers/svd-xt",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "image_encoder": [
+    "transformers",
+    "CLIPVisionModelWithProjection"
+  ],
+  "scheduler": [
+    "diffusers",
+    "EulerDiscreteScheduler"
+  ],
+  "unet": [
+    "diffusers",
+    "UNetSpatioTemporalConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKLTemporalDecoder"
+  ]
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/output_tile.gif ADDED Viewed

Git LFS Details

SHA256: 2340a9809e36fa9634633c7cc5fd256737c620ba47151726c85173512dc5c8ff
Pointer size: 133 Bytes
Size of remote file: 18.6 MB

pretrained_weights/stable-video-diffusion-img2vid-xt/scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "_class_name": "EulerDiscreteScheduler",
+  "_diffusers_version": "0.24.0.dev0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "interpolation_type": "linear",
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "set_alpha_to_one": false,
+  "sigma_max": 700.0,
+  "sigma_min": 0.002,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "timestep_spacing": "leading",
+  "timestep_type": "continuous",
+  "trained_betas": null,
+  "use_karras_sigmas": true
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/svd_xt.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2652c23d64a1da5f14d55011b9b6dce55f2e72e395719f1cd1f8a079b00a451
+size 9559625980

pretrained_weights/stable-video-diffusion-img2vid-xt/svd_xt_image_decoder.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99aa889bf6d1ca28e026755b83ba37e3072ad79b45dd4c94fae14bee7482263b
+size 9503252964

pretrained_weights/stable-video-diffusion-img2vid-xt/unet/config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "_class_name": "UNetSpatioTemporalConditionModel",
+  "_diffusers_version": "0.24.0.dev0",
+  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-xt/snapshots/9703ded20c957c340781ee710b75660826deb487/unet",
+  "addition_time_embed_dim": 256,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "cross_attention_dim": 1024,
+  "down_block_types": [
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "CrossAttnDownBlockSpatioTemporal",
+    "DownBlockSpatioTemporal"
+  ],
+  "in_channels": 8,
+  "layers_per_block": 2,
+  "num_attention_heads": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "num_frames": 25,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": 768,
+  "sample_size": 96,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal",
+    "CrossAttnUpBlockSpatioTemporal"
+  ]
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/unet/diffusion_pytorch_model.fp16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fbc02e90f37d422f5e3a4aeaee95f6629dc8c45ca211b951626e930daf2bddf
+size 3049435868

pretrained_weights/stable-video-diffusion-img2vid-xt/unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7783d82729af04f26ded4641a5952617fe331fc46add332fb9e47674fecc6ad7
+size 6098682464

pretrained_weights/stable-video-diffusion-img2vid-xt/vae/config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "_class_name": "AutoencoderKLTemporalDecoder",
+  "_diffusers_version": "0.24.0.dev0",
+  "_name_or_path": "/home/suraj_huggingface_co/.cache/huggingface/hub/models--diffusers--svd-xt/snapshots/9703ded20c957c340781ee710b75660826deb487/vae",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "out_channels": 3,
+  "sample_size": 768,
+  "scaling_factor": 0.18215
+}

pretrained_weights/stable-video-diffusion-img2vid-xt/vae/diffusion_pytorch_model.fp16.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:af602cd0eb4ad6086ec94fbf1438dfb1be5ec9ac03fd0215640854e90d6463a3
+size 195531910

pretrained_weights/stable-video-diffusion-img2vid-xt/vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d92aa595a53d9da9faf594f09910ee869d5d567c8bb0362d5095673c69997d6
+size 391017740

pretrained_weights/xnemo_denoising_unet.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ff582dff6e19b08278378cfc244cf7203c6f70e3dcaba492ec39f9abb9be3d2
+size 4927016814

pretrained_weights/xnemo_motion_encoder.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0230c49cebff21fd81c14fc61fc509ab1120b61415d40571e7dc1b9df1fc6b6f
+size 246869630