XavierJiezou commited on
Commit
921503d
·
verified ·
1 Parent(s): f8ca548

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +29 -7
  3. README.md +16 -24
  4. datasets/celeba.py +209 -0
  5. datasets/cityscapes.py +303 -0
  6. docs/gcdp.png +3 -0
  7. environment.yaml +66 -0
  8. example.py +25 -0
  9. imagen_pytorch/__init__.py +26 -0
  10. imagen_pytorch/cli.py +52 -0
  11. imagen_pytorch/configs.py +181 -0
  12. imagen_pytorch/data.py +73 -0
  13. imagen_pytorch/elucidated_imagen.py +846 -0
  14. imagen_pytorch/imagen_pytorch.py +2515 -0
  15. imagen_pytorch/imagen_video/__init__.py +1 -0
  16. imagen_pytorch/imagen_video/imagen_video.py +1662 -0
  17. imagen_pytorch/joint_imagen.py +1942 -0
  18. imagen_pytorch/t5.py +119 -0
  19. imagen_pytorch/trainer.py +1782 -0
  20. imagen_pytorch/utils.py +61 -0
  21. imagen_pytorch/version.py +1 -0
  22. pyproject.toml +3 -0
  23. repaint/LICENSES/LICENSE +13 -0
  24. repaint/LICENSES/LICENSE_guided_diffusion +21 -0
  25. repaint/LICENSES/README.md +11 -0
  26. repaint/README.md +205 -0
  27. repaint/conf_mgt/__init__.py +18 -0
  28. repaint/conf_mgt/conf_base.py +128 -0
  29. repaint/confs/face_example.yml +87 -0
  30. repaint/confs/test_c256_ev2li.yml +86 -0
  31. repaint/confs/test_c256_ex64.yml +86 -0
  32. repaint/confs/test_c256_genhalf.yml +86 -0
  33. repaint/confs/test_c256_nn2.yml +86 -0
  34. repaint/confs/test_c256_thick.yml +86 -0
  35. repaint/confs/test_c256_thin.yml +86 -0
  36. repaint/confs/test_inet256_ev2li.yml +87 -0
  37. repaint/confs/test_inet256_ex64.yml +87 -0
  38. repaint/confs/test_inet256_genhalf.yml +87 -0
  39. repaint/confs/test_inet256_nn2.yml +87 -0
  40. repaint/confs/test_inet256_thick.yml +87 -0
  41. repaint/confs/test_inet256_thin.yml +87 -0
  42. repaint/confs/test_p256_ev2li.yml +86 -0
  43. repaint/confs/test_p256_ex64.yml +86 -0
  44. repaint/confs/test_p256_genhalf.yml +86 -0
  45. repaint/confs/test_p256_nn2.yml +86 -0
  46. repaint/confs/test_p256_thick.yml +86 -0
  47. repaint/confs/test_p256_thin.yml +86 -0
  48. repaint/download.sh +19 -0
  49. repaint/guided_diffusion/__init__.py +19 -0
  50. repaint/guided_diffusion/dist_util.py +43 -0
.gitattributes CHANGED
@@ -47,3 +47,4 @@ tedigan/ext/experiment/inference_coupled/input_label.png filter=lfs diff=lfs mer
47
  tedigan/ext/experiment/inference_results/input_label.png filter=lfs diff=lfs merge=lfs -text
48
  uniteandconquer/utils/faces.png filter=lfs diff=lfs merge=lfs -text
49
  uniteandconquer/utils/natural.png filter=lfs diff=lfs merge=lfs -text
 
 
47
  tedigan/ext/experiment/inference_results/input_label.png filter=lfs diff=lfs merge=lfs -text
48
  uniteandconquer/utils/faces.png filter=lfs diff=lfs merge=lfs -text
49
  uniteandconquer/utils/natural.png filter=lfs diff=lfs merge=lfs -text
50
+ docs/gcdp.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,3 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
@@ -106,10 +132,8 @@ ipython_config.py
106
  #pdm.lock
107
  # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
  # in version control.
109
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
  .pdm.toml
111
- .pdm-python
112
- .pdm-build/
113
 
114
  # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
  __pypackages__/
@@ -161,7 +185,5 @@ cython_debug/
161
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
  #.idea/
163
 
164
- # push to github
165
- *.pt
166
- *.pth
167
- *.ckpt
 
1
+ logs
2
+ debug
3
+ wandb_dir
4
+ checkpoints
5
+ squeue.txt
6
+ results
7
+
8
+ # Created by https://www.toptal.com/developers/gitignore/api/python,linux
9
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux
10
+
11
+ ### Linux ###
12
+ *~
13
+
14
+ # temporary files which can be created if a process still has a handle open of a deleted file
15
+ .fuse_hidden*
16
+
17
+ # KDE directory preferences
18
+ .directory
19
+
20
+ # Linux trash folder which might appear on any partition or disk
21
+ .Trash-*
22
+
23
+ # .nfs files are created when an open file is removed but is still being accessed
24
+ .nfs*
25
+
26
+ ### Python ###
27
  # Byte-compiled / optimized / DLL files
28
  __pycache__/
29
  *.py[cod]
 
132
  #pdm.lock
133
  # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
134
  # in version control.
135
+ # https://pdm.fming.dev/#use-with-ide
136
  .pdm.toml
 
 
137
 
138
  # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
139
  __pypackages__/
 
185
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
186
  #.idea/
187
 
188
+ # End of https://www.toptal.com/developers/gitignore/api/python,linux
189
+ _baselines/datasetGAN/StyleGAN.pytorch/outputs/cityscapes/2023-03-05/log.txt
 
 
README.md CHANGED
@@ -1,24 +1,16 @@
1
- # MMFace
2
-
3
- ## TODO
4
-
5
- - [ ] Diffusion-driven GAN Inversion Reproduction
6
- - [ ] Datasets Download
7
-
8
- ## Installation
9
-
10
- ## Datasets
11
-
12
- ## Training
13
-
14
- ## Evaluation
15
-
16
- ## Methods
17
-
18
- - [x] [TediGAN (CVPR 2021)](https://github.com/IIGROUP/TediGAN)
19
- - [x] [UniteandConquer (CVPR 2023)](https://github.com/Nithin-GK/UniteandConquer)
20
- - [x] [Collaborative-Diffusion (CVPR 2023)](https://github.com/ziqihuangg/Collaborative-Diffusion)
21
- - [x] [GCDP (ICCV 2023)](https://github.com/pmh9960/GCDP) (Text2Image)
22
- - [x] [PixelFace+ (MM 2023)](https://github.com/qazwsx671713/PixelFace-Plus)
23
- - [ ] [Diffusion-driven GAN Inversion (CVPR 2024)](https://github.com/1211sh/Diffusion-driven_GAN-Inversion/)
24
- - [ ] [MM2Latent (ECCVW 2024)](https://github.com/Open-Debin/MM2Latent)
 
1
+ # Evaluation
2
+
3
+ ```bash
4
+ CUDA_VISIBLE_DEVICES=4 python test.py --model_type=base_128x128 \
5
+ --checkpoint_path checkpoints/celeba/base_128x128_flip_100/checkpoint.500000.pt \
6
+ --end_sample_idx=1 \
7
+ --test_batch_size=1 \
8
+ --dataset celeba \
9
+ --num_classes 19 \
10
+ --save_path=results/celeba/base.png \
11
+ --test_captions "The woman wears earrings. She has wavy hair. She is attractive."
12
+ ```
13
+
14
+ ```bash
15
+ CUDA_VISIBLE_DEVICES=4 python test.py --conf_path confs/face_example.yml
16
+ ```
 
 
 
 
 
 
 
 
datasets/celeba.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import os.path as osp
3
+ import random
4
+ from collections import namedtuple
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from PIL import Image
12
+ from torch.utils.data import Dataset
13
+ from torchvision.transforms import Compose, InterpolationMode, RandomCrop, RandomHorizontalFlip, Resize, ToTensor
14
+
15
+ CelebaClass = namedtuple('CelebaClass', ['name', 'id', 'color'])
16
+ # autopep8: off
17
+ classes = [
18
+ CelebaClass('background', 0, ( 0, 0, 0)),
19
+ CelebaClass('skin', 1, (204, 0, 0)),
20
+ CelebaClass('nose', 2, ( 76, 153, 0)),
21
+ CelebaClass('eye_g', 3, (204, 204, 0)),
22
+ CelebaClass('l_eye', 4, ( 51, 51, 255)),
23
+ CelebaClass('r_eye', 5, (204, 0, 204)),
24
+ CelebaClass('l_brow', 6, ( 0, 255, 255)),
25
+ CelebaClass('r_brow', 7, (255, 204, 204)),
26
+ CelebaClass('l_ear', 8, (102, 51, 0)),
27
+ CelebaClass('r_ear', 9, (255, 0, 0)),
28
+ CelebaClass('mouth', 10, (102, 204, 0)),
29
+ CelebaClass('u_lip', 11, (255, 255, 0)),
30
+ CelebaClass('l_lip', 12, ( 0, 0, 153)),
31
+ CelebaClass('hair', 13, ( 0, 0, 204)),
32
+ CelebaClass('hat', 14, (255, 51, 153)),
33
+ CelebaClass('ear_r', 15, ( 0, 204, 204)),
34
+ CelebaClass('neck_l', 16, ( 0, 51, 0)),
35
+ CelebaClass('neck', 17, (255, 153, 51)),
36
+ CelebaClass('cloth', 18, ( 0, 204, 0)),
37
+ ]
38
+ # autopep8: on
39
+ num_classes = 19
40
+ mapping_id = torch.tensor([x.id for x in classes])
41
+ colors = torch.tensor([cls.color for cls in classes])
42
+
43
+
44
def normalize_to_neg_one_to_one(img):
    """Rescale values from [0, 1] into [-1, 1]."""
    doubled = img * 2
    return doubled - 1
46
+
47
+
48
def unnormalize_to_zero_to_one(img):
    """Rescale values from [-1, 1] back into [0, 1]."""
    shifted = img + 1
    return shifted * 0.5
50
+
51
+
52
def unnormalize_and_clamp_to_zero_to_one(img):
    """Unnormalize from [-1, 1] to [0, 1] on the CPU and clamp into range."""
    restored = unnormalize_to_zero_to_one(img.cpu())
    return torch.clamp(restored, 0, 1)
54
+
55
+
56
def exists(val):
    """Return True when *val* is not None."""
    return not (val is None)
58
+
59
+
60
def default(val, d):
    """Return *val* if it is set; otherwise *d*, calling it when callable."""
    if val is not None:
        return val
    if callable(d):
        return d()
    return d
64
+
65
+
66
class ToTensorNoNorm():
    """Convert a PIL image / array-like to a (C, H, W) tensor without scaling.

    Unlike torchvision's ``ToTensor``, the values are kept as-is (no /255
    normalization), which is what integer segmentation label maps need.
    """

    def __call__(self, X_i):
        X_i = np.array(X_i)

        if len(X_i.shape) == 2:
            # Add channel dim.
            X_i = X_i[:, :, None]

        # X_i is already an ndarray at this point, so the original second
        # `np.array(X_i, copy=False)` pass was redundant; hand it straight
        # to torch and move channels first.
        return torch.from_numpy(X_i).permute(2, 0, 1)
75
+
76
+
77
def interpolate_3d(x, *args, **kwargs):
    """Apply ``F.interpolate`` to a 3-D (C, H, W) tensor by batching/unbatching."""
    batched = x.unsqueeze(0)
    resized = F.interpolate(batched, *args, **kwargs)
    return resized.squeeze(0)
79
+
80
+
81
class RandomResize(nn.Module):
    """Rescale a (C, H, W) tensor by a factor drawn uniformly from ``scale``."""

    def __init__(self, scale=(0.5, 2.0), mode='nearest'):
        super().__init__()
        self.scale = scale
        self.mode = mode

    def get_random_scale(self):
        """Draw one scale factor uniformly from [scale[0], scale[1]]."""
        low, high = self.scale
        return random.uniform(low, high)

    def forward(self, x):
        factor = self.get_random_scale()
        return interpolate_3d(x, scale_factor=factor, mode=self.mode)
94
+
95
+
96
def read_jsonl(jsonl_path):
    """Read a JSON-Lines file and return a list with one parsed object per line.

    Parses with the stdlib ``json`` module (one JSON document per line)
    instead of requiring the third-party ``jsonlines`` package; blank lines
    are skipped for robustness.
    """
    import json
    lines = []
    with open(jsonl_path, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped:
                lines.append(json.loads(stripped))
    return lines
103
+
104
+
105
class CelebaDataset(Dataset):
    """CelebA-HQ dataset yielding (image, segmentation label, caption) triples.

    Images are read from ``<root>/CelebA-HQ-img`` and label maps from
    ``<root>/CelebAMask-HQ-mask-anno/preprocessed``; captions come from
    ``<caption_list_dir>/<split>_captions.jsonl``. Image and label are
    resized to a common 1024x1024 canvas, concatenated, jointly augmented,
    then split back into (3, side_x, side_y) and (1, side_x, side_y).
    """

    def __init__(
        self,
        root="",
        split='train',
        side_x=128,
        side_y=128,
        caption_list_dir='',
        augmentation_type='flip',
    ):
        """
        Args:
            root: dataset root containing the image and mask folders.
            split: selects ``{split}_captions.jsonl`` in caption_list_dir.
            side_x, side_y: output spatial size after augmentation.
            caption_list_dir: directory holding the caption jsonl files.
            augmentation_type: 'none', 'flip', or 'resizedCrop_<min>_<max>'.
        """
        super().__init__()
        self.root = Path(root)
        self.image_dir = osp.join(self.root, 'CelebA-HQ-img')
        self.label_dir = osp.join(self.root, 'CelebAMask-HQ-mask-anno', 'preprocessed')
        self.split = split
        self.side_x = side_x
        self.side_y = side_y

        self.caption_list_dir = caption_list_dir
        captions_jsonl = read_jsonl(osp.join(self.caption_list_dir, f'{split}_captions.jsonl'))
        # Map file stem -> 'text' field of its caption record
        # (random.choice over it later implies the field is a sequence).
        self.caption_dict = {}
        for caption_jsonl in captions_jsonl:
            self.caption_dict[osp.splitext(caption_jsonl['file_name'])[0]] = caption_jsonl['text']

        if augmentation_type == 'none':
            self.augmentation = Compose([
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                # ToTensor(),
            ])
        elif augmentation_type == 'flip':
            self.augmentation = Compose([
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                RandomHorizontalFlip(p=0.5),
                # ToTensor(),
            ])
        elif 'resizedCrop' in augmentation_type:
            # e.g. 'resizedCrop_0.5_2.0' -> random scale range [0.5, 2.0]
            scale = [float(s) for s in augmentation_type.split('_')[1:]]
            assert len(scale) == 2, scale
            self.augmentation = Compose([
                RandomResize(scale=scale, mode='nearest'),
                RandomCrop((1024, 1024)),
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                RandomHorizontalFlip(p=0.5),
                # ToTensor(),
            ])
        else:
            raise NotImplementedError(augmentation_type)

        # verification
        # Keep only files that have a caption entry; sorting both lists keeps
        # images and labels aligned by file stem (checked by the asserts below).
        self.images = sorted([osp.join(self.image_dir, file) for file in os.listdir(self.image_dir)
                              if osp.splitext(file)[0] in self.caption_dict.keys()])
        self.labels = sorted([osp.join(self.label_dir, file) for file in os.listdir(self.label_dir)
                              if osp.splitext(file)[0] in self.caption_dict.keys()])

        assert len(self.images) == len(self.labels), f'{len(self.images)} != {len(self.labels)}'
        for img, lbl in zip(self.images, self.labels):
            assert osp.splitext(osp.basename(img))[0] == osp.splitext(osp.basename(lbl))[0]

    def __len__(self):
        return len(self.images)

    def random_sample(self):
        """Return a uniformly random item."""
        return self.__getitem__(random.randint(0, self.__len__() - 1))

    def sequential_sample(self, ind):
        """Return the item after *ind*, wrapping around to index 0."""
        if ind >= self.__len__() - 1:
            return self.__getitem__(0)
        return self.__getitem__(ind + 1)

    def skip_sample(self, ind):
        # Fallback used when the file at `ind` cannot be loaded.
        return self.sequential_sample(ind=ind)

    def get_caption_list_objects(self, idx):
        """Pick one caption at random from the caption entry of image *idx*."""
        filename = osp.splitext(osp.basename(self.images[idx]))[0]
        caption = random.choice(self.caption_dict[filename])
        return caption

    def __getitem__(self, idx):
        """Return (image[3, H, W], label[1, H, W], caption); skips unreadable files."""
        # load image label
        try:
            original_pil_image = Image.open(self.images[idx]).convert("RGB")
            original_pil_target = Image.open(self.labels[idx])
        except (OSError, ValueError) as e:
            print(f"An exception occurred trying to load file {self.images[idx]}.")
            print(f"Skipping index {idx}")
            return self.skip_sample(idx)

        # Transforms
        # Resize both to the same 1024x1024 canvas so they can be concatenated
        # and augmented jointly; the label is cast to float for the cat.
        image = Resize((1024, 1024), InterpolationMode.NEAREST)(ToTensor()(original_pil_image))
        label = Resize((1024, 1024), InterpolationMode.NEAREST)(ToTensorNoNorm()(original_pil_target).float())
        img_lbl = self.augmentation(torch.cat([image, label]))

        caption = self.get_caption_list_objects(idx)

        # First 3 channels are the image, the rest the label map.
        return img_lbl[:3], img_lbl[3:], caption
200
+
201
+
202
def transform_lbl(lbl: torch.Tensor, *args, **kwargs):
    """Colorize a batch of CelebA class-index maps into RGB images in [0, 1]."""
    index_map = lbl.long()
    if index_map.size(1) == 1:
        # Drop the singleton channel axis so palette indexing yields (B, H, W, 3).
        index_map = index_map.squeeze(1)
    rgb = colors[index_map]
    return rgb.permute(0, 3, 1, 2) / 255.
datasets/cityscapes.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ from collections import namedtuple
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from cityscapesscripts.helpers.labels import trainId2label
12
+ from PIL import Image
13
+ from torch.utils.data import Dataset
14
+ from torchvision.transforms import Compose, InterpolationMode, RandomCrop, RandomHorizontalFlip, Resize, ToTensor
15
+
16
+ CityscapesClass = namedtuple('CityscapesClass', ['name', 'id', 'train_id', 'category', 'category_id',
17
+ 'has_instances', 'ignore_in_eval', 'color'])
18
+ # autopep8: off
19
+ classes = [
20
+ CityscapesClass('unlabeled', 0, 255, 'void', 0, False, True, ( 0, 0, 0)),
21
+ CityscapesClass('ego vehicle', 1, 255, 'void', 0, False, True, ( 0, 0, 0)),
22
+ CityscapesClass('rectification border', 2, 255, 'void', 0, False, True, ( 0, 0, 0)),
23
+ CityscapesClass('out of roi', 3, 255, 'void', 0, False, True, ( 0, 0, 0)),
24
+ CityscapesClass('static', 4, 255, 'void', 0, False, True, ( 0, 0, 0)),
25
+ CityscapesClass('dynamic', 5, 255, 'void', 0, False, True, (111, 74, 0)),
26
+ CityscapesClass('ground', 6, 255, 'void', 0, False, True, ( 81, 0, 81)),
27
+ CityscapesClass('road', 7, 0, 'flat', 1, False, False, (128, 64, 128)),
28
+ CityscapesClass('sidewalk', 8, 1, 'flat', 1, False, False, (244, 35, 232)),
29
+ CityscapesClass('parking', 9, 255, 'flat', 1, False, True, (250, 170, 160)),
30
+ CityscapesClass('rail track', 10, 255, 'flat', 1, False, True, (230, 150, 140)),
31
+ CityscapesClass('building', 11, 2, 'construction', 2, False, False, ( 70, 70, 70)),
32
+ CityscapesClass('wall', 12, 3, 'construction', 2, False, False, (102, 102, 156)),
33
+ CityscapesClass('fence', 13, 4, 'construction', 2, False, False, (190, 153, 153)),
34
+ CityscapesClass('guard rail', 14, 255, 'construction', 2, False, True, (180, 165, 180)),
35
+ CityscapesClass('bridge', 15, 255, 'construction', 2, False, True, (150, 100, 100)),
36
+ CityscapesClass('tunnel', 16, 255, 'construction', 2, False, True, (150, 120, 90)),
37
+ CityscapesClass('pole', 17, 5, 'object', 3, False, False, (153, 153, 153)),
38
+ CityscapesClass('polegroup', 18, 255, 'object', 3, False, True, (153, 153, 153)),
39
+ CityscapesClass('traffic light', 19, 6, 'object', 3, False, False, (250, 170, 30)),
40
+ CityscapesClass('traffic sign', 20, 7, 'object', 3, False, False, (220, 220, 0)),
41
+ CityscapesClass('vegetation', 21, 8, 'nature', 4, False, False, (107, 142, 35)),
42
+ CityscapesClass('terrain', 22, 9, 'nature', 4, False, False, (152, 251, 152)),
43
+ CityscapesClass('sky', 23, 10, 'sky', 5, False, False, ( 70, 130, 180)),
44
+ CityscapesClass('person', 24, 11, 'human', 6, True, False, (220, 20, 60)),
45
+ CityscapesClass('rider', 25, 12, 'human', 6, True, False, (255, 0, 0)),
46
+ CityscapesClass('car', 26, 13, 'vehicle', 7, True, False, ( 0, 0, 142)),
47
+ CityscapesClass('truck', 27, 14, 'vehicle', 7, True, False, ( 0, 0, 70)),
48
+ CityscapesClass('bus', 28, 15, 'vehicle', 7, True, False, ( 0, 60, 100)),
49
+ CityscapesClass('caravan', 29, 255, 'vehicle', 7, True, True, ( 0, 0, 90)),
50
+ CityscapesClass('trailer', 30, 255, 'vehicle', 7, True, True, ( 0, 0, 110)),
51
+ CityscapesClass('train', 31, 16, 'vehicle', 7, True, False, ( 0, 80, 100)),
52
+ CityscapesClass('motorcycle', 32, 17, 'vehicle', 7, True, False, ( 0, 0, 230)),
53
+ CityscapesClass('bicycle', 33, 18, 'vehicle', 7, True, False, (119, 11, 32)),
54
+ CityscapesClass('license plate', -1, -1, 'vehicle', 7, False, True, ( 0, 0, 142)),
55
+ ]
56
+ # autopep8: on
57
+
58
+ map_id_to_id = torch.tensor([x.id for x in classes])
59
+ map_id_to_category_id = torch.tensor([x.category_id for x in classes])
60
+ map_id_to_train_id = torch.tensor([x.train_id for x in classes])
61
+ id_type_to_classes = dict(
62
+ id=dict(num_classes=34,
63
+ map_fn=torch.tensor([x if x not in (-1, ) else 0 for x in map_id_to_id]),
64
+ names=[cls.name for cls in classes][:-1]),
65
+ category_id=dict(num_classes=8,
66
+ map_fn=map_id_to_category_id,
67
+ names=[cls.name for cls in classes][:-1]), # TODO it is wrong
68
+ train_id=dict(num_classes=20,
69
+ map_fn=torch.tensor([x if x not in (-1, 255) else 19 for x in map_id_to_train_id]),
70
+ names=[i.name for i in classes if i.train_id != 255][:-1] + ['unlabeled']),
71
+ )
72
+
73
+
74
def normalize_to_neg_one_to_one(img):
    """Rescale values from [0, 1] into [-1, 1]."""
    doubled = img * 2
    return doubled - 1
76
+
77
+
78
def unnormalize_to_zero_to_one(img):
    """Rescale values from [-1, 1] back into [0, 1]."""
    shifted = img + 1
    return shifted * 0.5
80
+
81
+
82
def unnormalize_and_clamp_to_zero_to_one(img):
    """Unnormalize from [-1, 1] to [0, 1] on the CPU and clamp into range."""
    restored = unnormalize_to_zero_to_one(img.cpu())
    return torch.clamp(restored, 0, 1)
84
+
85
+
86
def exists(val):
    """Return True when *val* is not None."""
    return not (val is None)
88
+
89
+
90
def default(val, d):
    """Return *val* if it is set; otherwise *d*, calling it when callable."""
    if val is not None:
        return val
    if callable(d):
        return d()
    return d
94
+
95
+
96
class ToTensorNoNorm():
    """Convert a PIL image / array-like to a (C, H, W) tensor without scaling.

    Unlike torchvision's ``ToTensor``, the values are kept as-is (no /255
    normalization), which is what integer segmentation label maps need.
    """

    def __call__(self, X_i):
        X_i = np.array(X_i)

        if len(X_i.shape) == 2:
            # Add channel dim.
            X_i = X_i[:, :, None]

        # X_i is already an ndarray at this point, so the original second
        # `np.array(X_i, copy=False)` pass was redundant; hand it straight
        # to torch and move channels first.
        return torch.from_numpy(X_i).permute(2, 0, 1)
105
+
106
+
107
def interpolate_3d(x, *args, **kwargs):
    """Apply ``F.interpolate`` to a 3-D (C, H, W) tensor by batching/unbatching."""
    batched = x.unsqueeze(0)
    resized = F.interpolate(batched, *args, **kwargs)
    return resized.squeeze(0)
109
+
110
+
111
class RandomResize(nn.Module):
    """Rescale a (C, H, W) tensor by a factor drawn uniformly from ``scale``."""

    def __init__(self, scale=(0.5, 2.0), mode='nearest'):
        super().__init__()
        self.scale = scale
        self.mode = mode

    def get_random_scale(self):
        """Draw one scale factor uniformly from [scale[0], scale[1]]."""
        low, high = self.scale
        return random.uniform(low, high)

    def forward(self, x):
        factor = self.get_random_scale()
        return interpolate_3d(x, scale_factor=factor, mode=self.mode)
124
+
125
+
126
def read_jsonl(jsonl_path):
    """Read a JSON-Lines file and return a list with one parsed object per line.

    Parses with the stdlib ``json`` module (one JSON document per line)
    instead of requiring the third-party ``jsonlines`` package; blank lines
    are skipped for robustness.
    """
    import json
    lines = []
    with open(jsonl_path, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped:
                lines.append(json.loads(stripped))
    return lines
133
+
134
+
135
class CityscapesDataset(Dataset):
    """Cityscapes dataset yielding (image, train-id label, caption) triples.

    Images live under ``<root>/leftImg8bit`` and fine labels under
    ``<root>/gtFine``; captions come from
    ``<caption_list_dir>/<split>_captions.jsonl``. Raw label ids are remapped
    through ``id_type_to_classes[id_type]['map_fn']`` before augmentation.
    """

    def __init__(
        self,
        root="",
        split='train',
        side_x=64,
        side_y=64,
        shuffle=False,
        caption_list_dir='',
        id_type='train_id',
        augmentation_type='flip',
    ):
        """
        Args:
            root: dataset root containing leftImg8bit/ and gtFine/.
            split: selects ``{split}_captions.jsonl`` in caption_list_dir.
            side_x, side_y: output spatial size after augmentation.
            shuffle: if True, skip_sample falls back to a random item.
            caption_list_dir: directory holding the caption jsonl files.
            id_type: only 'train_id' is supported (asserted below).
            augmentation_type: 'none', 'flip', or 'resizedCrop_<min>_<max>'.
        """
        super().__init__()
        self.root = Path(root)
        self.image_dir = os.path.join(self.root, 'leftImg8bit')
        self.label_dir = os.path.join(self.root, 'gtFine')
        self.split = split
        self.metadata = read_jsonl(os.path.join(caption_list_dir, f'{split}_captions.jsonl'))
        self.metadata = sorted(self.metadata, key=lambda line: line['file_name'])

        assert id_type == 'train_id'
        self.map_fn = id_type_to_classes[id_type]['map_fn']
        self.class_names = id_type_to_classes[id_type]['names']
        self.num_classes = id_type_to_classes[id_type]['num_classes']

        # self.text_ctx_len = text_ctx_len
        self.shuffle = shuffle
        self.side_x = side_x
        self.side_y = side_y

        if augmentation_type == 'none':
            self.augmentation = Compose([
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                # ToTensor(),
            ])
        elif augmentation_type == 'flip':
            self.augmentation = Compose([
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                RandomHorizontalFlip(p=0.5),
                # ToTensor(),
            ])
        elif 'resizedCrop' in augmentation_type:
            # e.g. 'resizedCrop_0.5_2.0' -> random scale range [0.5, 2.0]
            scale = [float(s) for s in augmentation_type.split('_')[1:]]
            assert len(scale) == 2, scale
            self.augmentation = Compose([
                RandomResize(scale=scale, mode='nearest'),
                RandomCrop((1024, 2048)),
                Resize((side_x, side_y), interpolation=InterpolationMode.NEAREST),
                RandomHorizontalFlip(p=0.5),
                # ToTensor(),
            ])
        else:
            raise NotImplementedError(augmentation_type)

        # filenames of images and labels
        self.images = []
        self.labels = []
        for line in self.metadata:
            cityname = line['file_name'].split('_')[0]
            # NOTE(review): the on-disk split is re-derived from the city name
            # here, independent of the `split` ctor argument — confirm intended.
            split = 'val' if cityname in ['frankfurt', 'lindau', 'munster'] else 'train'
            img_dir = os.path.join(self.image_dir, split, cityname, line['file_name'])
            lbl_dir = os.path.join(self.label_dir, split, cityname,
                                   line['file_name'].replace('leftImg8bit.png', 'gtFine_labelIds.png'))
            assert os.path.isfile(img_dir), img_dir
            assert os.path.isfile(lbl_dir), lbl_dir
            self.images.append(img_dir)
            self.labels.append(lbl_dir)

    def __len__(self):
        return len(self.images)

    def random_sample(self):
        """Return a uniformly random item."""
        return self.__getitem__(random.randint(0, self.__len__() - 1))

    def sequential_sample(self, ind):
        """Return the item after *ind*, wrapping around to index 0."""
        if ind >= self.__len__() - 1:
            return self.__getitem__(0)
        return self.__getitem__(ind + 1)

    def skip_sample(self, ind):
        # Fallback used when the file at `ind` cannot be loaded.
        if self.shuffle:
            return self.random_sample()
        return self.sequential_sample(ind=ind)

    def get_caption_list_objects(self, idx):
        """Pick one caption at random from the 'text' field of record *idx*."""
        caption = random.choice(self.metadata[idx]['text'])
        return caption

    def _load_json(self, path):
        """Load and return a JSON file."""
        with open(path, 'r') as file:
            data = json.load(file)
        return data

    def __getitem__(self, idx):
        """Return (image[3, H, W], label[1, H, W], caption); skips unreadable files."""
        # load image
        try:
            original_pil_image = Image.open(self.images[idx]).convert("RGB")
            original_pil_target = Image.open(self.labels[idx])
        except (OSError, ValueError) as e:
            print(f"An exception occurred trying to load file {self.images[idx]}.")
            print(f"Skipping index {idx}")
            return self.skip_sample(idx)

        # Transforms
        image = ToTensor()(original_pil_image)
        label = ToTensorNoNorm()(original_pil_target)
        # Remap raw label ids to train ids via the lookup table.
        label = self.map_fn[label.long()]
        # NOTE(review): `label` stays integer-typed here; torch.cat with the
        # float image relies on type promotion — confirm on the target torch
        # version (celeba.py casts its label to float explicitly).
        img_lbl = self.augmentation(torch.cat([image, label]))

        caption = self.get_caption_list_objects(idx)

        # First 3 channels are the image, the rest the label map.
        return img_lbl[:3], img_lbl[3:], caption
247
+
248
+
249
def indices_segmentation_to_img(indices, colors):
    """Map a batch of class-index maps to RGB images in [0, 1] via a palette.

    The train-id ignore index (255) is remapped to class 19 before lookup.
    """
    if indices.size(1) == 1:
        # Drop the singleton channel axis.
        indices = indices.squeeze(1)
    # for train_id: fold the ignore index into the last palette entry
    indices = torch.where(indices == 255, torch.full_like(indices, 19), indices)
    rgb = colors[indices]
    return rgb.permute(0, 3, 1, 2) / 255.
258
+
259
+
260
def get_colors_from_id_type(id_type):
    """Build a (num_classes, 3) RGB palette for the given id mapping.

    Walks the raw-id -> mapped-id table and keeps the color of the first raw
    class that maps to each id; the ignore value 255 is folded into id 19.
    """
    num_classes = len(id_type_to_classes[id_type]['map_fn'].unique())
    colors = torch.zeros((num_classes, 3))
    exist_ids = []
    for idx, cls in enumerate(id_type_to_classes[id_type]['map_fn']):
        if cls == 255:
            cls = 19
        if cls not in exist_ids:
            # First raw class seen for this mapped id wins.
            colors[cls] = torch.tensor(classes[idx].color)
            exist_ids.append(cls)
    return colors
271
+
272
+
273
def transform_lbl(lbl, id_type='id'):
    """Colorize a label map using the palette belonging to *id_type*."""
    palette = get_colors_from_id_type(id_type)
    return indices_segmentation_to_img(lbl, palette)
276
+
277
+
278
def transform_img_lbl(x, id_type='id', unnorm=True):
    """Split a concatenated image+label tensor and colorize the label part.

    Args:
        x: tensor of shape (b, 3 + L, h, w) or (3 + L, h, w); the first three
           channels are the image, the rest the label indices.
        id_type: which palette to use (key of ``id_type_to_classes``).
        unnorm: if True, map the image channels from [-1, 1] back to [0, 1].

    Returns:
        Tensor stacking images then colorized labels along the batch dim.
    """
    colors = get_colors_from_id_type(id_type)

    x = x.detach().cpu()
    # Promote a single sample to a batch of one.
    x = x.unsqueeze(0) if x.dim() == 3 else x

    # b, _, h, w = x.shape
    img = x[:, :3]
    lbl = x[:, 3:].long()
    img = unnormalize_to_zero_to_one(img) if unnorm else img
    saved_img = torch.cat([img, indices_segmentation_to_img(lbl, colors)])  # b * 2, 3, h ,w
    return saved_img
290
+
291
+
292
def trainId2label_fn(train_id_map):
    """Convert a train-id map back to raw Cityscapes label ids.

    Classes with ``ignoreInEval`` are skipped, so their pixels keep the
    initial value 0.
    """
    saved_label_id = torch.zeros_like(train_id_map)
    for t_id, label in trainId2label.items():
        if label.ignoreInEval:
            continue
        saved_label_id[train_id_map == t_id] = label.id
    return saved_label_id
299
+
300
+
301
def change_19_to_255(id_map):
    """Map class 19 back to the ignore index 255, in place (returns id_map)."""
    id_map.masked_fill_(id_map == 19, 255)
    return id_map
docs/gcdp.png ADDED

Git LFS Details

  • SHA256: c6a2ae1b22b793b7746bd8cf888e3273c195624320a2b411def38402dc174e52
  • Pointer size: 132 Bytes
  • Size of remote file: 3.99 MB
environment.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ channels:
2
+ - pytorch
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - cudatoolkit=11.3.1
7
+ - python=3.9.17
8
+ - pytorch=1.10.1
9
+ - torchvision=0.11.2
10
+ - pip:
11
+ - accelerate==0.21.0
12
+ - annotated-types==0.5.0
13
+ - appdirs==1.4.4
14
+ - attrs==23.1.0
15
+ - autopep8==2.0.2
16
+ - certifi==2023.7.22
17
+ - charset-normalizer==3.2.0
18
+ - cityscapesscripts==2.2.2
19
+ - click==8.1.6
20
+ - coloredlogs==15.0.1
21
+ - contourpy==1.1.0
22
+ - cycler==0.11.0
23
+ - docker-pycreds==0.4.0
24
+ - einops==0.6.1
25
+ - einops-exts==0.0.4
26
+ - ema-pytorch==0.2.3
27
+ - filelock==3.12.2
28
+ - fonttools==4.41.1
29
+ - fsspec==2023.6.0
30
+ - gitdb==4.0.10
31
+ - gitpython==3.1.32
32
+ - huggingface-hub==0.16.4
33
+ - humanfriendly==10.0
34
+ - idna==3.4
35
+ - importlib-resources==6.0.0
36
+ - jsonlines==3.1.0
37
+ - kiwisolver==1.4.4
38
+ - kornia==0.6.12
39
+ - matplotlib==3.7.2
40
+ - packaging==23.1
41
+ - pathtools==0.1.2
42
+ - protobuf==4.23.4
43
+ - psutil==5.9.5
44
+ - pycodestyle==2.11.0
45
+ - pydantic==2.1.1
46
+ - pydantic-core==2.4.0
47
+ - pyparsing==3.0.9
48
+ - pyquaternion==0.9.9
49
+ - python-dateutil==2.8.2
50
+ - pytorch-warmup==0.1.1
51
+ - pyyaml==6.0.1
52
+ - regex==2023.6.3
53
+ - requests==2.31.0
54
+ - safetensors==0.3.1
55
+ - sentencepiece==0.1.99
56
+ - sentry-sdk==1.28.1
57
+ - setproctitle==1.3.2
58
+ - smmap==5.0.0
59
+ - tokenizers==0.13.3
60
+ - tomli==2.0.1
61
+ - tqdm==4.65.0
62
+ - transformers==4.31.0
63
+ - typing==3.7.4.3
64
+ - urllib3==2.0.4
65
+ - wandb==0.15.7
66
+ - zipp==3.16.2
example.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from PIL import Image
import numpy as np

# Load the image (hard-coded path to a RePaint keep-mask; adjust as needed)
image_path = '/home/zouxuechao/mmface/gcdp/repaint/data/datasets/gt_keep_masks/gcdp/base_0_0.png'
image = Image.open(image_path)

# Convert the image to a numpy array
image_array = np.array(image)

# Check if the image is 256x128 with 3 channels (RGB)
# (numpy shape is (height, width, channels): 128 rows x 256 columns)
if image_array.shape == (128, 256, 3):
    # Modify the left 128x128 section to be 0 (black) for all 3 channels
    image_array[:, :128, :] = 0

    # Modify the right 128x128 section to be 255 (white) for all 3 channels
    image_array[:, 128:, :] = 255

    # Convert back to an image
    modified_image = Image.fromarray(image_array)

    # Save the modified image (written to the current working directory)
    modified_image.save('modified_image.png')
else:
    print("The image does not have the required size (256x128 with 3 channels).")
imagen_pytorch/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from imagen_pytorch.imagen_pytorch import Imagen, Unet
2
+ from imagen_pytorch.imagen_pytorch import NullUnet
3
+ from imagen_pytorch.imagen_pytorch import BaseUnet64, SRUnet256, SRUnet1024
4
+ from imagen_pytorch.trainer import ImagenTrainer
5
+ from imagen_pytorch.version import __version__
6
+
7
+ # imagen using the elucidated ddpm from Tero Karras' new paper
8
+
9
+ from imagen_pytorch.elucidated_imagen import ElucidatedImagen
10
+
11
+ # config driven creation of imagen instances
12
+
13
+ from imagen_pytorch.configs import UnetConfig, ImagenConfig, ElucidatedImagenConfig, ImagenTrainerConfig
14
+
15
+ # utils
16
+
17
+ from imagen_pytorch.utils import load_imagen_from_checkpoint
18
+
19
+ # video
20
+
21
+ from imagen_pytorch.imagen_video import Unet3D
22
+
23
+ # joint
24
+
25
+ from imagen_pytorch.joint_imagen import BaseJointUnet, JointImagen, SRJointUnet
26
+ from imagen_pytorch.trainer import ImagenTrainer, JointImagenTrainer
imagen_pytorch/cli.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import click
2
+ import torch
3
+ from pathlib import Path
4
+
5
+ from imagen_pytorch import load_imagen_from_checkpoint
6
+ from imagen_pytorch.version import __version__
7
+ from imagen_pytorch.utils import safeget
8
+
9
def exists(val):
    """Return True when *val* is not None."""
    return not (val is None)
11
+
12
def simple_slugify(text, max_length = 255):
    """Turn free text into a filesystem-friendly slug, truncated to max_length."""
    slug = text.replace("-", "_")
    slug = slug.replace(",", "")
    slug = slug.replace(" ", "_")
    slug = slug.replace("|", "--")
    return slug.strip('-_')[:max_length]
14
+
15
def main():
    # Placeholder entry point; the actual CLI is the `imagen` click command below.
    pass
17
+
18
@click.command()
@click.option('--model', default = './imagen.pt', help = 'path to trained Imagen model')
@click.option('--cond_scale', default = 5, help = 'conditioning scale (classifier free guidance) in decoder')
@click.option('--load_ema', default = True, help = 'load EMA version of unets if available')
@click.argument('text')
def imagen(
    model,
    cond_scale,
    load_ema,
    text
):
    # CLI: sample one image from a trained Imagen checkpoint for TEXT and
    # save it as ./<slug-of-text>.png. (Comments used instead of a docstring
    # so click's --help output stays unchanged.)
    model_path = Path(model)
    full_model_path = str(model_path.resolve())
    assert model_path.exists(), f'model not found at {full_model_path}'
    loaded = torch.load(str(model_path))

    # get version

    version = safeget(loaded, 'version')
    print(f'loading Imagen from {full_model_path}, saved at version {version} - current package version is {__version__}')

    # get imagen parameters and type

    imagen = load_imagen_from_checkpoint(str(model_path), load_ema_if_available = load_ema)
    # NOTE(review): requires a CUDA device; there is no CPU fallback here.
    imagen.cuda()

    # generate image

    pil_image = imagen.sample(text, cond_scale = cond_scale, return_pil_images = True)

    # save under a slug of the prompt text
    image_path = f'./{simple_slugify(text)}.png'
    pil_image[0].save(image_path)

    print(f'image saved to {str(image_path)}')
    return
imagen_pytorch/configs.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pydantic import BaseModel, validator, root_validator
3
+ from typing import List, Iterable, Optional, Union, Tuple, Dict, Any
4
+ from enum import Enum
5
+
6
+ from imagen_pytorch.imagen_pytorch import Imagen, Unet, Unet3D, NullUnet
7
+ from imagen_pytorch.trainer import ImagenTrainer
8
+ from imagen_pytorch.elucidated_imagen import ElucidatedImagen
9
+ from imagen_pytorch.t5 import DEFAULT_T5_NAME, get_encoded_dim
10
+
11
+ # helper functions
12
+
13
def exists(val):
    """True when *val* is not None."""
    return not (val is None)

def default(val, d):
    """Return *val* unless it is None, in which case return fallback *d*."""
    return d if val is None else val

def ListOrTuple(inner_type):
    """Typing alias accepting either a list or a tuple of *inner_type* (pydantic-friendly)."""
    return Union[List[inner_type], Tuple[inner_type]]

def SingleOrList(inner_type):
    """Typing alias accepting a bare *inner_type* or a list/tuple of it."""
    return Union[inner_type, ListOrTuple(inner_type)]
24
+
25
+ # noise schedule
26
+
27
class NoiseSchedule(Enum):
    """Diffusion noise schedules accepted by the Imagen configs."""
    cosine = 'cosine'
    linear = 'linear'
30
+
31
class AllowExtraBaseModel(BaseModel):
    """Config base: keeps unknown keys (forwarded later as kwargs) and serializes enums by value."""
    class Config:
        extra = "allow"          # unknown fields are retained rather than rejected
        use_enum_values = True   # .dict() emits the enum's value, not the Enum member
35
+
36
+ # imagen pydantic classes
37
+
38
class NullUnetConfig(BaseModel):
    """Marker config for a placeholder (no-op) unet slot in the cascade."""
    is_null: bool  # presence flag only; the value itself is never inspected below

    def create(self):
        """Instantiate the no-op unet."""
        return NullUnet()
43
+
44
class UnetConfig(AllowExtraBaseModel):
    """Schema for a 2d Unet; extra keys pass straight through to `Unet(**...)`."""
    dim: int
    dim_mults: ListOrTuple(int)
    text_embed_dim: int = get_encoded_dim(DEFAULT_T5_NAME)
    cond_dim: int = None
    channels: int = 3
    attn_dim_head: int = 32
    attn_heads: int = 16

    def create(self):
        """Build the Unet from all declared plus extra fields."""
        return Unet(**self.dict())
55
+
56
class Unet3DConfig(AllowExtraBaseModel):
    """Schema for a 3d (video) Unet; extra keys pass straight through to `Unet3D(**...)`."""
    dim: int
    dim_mults: ListOrTuple(int)
    text_embed_dim: int = get_encoded_dim(DEFAULT_T5_NAME)
    cond_dim: int = None
    channels: int = 3
    attn_dim_head: int = 32
    attn_heads: int = 16

    def create(self):
        """Build the Unet3D from all declared plus extra fields."""
        return Unet3D(**self.dict())
67
+
68
class ImagenConfig(AllowExtraBaseModel):
    """Declarative config that materializes an `Imagen` cascade from plain dicts."""
    unets: ListOrTuple(Union[UnetConfig, Unet3DConfig, NullUnetConfig])
    image_sizes: ListOrTuple(int)
    video: bool = False
    timesteps: SingleOrList(int) = 1000
    noise_schedules: SingleOrList(NoiseSchedule) = 'cosine'
    text_encoder_name: str = DEFAULT_T5_NAME
    channels: int = 3
    loss_type: str = 'l2'
    cond_drop_prob: float = 0.5

    @validator('image_sizes')
    def check_image_sizes(cls, image_sizes, values):
        # each unet in the cascade needs exactly one target resolution
        unets = values.get('unets')
        if len(image_sizes) != len(unets):
            raise ValueError(f'image sizes length {len(image_sizes)} must be equivalent to the number of unets {len(unets)}')
        return image_sizes

    def create(self):
        """Instantiate the Imagen; unet class is chosen per-slot (null / video / image)."""
        decoder_kwargs = self.dict()
        unets_kwargs = decoder_kwargs.pop('unets')
        is_video = decoder_kwargs.pop('video', False)

        unets = []

        # `self.unets` gives the parsed config objects (for type dispatch),
        # `unets_kwargs` the serialized dicts actually fed to the constructors
        for unet, unet_kwargs in zip(self.unets, unets_kwargs):
            if isinstance(unet, NullUnetConfig):
                unet_klass = NullUnet
            elif is_video:
                unet_klass = Unet3D
            else:
                unet_klass = Unet

            unets.append(unet_klass(**unet_kwargs))

        imagen = Imagen(unets, **decoder_kwargs)

        # stash the originating config on the model so checkpoints can round-trip it
        imagen._config = self.dict().copy()
        return imagen
107
+
108
class ElucidatedImagenConfig(AllowExtraBaseModel):
    """Declarative config that materializes an `ElucidatedImagen` (EDM-style sampler) cascade."""
    unets: ListOrTuple(Union[UnetConfig, Unet3DConfig, NullUnetConfig])
    image_sizes: ListOrTuple(int)
    video: bool = False
    text_encoder_name: str = DEFAULT_T5_NAME
    channels: int = 3
    cond_drop_prob: float = 0.5
    num_sample_steps: SingleOrList(int) = 32
    sigma_min: SingleOrList(float) = 0.002
    sigma_max: SingleOrList(int) = 80
    sigma_data: SingleOrList(float) = 0.5
    rho: SingleOrList(int) = 7
    P_mean: SingleOrList(float) = -1.2
    P_std: SingleOrList(float) = 1.2
    S_churn: SingleOrList(int) = 80
    S_tmin: SingleOrList(float) = 0.05
    S_tmax: SingleOrList(int) = 50
    S_noise: SingleOrList(float) = 1.003

    @validator('image_sizes')
    def check_image_sizes(cls, image_sizes, values):
        # each unet in the cascade needs exactly one target resolution
        unets = values.get('unets')
        if len(image_sizes) != len(unets):
            raise ValueError(f'image sizes length {len(image_sizes)} must be equivalent to the number of unets {len(unets)}')
        return image_sizes

    def create(self):
        """Instantiate the ElucidatedImagen; unet class is chosen per-slot (null / video / image)."""
        decoder_kwargs = self.dict()
        unets_kwargs = decoder_kwargs.pop('unets')
        is_video = decoder_kwargs.pop('video', False)

        # note: the previous pre-loop `unet_klass = Unet3D if is_video else Unet`
        # assignment was dead code (always overwritten inside the loop) and was removed

        unets = []

        for unet, unet_kwargs in zip(self.unets, unets_kwargs):
            if isinstance(unet, NullUnetConfig):
                unet_klass = NullUnet
            elif is_video:
                unet_klass = Unet3D
            else:
                unet_klass = Unet

            unets.append(unet_klass(**unet_kwargs))

        imagen = ElucidatedImagen(unets, **decoder_kwargs)

        # stash the originating config on the model so checkpoints can round-trip it
        imagen._config = self.dict().copy()
        return imagen
157
+
158
class ImagenTrainerConfig(AllowExtraBaseModel):
    """Config that builds an `ImagenTrainer` (and the Imagen it wraps) from plain dicts."""
    imagen: dict
    elucidated: bool = False
    video: bool = False
    use_ema: bool = True
    lr: SingleOrList(float) = 1e-4
    eps: SingleOrList(float) = 1e-8
    beta1: float = 0.9
    beta2: float = 0.99
    max_grad_norm: Optional[float] = None
    group_wd_params: bool = True
    warmup_steps: SingleOrList(Optional[int]) = None
    cosine_decay_max_steps: SingleOrList(Optional[int]) = None

    def create(self):
        """Instantiate the trainer, delegating model construction to the imagen config."""
        trainer_kwargs = self.dict()

        imagen_config = trainer_kwargs.pop('imagen')
        elucidated = trainer_kwargs.pop('elucidated')
        # bugfix: `video` was previously referenced without ever being defined
        # (NameError at runtime); it belongs to the imagen config, not the
        # trainer, so pop it out of the trainer kwargs here
        video = trainer_kwargs.pop('video')

        imagen_config_klass = ElucidatedImagenConfig if elucidated else ImagenConfig
        imagen = imagen_config_klass(**{**imagen_config, 'video': video}).create()

        return ImagenTrainer(imagen, **trainer_kwargs)
imagen_pytorch/data.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from functools import partial
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.utils.data import Dataset, DataLoader
7
+ from torchvision import transforms as T, utils
8
+
9
+ from PIL import Image
10
+
11
+ # helpers functions
12
+
13
def exists(val):
    """True when *val* is not None."""
    return not (val is None)

def cycle(dl):
    """Yield items from *dl* forever, restarting iteration each time it is exhausted."""
    while True:
        yield from dl

def convert_image_to(img_type, image):
    """Convert a PIL image to *img_type* mode; no-op when already in that mode."""
    return image.convert(img_type) if image.mode != img_type else image
25
+
26
+ # dataset and dataloader
27
+
28
class Dataset(Dataset):
    # NOTE(review): this class name shadows the `Dataset` imported from
    # torch.utils.data above; it still subclasses the torch Dataset because
    # the base expression is evaluated before the name is rebound
    def __init__(
        self,
        folder,
        image_size,
        exts = ['jpg', 'jpeg', 'png', 'tiff'],  # NOTE(review): mutable default, but only ever read
        convert_image_to_type = None
    ):
        """Recursively index images under *folder* and prepare the train-time transform."""
        super().__init__()
        self.folder = folder
        self.image_size = image_size
        # recursive glob for every requested extension
        self.paths = [p for ext in exts for p in Path(f'{folder}').glob(f'**/*.{ext}')]

        # optional mode conversion (e.g. to 'RGB'); identity when not requested
        convert_fn = partial(convert_image_to, convert_image_to_type) if exists(convert_image_to_type) else nn.Identity()

        self.transform = T.Compose([
            T.Lambda(convert_fn),
            T.Resize(image_size),
            T.RandomHorizontalFlip(),
            T.CenterCrop(image_size),
            T.ToTensor()
        ])

    def __len__(self):
        """Number of indexed image files."""
        return len(self.paths)

    def __getitem__(self, index):
        """Load and transform the image at *index* into a tensor."""
        path = self.paths[index]
        img = Image.open(path)
        return self.transform(img)
58
+
59
def get_images_dataloader(
    folder,
    *,
    batch_size,
    image_size,
    shuffle = True,
    cycle_dl = False,
    pin_memory = True
):
    """Build a DataLoader over the images in *folder*, optionally wrapped to iterate forever."""
    dataset = Dataset(folder, image_size)
    loader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        pin_memory = pin_memory
    )
    return cycle(loader) if cycle_dl else loader
imagen_pytorch/elucidated_imagen.py ADDED
@@ -0,0 +1,846 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import sqrt
2
+ from random import random
3
+ from functools import partial
4
+ from contextlib import contextmanager, nullcontext
5
+ from typing import List, Union
6
+ from collections import namedtuple
7
+ from tqdm.auto import tqdm
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from torch import nn, einsum
12
+ from torch.cuda.amp import autocast
13
+ from torch.nn.parallel import DistributedDataParallel
14
+ import torchvision.transforms as T
15
+
16
+ import kornia.augmentation as K
17
+
18
+ from einops import rearrange, repeat, reduce
19
+ from einops_exts import rearrange_many
20
+
21
+ from imagen_pytorch.imagen_pytorch import (
22
+ GaussianDiffusionContinuousTimes,
23
+ Unet,
24
+ NullUnet,
25
+ first,
26
+ exists,
27
+ identity,
28
+ maybe,
29
+ default,
30
+ cast_tuple,
31
+ cast_uint8_images_to_float,
32
+ is_float_dtype,
33
+ eval_decorator,
34
+ check_shape,
35
+ pad_tuple_to_length,
36
+ resize_image_to,
37
+ right_pad_dims_to,
38
+ module_device,
39
+ normalize_neg_one_to_one,
40
+ unnormalize_zero_to_one,
41
+ )
42
+
43
+ from imagen_pytorch.imagen_video.imagen_video import (
44
+ Unet3D,
45
+ resize_video_to
46
+ )
47
+
48
+ from imagen_pytorch.t5 import t5_encode_text, get_encoded_dim, DEFAULT_T5_NAME
49
+
50
+ # constants
51
+
52
# ordered hyperparameter names for the elucidated (EDM-style) sampler, one set per unet
Hparams_fields = [
    'num_sample_steps',
    'sigma_min',
    'sigma_max',
    'sigma_data',
    'rho',
    'P_mean',
    'P_std',
    'S_churn',
    'S_tmin',
    'S_tmax',
    'S_noise'
]

Hparams = namedtuple('Hparams', Hparams_fields)

# helper functions

def log(t, eps = 1e-20):
    """Numerically-safe natural log: clamps *t* to at least *eps* before taking the log."""
    return t.clamp(min = eps).log()
72
+
73
+ # main class
74
+
75
+ class ElucidatedImagen(nn.Module):
76
+ def __init__(
77
+ self,
78
+ unets,
79
+ *,
80
+ image_sizes, # for cascading ddpm, image size at each stage
81
+ text_encoder_name = DEFAULT_T5_NAME,
82
+ text_embed_dim = None,
83
+ channels = 3,
84
+ cond_drop_prob = 0.1,
85
+ random_crop_sizes = None,
86
+ lowres_sample_noise_level = 0.2, # in the paper, they present a new trick where they noise the lowres conditioning image, and at sample time, fix it to a certain level (0.1 or 0.3) - the unets are also made to be conditioned on this noise level
87
+ per_sample_random_aug_noise_level = False, # unclear when conditioning on augmentation noise level, whether each batch element receives a random aug noise value - turning off due to @marunine's find
88
+ condition_on_text = True,
89
+ auto_normalize_img = True, # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
90
+ dynamic_thresholding = True,
91
+ dynamic_thresholding_percentile = 0.95, # unsure what this was based on perusal of paper
92
+ only_train_unet_number = None,
93
+ lowres_noise_schedule = 'linear',
94
+ num_sample_steps = 32, # number of sampling steps
95
+ sigma_min = 0.002, # min noise level
96
+ sigma_max = 80, # max noise level
97
+ sigma_data = 0.5, # standard deviation of data distribution
98
+ rho = 7, # controls the sampling schedule
99
+ P_mean = -1.2, # mean of log-normal distribution from which noise is drawn for training
100
+ P_std = 1.2, # standard deviation of log-normal distribution from which noise is drawn for training
101
+ S_churn = 80, # parameters for stochastic sampling - depends on dataset, Table 5 in apper
102
+ S_tmin = 0.05,
103
+ S_tmax = 50,
104
+ S_noise = 1.003,
105
+ ):
106
+ super().__init__()
107
+
108
+ self.only_train_unet_number = only_train_unet_number
109
+
110
+ # conditioning hparams
111
+
112
+ self.condition_on_text = condition_on_text
113
+ self.unconditional = not condition_on_text
114
+
115
+ # channels
116
+
117
+ self.channels = channels
118
+
119
+ # automatically take care of ensuring that first unet is unconditional
120
+ # while the rest of the unets are conditioned on the low resolution image produced by previous unet
121
+
122
+ unets = cast_tuple(unets)
123
+ num_unets = len(unets)
124
+
125
+ # randomly cropping for upsampler training
126
+
127
+ self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets)
128
+ assert not exists(first(self.random_crop_sizes)), 'you should not need to randomly crop image during training for base unet, only for upsamplers - so pass in `random_crop_sizes = (None, 128, 256)` as example'
129
+
130
+ # lowres augmentation noise schedule
131
+
132
+ self.lowres_noise_schedule = GaussianDiffusionContinuousTimes(noise_schedule = lowres_noise_schedule)
133
+
134
+ # get text encoder
135
+
136
+ self.text_encoder_name = text_encoder_name
137
+ self.text_embed_dim = default(text_embed_dim, lambda: get_encoded_dim(text_encoder_name))
138
+
139
+ self.encode_text = partial(t5_encode_text, name = text_encoder_name)
140
+
141
+ # construct unets
142
+
143
+ self.unets = nn.ModuleList([])
144
+ self.unet_being_trained_index = -1 # keeps track of which unet is being trained at the moment
145
+
146
+ for ind, one_unet in enumerate(unets):
147
+ assert isinstance(one_unet, (Unet, Unet3D, NullUnet))
148
+ is_first = ind == 0
149
+
150
+ one_unet = one_unet.cast_model_parameters(
151
+ lowres_cond = not is_first,
152
+ cond_on_text = self.condition_on_text,
153
+ text_embed_dim = self.text_embed_dim if self.condition_on_text else None,
154
+ channels = self.channels,
155
+ channels_out = self.channels
156
+ )
157
+
158
+ self.unets.append(one_unet)
159
+
160
+ # determine whether we are training on images or video
161
+
162
+ is_video = any([isinstance(unet, Unet3D) for unet in self.unets])
163
+ self.is_video = is_video
164
+
165
+ self.right_pad_dims_to_datatype = partial(rearrange, pattern = ('b -> b 1 1 1' if not is_video else 'b -> b 1 1 1 1'))
166
+ self.resize_to = resize_video_to if is_video else resize_image_to
167
+
168
+ # unet image sizes
169
+
170
+ self.image_sizes = cast_tuple(self.image_sizes)
171
+ assert num_unets == len(self.image_sizes), f'you did not supply the correct number of u-nets ({len(self.unets)}) for resolutions {self.image_sizes}'
172
+
173
+ self.sample_channels = cast_tuple(self.channels, num_unets)
174
+
175
+ # cascading ddpm related stuff
176
+
177
+ lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
178
+ assert lowres_conditions == (False, *((True,) * (num_unets - 1))), 'the first unet must be unconditioned (by low resolution image), and the rest of the unets must have `lowres_cond` set to True'
179
+
180
+ self.lowres_sample_noise_level = lowres_sample_noise_level
181
+ self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level
182
+
183
+ # classifier free guidance
184
+
185
+ self.cond_drop_prob = cond_drop_prob
186
+ self.can_classifier_guidance = cond_drop_prob > 0.
187
+
188
+ # normalize and unnormalize image functions
189
+
190
+ self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
191
+ self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity
192
+ self.input_image_range = (0. if auto_normalize_img else -1., 1.)
193
+
194
+ # dynamic thresholding
195
+
196
+ self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets)
197
+ self.dynamic_thresholding_percentile = dynamic_thresholding_percentile
198
+
199
+ # elucidating parameters
200
+
201
+ hparams = [
202
+ num_sample_steps,
203
+ sigma_min,
204
+ sigma_max,
205
+ sigma_data,
206
+ rho,
207
+ P_mean,
208
+ P_std,
209
+ S_churn,
210
+ S_tmin,
211
+ S_tmax,
212
+ S_noise,
213
+ ]
214
+
215
+ hparams = [cast_tuple(hp, num_unets) for hp in hparams]
216
+ self.hparams = [Hparams(*unet_hp) for unet_hp in zip(*hparams)]
217
+
218
+ # one temp parameter for keeping track of device
219
+
220
+ self.register_buffer('_temp', torch.tensor([0.]), persistent = False)
221
+
222
+ # default to device of unets passed in
223
+
224
+ self.to(next(self.unets.parameters()).device)
225
+
226
+ def force_unconditional_(self):
227
+ self.condition_on_text = False
228
+ self.unconditional = True
229
+
230
+ for unet in self.unets:
231
+ unet.cond_on_text = False
232
+
233
    @property
    def device(self):
        # device is tracked via the non-persistent `_temp` buffer registered in __init__
        return self._temp.device
236
+
237
    def get_unet(self, unet_number):
        """Return the 1-indexed unet, moving it alone to this module's device (others to cpu)."""
        assert 0 < unet_number <= len(self.unets)
        index = unet_number - 1

        # demote the ModuleList to a plain python list so that moving one unet
        # does not drag the whole registered module tree along
        if isinstance(self.unets, nn.ModuleList):
            unets_list = [unet for unet in self.unets]
            delattr(self, 'unets')
            self.unets = unets_list

        # only reshuffle devices when switching to a different unet
        if index != self.unet_being_trained_index:
            for unet_index, unet in enumerate(self.unets):
                unet.to(self.device if unet_index == index else 'cpu')

        self.unet_being_trained_index = index
        return self.unets[index]
252
+
253
    def reset_unets_all_one_device(self, device = None):
        """Re-wrap the unets in a ModuleList and move all of them onto one device."""
        device = default(device, self.device)
        self.unets = nn.ModuleList([*self.unets])
        self.unets.to(device)

        # -1 sentinel: no single unet is currently pinned for training
        self.unet_being_trained_index = -1
259
+
260
+ @contextmanager
261
+ def one_unet_in_gpu(self, unet_number = None, unet = None):
262
+ assert exists(unet_number) ^ exists(unet)
263
+
264
+ if exists(unet_number):
265
+ unet = self.unets[unet_number - 1]
266
+
267
+ devices = [module_device(unet) for unet in self.unets]
268
+ self.unets.cpu()
269
+ unet.to(self.device)
270
+
271
+ yield
272
+
273
+ for unet, device in zip(self.unets, devices):
274
+ unet.to(device)
275
+
276
+ # overriding state dict functions
277
+
278
    def state_dict(self, *args, **kwargs):
        # consolidate unets back into a ModuleList so their params appear in the state dict
        self.reset_unets_all_one_device()
        return super().state_dict(*args, **kwargs)

    def load_state_dict(self, *args, **kwargs):
        # same consolidation before loading, so parameter keys line up
        self.reset_unets_all_one_device()
        return super().load_state_dict(*args, **kwargs)
285
+
286
+ # dynamic thresholding
287
+
288
+ def threshold_x_start(self, x_start, dynamic_threshold = True):
289
+ if not dynamic_threshold:
290
+ return x_start.clamp(-1., 1.)
291
+
292
+ s = torch.quantile(
293
+ rearrange(x_start, 'b ... -> b (...)').abs(),
294
+ self.dynamic_thresholding_percentile,
295
+ dim = -1
296
+ )
297
+
298
+ s.clamp_(min = 1.)
299
+ s = right_pad_dims_to(x_start, s)
300
+ return x_start.clamp(-s, s) / s
301
+
302
+ # derived preconditioning params - Table 1
303
+
304
+ def c_skip(self, sigma_data, sigma):
305
+ return (sigma_data ** 2) / (sigma ** 2 + sigma_data ** 2)
306
+
307
+ def c_out(self, sigma_data, sigma):
308
+ return sigma * sigma_data * (sigma_data ** 2 + sigma ** 2) ** -0.5
309
+
310
+ def c_in(self, sigma_data, sigma):
311
+ return 1 * (sigma ** 2 + sigma_data ** 2) ** -0.5
312
+
313
+ def c_noise(self, sigma):
314
+ return log(sigma) * 0.25
315
+
316
+ # preconditioned network output
317
+ # equation (7) in the paper
318
+
319
    def preconditioned_network_forward(
        self,
        unet_forward,
        noised_images,
        sigma,
        *,
        sigma_data,
        clamp = False,
        dynamic_threshold = True,
        **kwargs
    ):
        """Denoiser D(x; sigma) per equation (7) of the EDM paper: c_skip*x + c_out*F(c_in*x, c_noise)."""
        batch, device = noised_images.shape[0], noised_images.device

        # allow a python float sigma; broadcast it over the batch
        if isinstance(sigma, float):
            sigma = torch.full((batch,), sigma, device = device)

        # pad sigma to image (or video) rank for elementwise scaling
        padded_sigma = self.right_pad_dims_to_datatype(sigma)

        net_out = unet_forward(
            self.c_in(sigma_data, padded_sigma) * noised_images,
            self.c_noise(sigma),
            **kwargs
        )

        out = self.c_skip(sigma_data, padded_sigma) * noised_images + self.c_out(sigma_data, padded_sigma) * net_out

        if not clamp:
            return out

        return self.threshold_x_start(out, dynamic_threshold)
349
+
350
+ # sampling
351
+
352
+ # sample schedule
353
+ # equation (5) in the paper
354
+
355
    def sample_schedule(
        self,
        num_sample_steps,
        rho,
        sigma_min,
        sigma_max
    ):
        """Karras sigma schedule, equation (5) of the EDM paper, with a trailing 0 appended."""
        N = num_sample_steps
        inv_rho = 1 / rho

        steps = torch.arange(num_sample_steps, device = self.device, dtype = torch.float32)
        # interpolate between sigma_max and sigma_min in rho-warped space
        sigmas = (sigma_max ** inv_rho + steps / (N - 1) * (sigma_min ** inv_rho - sigma_max ** inv_rho)) ** rho

        sigmas = F.pad(sigmas, (0, 1), value = 0.) # last step is sigma value of 0.
        return sigmas
370
+
371
    @torch.no_grad()
    def one_unet_sample(
        self,
        unet,
        shape,
        *,
        unet_number,
        clamp = True,
        dynamic_threshold = True,
        cond_scale = 1.,
        use_tqdm = True,
        inpaint_images = None,
        inpaint_masks = None,
        inpaint_resample_times = 5,
        init_images = None,
        skip_steps = None,
        sigma_min = None,
        sigma_max = None,
        **kwargs
    ):
        """Stochastic EDM sampler (Algorithm 2) for a single unet stage, with optional
        repaint-style inpainting and image initialization."""
        # get specific sampling hyperparameters for unet

        hp = self.hparams[unet_number - 1]

        sigma_min = default(sigma_min, hp.sigma_min)
        sigma_max = default(sigma_max, hp.sigma_max)

        # get the schedule, which is returned as (sigma, gamma) tuple, and pair up with the next sigma and gamma

        sigmas = self.sample_schedule(hp.num_sample_steps, hp.rho, sigma_min, sigma_max)

        # churn (gamma) is only applied inside the [S_tmin, S_tmax] sigma band
        gammas = torch.where(
            (sigmas >= hp.S_tmin) & (sigmas <= hp.S_tmax),
            min(hp.S_churn / hp.num_sample_steps, sqrt(2) - 1),
            0.
        )

        sigmas_and_gammas = list(zip(sigmas[:-1], sigmas[1:], gammas[:-1]))

        # images is noise at the beginning

        init_sigma = sigmas[0]

        images = init_sigma * torch.randn(shape, device = self.device)

        # initializing with an image

        if exists(init_images):
            images += init_images

        # keeping track of x0, for self conditioning if needed

        x_start = None

        # prepare inpainting images and mask

        has_inpainting = exists(inpaint_images) and exists(inpaint_masks)
        resample_times = inpaint_resample_times if has_inpainting else 1

        if has_inpainting:
            inpaint_images = self.normalize_img(inpaint_images)
            inpaint_images = self.resize_to(inpaint_images, shape[-1])
            inpaint_masks = self.resize_to(rearrange(inpaint_masks, 'b ... -> b 1 ...').float(), shape[-1]).bool()

        # unet kwargs

        unet_kwargs = dict(
            sigma_data = hp.sigma_data,
            clamp = clamp,
            dynamic_threshold = dynamic_threshold,
            cond_scale = cond_scale,
            **kwargs
        )

        # gradually denoise

        initial_step = default(skip_steps, 0)
        sigmas_and_gammas = sigmas_and_gammas[initial_step:]

        total_steps = len(sigmas_and_gammas)

        for ind, (sigma, sigma_next, gamma) in tqdm(enumerate(sigmas_and_gammas), total = total_steps, desc = 'sampling time step', disable = not use_tqdm):
            is_last_timestep = ind == (total_steps - 1)

            sigma, sigma_next, gamma = map(lambda t: t.item(), (sigma, sigma_next, gamma))

            # inner loop implements repaint-style resampling (r > 0 only when inpainting)
            for r in reversed(range(resample_times)):
                is_last_resample_step = r == 0

                eps = hp.S_noise * torch.randn(shape, device = self.device) # stochastic sampling

                # raise the noise level slightly (churn), then inject matching noise
                sigma_hat = sigma + gamma * sigma
                added_noise = sqrt(sigma_hat ** 2 - sigma ** 2) * eps

                images_hat = images + added_noise

                self_cond = x_start if unet.self_cond else None

                if has_inpainting:
                    images_hat = images_hat * ~inpaint_masks + (inpaint_images + added_noise) * inpaint_masks

                model_output = self.preconditioned_network_forward(
                    unet.forward_with_cond_scale,
                    images_hat,
                    sigma_hat,
                    self_cond = self_cond,
                    **unet_kwargs
                )

                # Euler step along d = (x - D(x)) / sigma
                denoised_over_sigma = (images_hat - model_output) / sigma_hat

                images_next = images_hat + (sigma_next - sigma_hat) * denoised_over_sigma

                # second order correction, if not the last timestep

                if sigma_next != 0:
                    self_cond = model_output if unet.self_cond else None

                    model_output_next = self.preconditioned_network_forward(
                        unet.forward_with_cond_scale,
                        images_next,
                        sigma_next,
                        self_cond = self_cond,
                        **unet_kwargs
                    )

                    # Heun (trapezoidal) correction averaging the two slopes
                    denoised_prime_over_sigma = (images_next - model_output_next) / sigma_next
                    images_next = images_hat + 0.5 * (sigma_next - sigma_hat) * (denoised_over_sigma + denoised_prime_over_sigma)

                images = images_next

                if has_inpainting and not (is_last_resample_step or is_last_timestep):
                    # renoise in repaint and then resample
                    repaint_noise = torch.randn(shape, device = self.device)
                    images = images + (sigma - sigma_next) * repaint_noise

            x_start = model_output # save model output for self conditioning

        images = images.clamp(-1., 1.)

        if has_inpainting:
            # paste the known (masked) region back in at the end
            images = images * ~inpaint_masks + inpaint_images * inpaint_masks

        return self.unnormalize_img(images)
515
+
516
    @torch.no_grad()
    @eval_decorator
    def sample(
        self,
        texts: List[str] = None,
        text_masks = None,
        text_embeds = None,
        cond_images = None,
        inpaint_images = None,
        inpaint_masks = None,
        inpaint_resample_times = 5,
        init_images = None,
        skip_steps = None,
        sigma_min = None,
        sigma_max = None,
        video_frames = None,
        batch_size = 1,
        cond_scale = 1.,
        lowres_sample_noise_level = None,
        start_at_unet_number = 1,
        start_image_or_video = None,
        stop_at_unet_number = None,
        return_all_unet_outputs = False,
        return_pil_images = False,
        use_tqdm = True,
        device = None,
    ):
        """Run the full cascade: encode text (if any), then sample each unet in turn,
        feeding each stage's output as the low-resolution conditioning of the next."""
        device = default(device, self.device)
        self.reset_unets_all_one_device(device = device)

        cond_images = maybe(cast_uint8_images_to_float)(cond_images)

        # encode raw text on the fly when embeddings were not supplied
        if exists(texts) and not exists(text_embeds) and not self.unconditional:
            assert all([*map(len, texts)]), 'text cannot be empty'

            with autocast(enabled = False):
                text_embeds, text_masks = self.encode_text(texts, return_attn_mask = True)

            text_embeds, text_masks = map(lambda t: t.to(device), (text_embeds, text_masks))

        if not self.unconditional:
            assert exists(text_embeds), 'text must be passed in if the network was not trained without text `condition_on_text` must be set to `False` when training'

            # derive a mask from non-zero embedding rows when none was given
            text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim = -1))
            batch_size = text_embeds.shape[0]

        if exists(inpaint_images):
            if self.unconditional:
                if batch_size == 1: # assume researcher wants to broadcast along inpainted images
                    batch_size = inpaint_images.shape[0]

            assert inpaint_images.shape[0] == batch_size, 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=<int>)``'
            assert not (self.condition_on_text and inpaint_images.shape[0] != text_embeds.shape[0]), 'number of inpainting images must be equal to the number of text to be conditioned on'

        assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into imagen if specified'
        assert not (not self.condition_on_text and exists(text_embeds)), 'imagen specified not to be conditioned on text, yet it is presented'
        assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'

        assert not (exists(inpaint_images) ^ exists(inpaint_masks)), 'inpaint images and masks must be both passed in to do inpainting'

        outputs = []

        is_cuda = next(self.parameters()).is_cuda
        device = next(self.parameters()).device

        lowres_sample_noise_level = default(lowres_sample_noise_level, self.lowres_sample_noise_level)

        num_unets = len(self.unets)
        cond_scale = cast_tuple(cond_scale, num_unets)

        # handle video and frame dimension

        assert not (self.is_video and not exists(video_frames)), 'video_frames must be passed in on sample time if training on video'

        frame_dims = (video_frames,) if self.is_video else tuple()

        # initializing with an image or video

        init_images = cast_tuple(init_images, num_unets)
        init_images = [maybe(self.normalize_img)(init_image) for init_image in init_images]

        skip_steps = cast_tuple(skip_steps, num_unets)

        sigma_min = cast_tuple(sigma_min, num_unets)
        sigma_max = cast_tuple(sigma_max, num_unets)

        # handle starting at a unet greater than 1, for training only-upscaler training

        if start_at_unet_number > 1:
            assert start_at_unet_number <= num_unets, 'must start a unet that is less than the total number of unets'
            assert not exists(stop_at_unet_number) or start_at_unet_number <= stop_at_unet_number
            assert exists(start_image_or_video), 'starting image or video must be supplied if only doing upscaling'

            # pretend the supplied image came from the previous stage
            prev_image_size = self.image_sizes[start_at_unet_number - 2]
            img = self.resize_to(start_image_or_video, prev_image_size)

        # go through each unet in cascade

        for unet_number, unet, channel, image_size, unet_hparam, dynamic_threshold, unet_cond_scale, unet_init_images, unet_skip_steps, unet_sigma_min, unet_sigma_max in tqdm(zip(range(1, num_unets + 1), self.unets, self.sample_channels, self.image_sizes, self.hparams, self.dynamic_thresholding, cond_scale, init_images, skip_steps, sigma_min, sigma_max), disable = not use_tqdm):
            if unet_number < start_at_unet_number:
                continue

            assert not isinstance(unet, NullUnet), 'cannot sample from null unet'

            # on cuda, swap only the active unet onto the gpu for this stage
            context = self.one_unet_in_gpu(unet = unet) if is_cuda else nullcontext()

            with context:
                lowres_cond_img = lowres_noise_times = None

                shape = (batch_size, channel, *frame_dims, image_size, image_size)

                if unet.lowres_cond:
                    # noise the previous stage's output to the fixed conditioning level
                    lowres_noise_times = self.lowres_noise_schedule.get_times(batch_size, lowres_sample_noise_level, device = device)

                    lowres_cond_img = self.resize_to(img, image_size)
                    lowres_cond_img = self.normalize_img(lowres_cond_img)

                    lowres_cond_img, _ = self.lowres_noise_schedule.q_sample(x_start = lowres_cond_img, t = lowres_noise_times, noise = torch.randn_like(lowres_cond_img))

                if exists(unet_init_images):
                    unet_init_images = self.resize_to(unet_init_images, image_size)

                shape = (batch_size, self.channels, *frame_dims, image_size, image_size)

                img = self.one_unet_sample(
                    unet,
                    shape,
                    unet_number = unet_number,
                    text_embeds = text_embeds,
                    text_mask = text_masks,
                    cond_images = cond_images,
                    inpaint_images = inpaint_images,
                    inpaint_masks = inpaint_masks,
                    inpaint_resample_times = inpaint_resample_times,
                    init_images = unet_init_images,
                    skip_steps = unet_skip_steps,
                    sigma_min = unet_sigma_min,
                    sigma_max = unet_sigma_max,
                    cond_scale = unet_cond_scale,
                    lowres_cond_img = lowres_cond_img,
                    lowres_noise_times = lowres_noise_times,
                    dynamic_threshold = dynamic_threshold,
                    use_tqdm = use_tqdm
                )

                outputs.append(img)

            if exists(stop_at_unet_number) and stop_at_unet_number == unet_number:
                break

        output_index = -1 if not return_all_unet_outputs else slice(None) # either return last unet output or all unet outputs

        if not return_pil_images:
            return outputs[output_index]

        if not return_all_unet_outputs:
            outputs = outputs[-1:]

        assert not self.is_video, 'automatically converting video tensor to video file for saving is not built yet'

        pil_images = list(map(lambda img: list(map(T.ToPILImage(), img.unbind(dim = 0))), outputs))

        return pil_images[output_index] # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png)
679
+
680
+ # training
681
+
682
def loss_weight(self, sigma_data, sigma):
    """EDM (Karras et al. 2022) loss weighting: (sigma^2 + sigma_data^2) / (sigma * sigma_data)^2."""
    numerator = sigma ** 2 + sigma_data ** 2
    scale = (sigma * sigma_data) ** -2
    return numerator * scale
684
+
685
def noise_distribution(self, P_mean, P_std, batch_size):
    """Sample per-example noise levels: sigma = exp(P_mean + P_std * N(0, 1))."""
    gaussian = torch.randn((batch_size,), device = self.device)
    return torch.exp(P_mean + P_std * gaussian)
687
+
688
def forward(
    self,
    images,
    unet: Union[Unet, Unet3D, NullUnet, DistributedDataParallel] = None,
    texts: List[str] = None,
    text_embeds = None,
    text_masks = None,
    unet_number = None,
    cond_images = None
):
    """One training step for `unet_number`; returns the mean EDM-weighted MSE loss.

    Fixes over the original:
    - the "only train unet" assertion message was missing its f-string prefix
    - the square-image assertion message indexed shape[2] (frames, for video)
      instead of shape[-2] (height)
    - `self_cond` read the unet module itself (always truthy) in the non-DDP
      branch instead of its `.self_cond` flag, enabling self-conditioning 50%
      of the time even for unets configured without it
    """
    assert images.shape[-1] == images.shape[-2], f'the images you pass in must be a square, but received dimensions of {images.shape[-2]}, {images.shape[-1]}'
    assert not (len(self.unets) > 1 and not exists(unet_number)), f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)'
    unet_number = default(unet_number, 1)
    assert not exists(self.only_train_unet_number) or self.only_train_unet_number == unet_number, f'you can only train on unet #{self.only_train_unet_number}'

    images = cast_uint8_images_to_float(images)
    cond_images = maybe(cast_uint8_images_to_float)(cond_images)

    assert is_float_dtype(images.dtype), f'images tensor needs to be floats but {images.dtype} dtype found instead'

    unet_index = unet_number - 1

    unet = default(unet, lambda: self.get_unet(unet_number))

    assert not isinstance(unet, NullUnet), 'null unet cannot and should not be trained'

    # per-unet hyperparameters and target resolution for this stage of the cascade
    target_image_size = self.image_sizes[unet_index]
    random_crop_size = self.random_crop_sizes[unet_index]
    prev_image_size = self.image_sizes[unet_index - 1] if unet_index > 0 else None
    hp = self.hparams[unet_index]

    batch_size, c, *_, h, w, device, is_video = *images.shape, images.device, (images.ndim == 5)

    frames = images.shape[2] if is_video else None

    check_shape(images, 'b c ...', c = self.channels)

    assert h >= target_image_size and w >= target_image_size

    # encode raw text captions when embeddings were not precomputed
    if exists(texts) and not exists(text_embeds) and not self.unconditional:
        assert all([*map(len, texts)]), 'text cannot be empty'
        assert len(texts) == len(images), 'number of text captions does not match up with the number of images given'

        with autocast(enabled = False):
            text_embeds, text_masks = self.encode_text(texts, return_attn_mask = True)

        text_embeds, text_masks = map(lambda t: t.to(images.device), (text_embeds, text_masks))

    if not self.unconditional:
        text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim = -1))

    assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into decoder if specified'
    assert not (not self.condition_on_text and exists(text_embeds)), 'decoder specified not to be conditioned on text, yet it is presented'

    assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'

    # build the low-resolution conditioning image by downsampling then re-upsampling
    lowres_cond_img = lowres_aug_times = None
    if exists(prev_image_size):
        lowres_cond_img = self.resize_to(images, prev_image_size, clamp_range = self.input_image_range)
        lowres_cond_img = self.resize_to(lowres_cond_img, target_image_size, clamp_range = self.input_image_range)

        if self.per_sample_random_aug_noise_level:
            lowres_aug_times = self.lowres_noise_schedule.sample_random_times(batch_size, device = device)
        else:
            lowres_aug_time = self.lowres_noise_schedule.sample_random_times(1, device = device)
            lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b = batch_size)

    images = self.resize_to(images, target_image_size)

    # normalize to [-1, 1]

    images = self.normalize_img(images)
    lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)

    # random cropping during training
    # for upsamplers

    if exists(random_crop_size):
        aug = K.RandomCrop((random_crop_size, random_crop_size), p = 1.)

        if is_video:
            images, lowres_cond_img = rearrange_many((images, lowres_cond_img), 'b c f h w -> (b f) c h w')

        # make sure low res conditioner and image both get augmented the same way
        # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop
        images = aug(images)
        lowres_cond_img = aug(lowres_cond_img, params = aug._params)

        if is_video:
            images, lowres_cond_img = rearrange_many((images, lowres_cond_img), '(b f) c h w -> b c f h w', f = frames)

    # noise the lowres conditioning image
    # at sample time, they then fix the noise level of 0.1 - 0.3

    lowres_cond_img_noisy = None
    if exists(lowres_cond_img):
        lowres_cond_img_noisy, _ = self.lowres_noise_schedule.q_sample(x_start = lowres_cond_img, t = lowres_aug_times, noise = torch.randn_like(lowres_cond_img))

    # get the sigmas

    sigmas = self.noise_distribution(hp.P_mean, hp.P_std, batch_size)
    padded_sigmas = self.right_pad_dims_to_datatype(sigmas)

    # noise

    noise = torch.randn_like(images)
    noised_images = images + padded_sigmas * noise # alphas are 1. in the paper

    # unet kwargs

    unet_kwargs = dict(
        sigma_data = hp.sigma_data,
        text_embeds = text_embeds,
        text_mask = text_masks,
        cond_images = cond_images,
        lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_aug_times),
        lowres_cond_img = lowres_cond_img_noisy,
        cond_drop_prob = self.cond_drop_prob,
    )

    # self conditioning - https://arxiv.org/abs/2208.04202 - training will be 25% slower

    # Because 'unet' can be an instance of DistributedDataParallel coming from the
    # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to
    # access the member 'module' of the wrapped unet instance.
    # FIX: the non-DDP branch must read the boolean flag, not the module itself
    self_cond = unet.module.self_cond if isinstance(unet, DistributedDataParallel) else unet.self_cond

    if self_cond and random() < 0.5:
        with torch.no_grad():
            pred_x0 = self.preconditioned_network_forward(
                unet.forward,
                noised_images,
                sigmas,
                **unet_kwargs
            ).detach()

        unet_kwargs = {**unet_kwargs, 'self_cond': pred_x0}

    # get prediction

    denoised_images = self.preconditioned_network_forward(
        unet.forward,
        noised_images,
        sigmas,
        **unet_kwargs
    )

    # losses

    losses = F.mse_loss(denoised_images, images, reduction = 'none')
    losses = reduce(losses, 'b ... -> b', 'mean')

    # loss weighting

    losses = losses * self.loss_weight(hp.sigma_data, sigmas)

    # return average loss

    return losses.mean()
imagen_pytorch/imagen_pytorch.py ADDED
@@ -0,0 +1,2515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import copy
3
+ from random import random
4
+ from typing import List, Union
5
+ from tqdm.auto import tqdm
6
+ from functools import partial, wraps
7
+ from contextlib import contextmanager, nullcontext
8
+ from collections import namedtuple
9
+ from pathlib import Path
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch.nn.parallel import DistributedDataParallel
14
+ from torch import nn, einsum
15
+ from torch.cuda.amp import autocast
16
+ from torch.special import expm1
17
+ import torchvision.transforms as T
18
+
19
+ import kornia.augmentation as K
20
+
21
+ from einops import rearrange, repeat, reduce
22
+ from einops.layers.torch import Rearrange, Reduce
23
+ from einops_exts import rearrange_many, repeat_many, check_shape
24
+ from einops_exts.torch import EinopsToAndFrom
25
+
26
+ from imagen_pytorch.t5 import t5_encode_text, get_encoded_dim, DEFAULT_T5_NAME
27
+
28
+ from imagen_pytorch.imagen_video.imagen_video import Unet3D, resize_video_to
29
+
30
+ # helper functions
31
+
32
def exists(val):
    """True for any value except None (falsy values like 0 and '' still exist)."""
    return not (val is None)
34
+
35
def identity(t, *args, **kwargs):
    """Return the first argument unchanged; extra arguments are accepted and ignored
    so this can stand in for any callable."""
    return t
37
+
38
def first(arr, d = None):
    """First element of `arr`, or the default `d` when `arr` is empty."""
    return arr[0] if len(arr) != 0 else d
42
+
43
def maybe(fn):
    """Wrap a unary `fn` so that None passes straight through instead of being applied."""
    @wraps(fn)
    def inner(x):
        return fn(x) if exists(x) else x
    return inner
50
+
51
def once(fn):
    """Return a wrapper that invokes `fn` only on the first call; later calls are no-ops."""
    has_run = False

    @wraps(fn)
    def inner(x):
        nonlocal has_run
        if has_run:
            return
        has_run = True
        return fn(x)

    return inner
61
+
62
+ print_once = once(print)
63
+
64
def default(val, d):
    """Return `val` when it exists; otherwise the default `d`, calling it first if callable
    (lets callers pass lazily-evaluated defaults)."""
    if not exists(val):
        return d() if callable(d) else d
    return val
68
+
69
def cast_tuple(val, length = None):
    """Coerce `val` to a tuple; scalars are repeated `length` times (default 1),
    and an explicit `length` is asserted against the result."""
    if isinstance(val, list):
        val = tuple(val)

    if isinstance(val, tuple):
        output = val
    else:
        output = (val,) * default(length, 1)

    if exists(length):
        assert len(output) == length

    return output
79
+
80
def is_float_dtype(dtype):
    """True for any floating-point torch dtype this codebase supports."""
    float_dtypes = (torch.float64, torch.float32, torch.float16, torch.bfloat16)
    return dtype in float_dtypes
82
+
83
def cast_uint8_images_to_float(images):
    """Convert uint8 images in [0, 255] to floats in [0, 1]; any other dtype passes through."""
    if images.dtype == torch.uint8:
        return images / 255
    return images
87
+
88
def module_device(module):
    """Device a module lives on, inferred from its first parameter."""
    first_param = next(module.parameters())
    return first_param.device
90
+
91
def zero_init_(m):
    """In-place: zero a layer's weight, and its bias when the layer has one."""
    nn.init.zeros_(m.weight)
    if exists(m.bias):
        nn.init.zeros_(m.bias)
95
+
96
def eval_decorator(fn):
    """Decorator: run `fn` with the model switched to eval mode, then restore the
    model's prior training mode.

    Fix over the original: `functools.wraps` (already imported at file top)
    preserves the wrapped function's name/docstring so decorated sampling
    methods remain introspectable.
    """
    @wraps(fn)
    def inner(model, *args, **kwargs):
        was_training = model.training
        model.eval()
        out = fn(model, *args, **kwargs)
        model.train(was_training)
        return out
    return inner
104
+
105
def pad_tuple_to_length(t, length, fillvalue = None):
    """Right-pad tuple `t` with `fillvalue` up to `length`; no-op when already long enough."""
    missing = length - len(t)
    if missing <= 0:
        return t
    return t + (fillvalue,) * missing
110
+
111
+ # helper classes
112
+
113
class Identity(nn.Module):
    """nn.Module that returns its input unchanged; constructor and forward accept
    (and discard) arbitrary extra arguments so it can replace any layer."""

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x
119
+
120
+ # tensor helpers
121
+
122
def log(t, eps: float = 1e-12):
    """Numerically safe log: clamps the input below at `eps` before taking the log."""
    clamped = t.clamp(min = eps)
    return torch.log(clamped)
124
+
125
def l2norm(t):
    """Normalize `t` to unit L2 norm along its last dimension."""
    return F.normalize(t, p = 2, dim = -1)
127
+
128
def right_pad_dims_to(x, t):
    """Append trailing singleton dims to `t` until it has as many dims as `x`
    (so `t` broadcasts against `x`); returns `t` untouched when already wide enough."""
    missing = x.ndim - t.ndim
    if missing <= 0:
        return t
    return t.view(*t.shape, *((1,) * missing))
133
+
134
def masked_mean(t, *, dim, mask = None):
    """Mean of `t` over `dim`; when a boolean `mask` (b n) is given, only
    masked-in positions contribute to the average."""
    if not exists(mask):
        return t.mean(dim = dim)

    expanded_mask = rearrange(mask, 'b n -> b n 1')
    zeroed = t.masked_fill(~expanded_mask, 0.)
    denom = mask.sum(dim = dim, keepdim = True)

    # clamp keeps fully-masked rows from dividing by zero
    return zeroed.sum(dim = dim) / denom.clamp(min = 1e-5)
143
+
144
def resize_image_to(
    image,
    target_image_size,
    clamp_range = None
):
    """Nearest-neighbor resize of a square image batch to `target_image_size`,
    optionally clamped to `clamp_range`; returns the input untouched when
    already at the target size."""
    if image.shape[-1] == target_image_size:
        return image

    resized = F.interpolate(image, target_image_size, mode = 'nearest')

    if exists(clamp_range):
        resized = resized.clamp(*clamp_range)

    return resized
160
+
161
+ # image normalization functions
162
+ # ddpms expect images to be in the range of -1 to 1
163
+
164
def normalize_neg_one_to_one(img):
    """Map an image from [0, 1] to the [-1, 1] range ddpms expect."""
    return 2 * img - 1
166
+
167
def unnormalize_zero_to_one(normed_img):
    """Map an image from [-1, 1] back to [0, 1]."""
    return 0.5 * (normed_img + 1)
169
+
170
+ # classifier free guidance functions
171
+
172
def prob_mask_like(shape, prob, device):
    """Boolean mask of `shape` whose entries are True with probability `prob`
    (used for classifier-free guidance condition dropout)."""
    if prob == 1:
        return torch.ones(shape, device = device, dtype = torch.bool)
    if prob == 0:
        return torch.zeros(shape, device = device, dtype = torch.bool)
    # uniform(0, 1) < prob gives Bernoulli(prob) samples
    return torch.zeros(shape, device = device).float().uniform_(0, 1) < prob
179
+
180
+ # gaussian diffusion with continuous time helper functions and classes
181
+ # large part of this was thanks to @crowsonkb at https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/utils.py
182
+
183
# log SNR for the "linear" beta schedule in continuous time, t in [0, 1]
# (TorchScript-compiled, so only jit-scriptable constructs may be used here)
@torch.jit.script
def beta_linear_log_snr(t):
    return -torch.log(expm1(1e-4 + 10 * (t ** 2)))
186
+
187
# log SNR for the cosine schedule; `s` is the small offset from the cosine-schedule paper
# (TorchScript-compiled, so only jit-scriptable constructs may be used here)
@torch.jit.script
def alpha_cosine_log_snr(t, s: float = 0.008):
    return -log((torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** -2) - 1, eps = 1e-5) # not sure if this accounts for beta being clipped to 0.999 in discrete version
190
+
191
def log_snr_to_alpha_sigma(log_snr):
    """Convert a log SNR into (alpha, sigma) via sigmoid; alpha^2 + sigma^2 = 1."""
    alpha = torch.sqrt(torch.sigmoid(log_snr))
    sigma = torch.sqrt(torch.sigmoid(-log_snr))
    return alpha, sigma
193
+
194
class GaussianDiffusionContinuousTimes(nn.Module):
    """Continuous-time Gaussian diffusion in the variational-diffusion-model style.

    Times live in [0, 1] (1 = pure noise, 0 = data); the schedule is expressed
    through a log SNR function selected by `noise_schedule`.
    """

    def __init__(self, *, noise_schedule, timesteps = 1000):
        super().__init__()

        if noise_schedule == "linear":
            self.log_snr = beta_linear_log_snr
        elif noise_schedule == "cosine":
            self.log_snr = alpha_cosine_log_snr
        else:
            raise ValueError(f'invalid noise schedule {noise_schedule}')

        # number of discretization steps used at sampling time only
        self.num_timesteps = timesteps

    def get_times(self, batch_size, noise_level, *, device):
        # constant time vector, e.g. to fix the lowres conditioning noise level
        return torch.full((batch_size,), noise_level, device = device, dtype = torch.float32)

    def sample_random_times(self, batch_size, max_thres = 0.999, *, device):
        # uniform training times in [0, max_thres)
        return torch.zeros((batch_size,), device = device).float().uniform_(0, max_thres)

    def get_condition(self, times):
        # map times to the log SNR the unet is conditioned on (None passes through)
        return maybe(self.log_snr)(times)

    def get_sampling_timesteps(self, batch, *, device):
        # list of (t, t_next) pairs walking from 1 down to 0 in num_timesteps steps
        times = torch.linspace(1., 0., self.num_timesteps + 1, device = device)
        times = repeat(times, 't -> b t', b = batch)
        times = torch.stack((times[:, :-1], times[:, 1:]), dim = 0)
        times = times.unbind(dim = -1)
        return times

    def q_posterior(self, x_start, x_t, t, *, t_next = None):
        # posterior q(x_{t_next} | x_t, x_start); t_next defaults to one step earlier
        t_next = default(t_next, lambda: (t - 1. / self.num_timesteps).clamp(min = 0.))

        """ https://openreview.net/attachment?id=2LdBqxc1Yv&name=supplementary_material """
        log_snr = self.log_snr(t)
        log_snr_next = self.log_snr(t_next)
        log_snr, log_snr_next = map(partial(right_pad_dims_to, x_t), (log_snr, log_snr_next))

        alpha, sigma = log_snr_to_alpha_sigma(log_snr)
        alpha_next, sigma_next = log_snr_to_alpha_sigma(log_snr_next)

        # c - as defined near eq 33
        c = -expm1(log_snr - log_snr_next)
        posterior_mean = alpha_next * (x_t * (1 - c) / alpha + c * x_start)

        # following (eq. 33)
        posterior_variance = (sigma_next ** 2) * c
        posterior_log_variance_clipped = log(posterior_variance, eps = 1e-20)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def q_sample(self, x_start, t, noise = None):
        # forward process: x_t = alpha(t) * x_0 + sigma(t) * noise; also returns log SNR
        dtype = x_start.dtype

        if isinstance(t, float):
            batch = x_start.shape[0]
            t = torch.full((batch,), t, device = x_start.device, dtype = dtype)

        noise = default(noise, lambda: torch.randn_like(x_start))
        log_snr = self.log_snr(t).type(dtype)
        log_snr_padded_dim = right_pad_dims_to(x_start, log_snr)
        alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim)

        return alpha * x_start + sigma * noise, log_snr

    def q_sample_from_to(self, x_from, from_t, to_t, noise = None):
        # jump directly from noise level from_t to noise level to_t in one step
        shape, device, dtype = x_from.shape, x_from.device, x_from.dtype
        batch = shape[0]

        if isinstance(from_t, float):
            from_t = torch.full((batch,), from_t, device = device, dtype = dtype)

        if isinstance(to_t, float):
            to_t = torch.full((batch,), to_t, device = device, dtype = dtype)

        noise = default(noise, lambda: torch.randn_like(x_from))

        log_snr = self.log_snr(from_t)
        log_snr_padded_dim = right_pad_dims_to(x_from, log_snr)
        alpha, sigma = log_snr_to_alpha_sigma(log_snr_padded_dim)

        log_snr_to = self.log_snr(to_t)
        log_snr_padded_dim_to = right_pad_dims_to(x_from, log_snr_to)
        alpha_to, sigma_to = log_snr_to_alpha_sigma(log_snr_padded_dim_to)

        return x_from * (alpha_to / alpha) + noise * (sigma_to * alpha - sigma * alpha_to) / alpha

    def predict_start_from_noise(self, x_t, t, noise):
        # invert the forward process: x_0 = (x_t - sigma * noise) / alpha
        # (alpha clamped to avoid division blowup near t = 1)
        log_snr = self.log_snr(t)
        log_snr = right_pad_dims_to(x_t, log_snr)
        alpha, sigma = log_snr_to_alpha_sigma(log_snr)
        return (x_t - sigma * noise) / alpha.clamp(min = 1e-8)
284
+
285
+ # norms and residuals
286
+
287
class LayerNorm(nn.Module):
    # bias-free layer norm with an optional "stable" variant that first divides by
    # the max (detached) for numerical stability; `dim` selects the feature axis
    # (-1 for token features, -3 for channel-first images via ChanLayerNorm below)
    def __init__(self, feats, stable = False, dim = -1):
        super().__init__()
        self.stable = stable
        self.dim = dim

        # gain shaped (feats, 1, ..., 1) so it broadcasts over the dims after `dim`
        self.g = nn.Parameter(torch.ones(feats, *((1,) * (-dim - 1))))

    def forward(self, x):
        dtype, dim = x.dtype, self.dim

        if self.stable:
            x = x / x.amax(dim = dim, keepdim = True).detach()

        # larger eps in half precision to avoid rsqrt blowups
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        var = torch.var(x, dim = dim, unbiased = False, keepdim = True)
        mean = torch.mean(x, dim = dim, keepdim = True)

        return (x - mean) * (var + eps).rsqrt().type(dtype) * self.g.type(dtype)
306
+
307
+ ChanLayerNorm = partial(LayerNorm, dim = -3)
308
+
309
class Always():
    """Callable that ignores all arguments and always returns the fixed value it
    was constructed with."""

    def __init__(self, val):
        self.val = val

    def __call__(self, *args, **kwargs):
        return self.val
315
+
316
class Residual(nn.Module):
    """Skip connection around `fn`: forward(x, **kwargs) = fn(x, **kwargs) + x."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        return x + self.fn(x, **kwargs)
323
+
324
class Parallel(nn.Module):
    """Runs every branch module on the same input and returns the sum of their outputs."""

    def __init__(self, *fns):
        super().__init__()
        self.fns = nn.ModuleList(fns)

    def forward(self, x):
        return sum(fn(x) for fn in self.fns)
332
+
333
+ # attention pooling
334
+
335
class PerceiverAttention(nn.Module):
    # cross attention from a small set of latent tokens to a (text) sequence,
    # used by PerceiverResampler for attention pooling of text embeddings
    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        cosine_sim_attn = False
    ):
        super().__init__()
        # cosine-sim attention replaces the usual 1/sqrt(d) scale with a fixed scale of 16
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1
        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = nn.LayerNorm(dim)
        self.norm_latents = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            nn.LayerNorm(dim)
        )

    def forward(self, x, latents, mask = None):
        # x: (b, n, dim) sequence; latents: (b, m, dim); mask: (b, n) bool over x
        x = self.norm(x)
        latents = self.norm_latents(latents)

        b, h = x.shape[0], self.heads

        q = self.to_q(latents)

        # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to
        kv_input = torch.cat((x, latents), dim = -2)
        k, v = self.to_kv(kv_input).chunk(2, dim = -1)

        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = h)

        q = q * self.scale

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # similarities and masking

        sim = einsum('... i d, ... j d -> ... i j', q, k) * self.cosine_sim_scale

        if exists(mask):
            max_neg_value = -torch.finfo(sim.dtype).max
            # pad the mask with True for the appended latent key positions
            mask = F.pad(mask, (0, latents.shape[-2]), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # attention (softmax in float32 for stability, then cast back)

        attn = sim.softmax(dim = -1, dtype = torch.float32)
        attn = attn.to(sim.dtype)

        out = einsum('... i j, ... j d -> ... i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h = h)
        return self.to_out(out)
402
+
403
class PerceiverResampler(nn.Module):
    # attention pooling: compresses a variable-length embedding sequence into a
    # fixed number of latent tokens via stacked perceiver cross-attention blocks
    def __init__(
        self,
        *,
        dim,
        depth,
        dim_head = 64,
        heads = 8,
        num_latents = 64,
        num_latents_mean_pooled = 4, # number of latents derived from mean pooled representation of the sequence
        max_seq_len = 512,
        ff_mult = 4,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.pos_emb = nn.Embedding(max_seq_len, dim)

        self.latents = nn.Parameter(torch.randn(num_latents, dim))

        self.to_latents_from_mean_pooled_seq = None

        if num_latents_mean_pooled > 0:
            # projects the mean-pooled sequence into extra latent tokens
            self.to_latents_from_mean_pooled_seq = nn.Sequential(
                LayerNorm(dim),
                nn.Linear(dim, dim * num_latents_mean_pooled),
                Rearrange('b (n d) -> b n d', n = num_latents_mean_pooled)
            )

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PerceiverAttention(dim = dim, dim_head = dim_head, heads = heads, cosine_sim_attn = cosine_sim_attn),
                FeedForward(dim = dim, mult = ff_mult)  # NOTE: FeedForward is defined later in this module
            ]))

    def forward(self, x, mask = None):
        # x: (b, n, dim); returns (b, num_latents_mean_pooled + num_latents, dim)
        n, device = x.shape[1], x.device
        pos_emb = self.pos_emb(torch.arange(n, device = device))

        x_with_pos = x + pos_emb

        latents = repeat(self.latents, 'n d -> b n d', b = x.shape[0])

        if exists(self.to_latents_from_mean_pooled_seq):
            # prepend a few latents summarizing the mean-pooled sequence
            meanpooled_seq = masked_mean(x, dim = 1, mask = torch.ones(x.shape[:2], device = x.device, dtype = torch.bool))
            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
            latents = torch.cat((meanpooled_latents, latents), dim = -2)

        for attn, ff in self.layers:
            latents = attn(x_with_pos, latents, mask = mask) + latents
            latents = ff(latents) + latents

        return latents
456
+
457
+ # attention
458
+
459
class Attention(nn.Module):
    # multi-head queries with single-head (shared) keys/values — note to_kv outputs
    # only dim_head * 2 and the einsum pattern carries no head axis on k/v;
    # includes a learned null key/value for classifier-free guidance
    def __init__(
        self,
        dim,
        *,
        dim_head = 64,
        heads = 8,
        context_dim = None,
        cosine_sim_attn = False
    ):
        super().__init__()
        # cosine-sim attention swaps the 1/sqrt(d) scale for a fixed scale of 16
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1.
        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = LayerNorm(dim)

        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, dim_head * 2, bias = False)

        self.to_context = nn.Sequential(nn.LayerNorm(context_dim), nn.Linear(context_dim, dim_head * 2)) if exists(context_dim) else None

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            LayerNorm(dim)
        )

    def forward(self, x, context = None, mask = None, attn_bias = None):
        # x: (b, n, dim); context: optional extra (text) tokens prepended to k/v;
        # mask: (b, n) bool over x; attn_bias: additive bias on the similarity logits
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)

        q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1))

        q = rearrange(q, 'b n (h d) -> b h n d', h = self.heads)
        q = q * self.scale

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b 1 d', b = b)
        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        # add text conditioning, if present

        if exists(context):
            assert exists(self.to_context)
            ck, cv = self.to_context(context).chunk(2, dim = -1)
            k = torch.cat((ck, k), dim = -2)
            v = torch.cat((cv, v), dim = -2)

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # calculate query / key similarities

        sim = einsum('b h i d, b j d -> b h i j', q, k) * self.cosine_sim_scale

        # relative positional encoding (T5 style)

        if exists(attn_bias):
            sim = sim + attn_bias

        # masking

        max_neg_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            # the (1, 0) pad accounts for the one null key; NOTE(review): it does NOT
            # cover context keys, so mask + context together would broadcast-mismatch —
            # callers appear to pass one or the other; confirm against call sites
            mask = F.pad(mask, (1, 0), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # attention (softmax in float32 for stability, then cast back)

        attn = sim.softmax(dim = -1, dtype = torch.float32)
        attn = attn.to(sim.dtype)

        # aggregate values

        out = einsum('b h i j, b j d -> b h i d', attn, v)

        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
548
+
549
+ # decoder
550
+
551
def Upsample(dim, dim_out = None):
    """2x nearest-neighbor upsample followed by a 3x3 conv mapping dim -> dim_out
    (dim_out defaults to dim)."""
    dim_out = default(dim_out, dim)

    return nn.Sequential(
        nn.Upsample(scale_factor = 2, mode = 'nearest'),
        nn.Conv2d(dim, dim_out, 3, padding = 1),
    )
558
+
559
class PixelShuffleUpsample(nn.Module):
    """
    code shared by @MalumaDev at DALLE2-pytorch for addressing checkboard artifacts
    https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf
    """
    def __init__(self, dim, dim_out = None):
        super().__init__()
        dim_out = default(dim_out, dim)
        # 1x1 conv produces 4x channels which PixelShuffle folds into 2x spatial size
        conv = nn.Conv2d(dim, dim_out * 4, 1)

        self.net = nn.Sequential(
            conv,
            nn.SiLU(),
            nn.PixelShuffle(2)
        )

        self.init_conv_(conv)

    def init_conv_(self, conv):
        # ICNR-style init: each group of 4 output channels shares one kernel, so the
        # pixel shuffle initially behaves like nearest-neighbor upsampling
        # (this is the checkerboard-artifact fix referenced above)
        o, i, h, w = conv.weight.shape
        conv_weight = torch.empty(o // 4, i, h, w)
        nn.init.kaiming_uniform_(conv_weight)
        conv_weight = repeat(conv_weight, 'o ... -> (o 4) ...')

        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def forward(self, x):
        return self.net(x)
588
+
589
def Downsample(dim, dim_out = None):
    """2x downsampling via pixel-unshuffle then a 1x1 conv (SP-conv).

    https://arxiv.org/abs/2208.03641 shows this is the most optimal way to downsample
    named SP-conv in the paper, but basically a pixel unshuffle
    """
    out_channels = default(dim_out, dim)

    # fold each 2x2 spatial patch into channels, then project 4*dim -> out_channels
    unshuffle = Rearrange('b c (h s1) (w s2) -> b (c s1 s2) h w', s1 = 2, s2 = 2)
    project = nn.Conv2d(dim * 4, out_channels, 1)
    return nn.Sequential(unshuffle, project)
597
+
598
class SinusoidalPosEmb(nn.Module):
    """Fixed (non-learned) transformer-style sinusoidal position embedding.

    forward(x) takes a 1-D tensor of positions/timesteps of shape (n,) and
    returns a (n, dim) embedding of concatenated sines and cosines.

    Fixes vs. original:
    - guards the frequency-scale denominator with max(half_dim - 1, 1), so
      dim == 2 (half_dim == 1) no longer raises ZeroDivisionError
    - the outer product is done with plain broadcasting instead of einops
    """
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        # log-spaced frequencies from 1 down to 1/10000
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, device = x.device) * -emb)
        # outer product: (n, 1) * (1, half_dim) -> (n, half_dim)
        emb = x[:, None] * emb[None, :]
        return torch.cat((emb.sin(), emb.cos()), dim = -1)
609
+
610
class LearnedSinusoidalPosEmb(nn.Module):
    """Learned sinusoidal position embedding: random (trainable) frequencies.

    following @crowsonkb 's lead with learned sinusoidal pos emb
    https://github.com/crowsonkb/v-diffusion-jax/blob/master/diffusion/models/danbooru_128.py#L8

    forward(x) maps positions of shape (b,) to embeddings of shape (b, dim + 1):
    the raw position concatenated with dim//2 sines and dim//2 cosines.
    """

    def __init__(self, dim):
        super().__init__()
        assert (dim % 2) == 0
        self.weights = nn.Parameter(torch.randn(dim // 2))

    def forward(self, x):
        x = rearrange(x, 'b -> b 1')
        freqs = x * rearrange(self.weights, 'd -> 1 d') * 2 * math.pi
        # equivalent to concatenating x with (sin | cos) in one step
        return torch.cat((x, freqs.sin(), freqs.cos()), dim = -1)
626
+
627
class Block(nn.Module):
    """Basic conv block: GroupNorm -> optional FiLM (scale, shift) -> SiLU -> 3x3 conv."""
    def __init__(
        self,
        dim,
        dim_out,
        groups = 8,
        norm = True
    ):
        super().__init__()
        self.groupnorm = nn.GroupNorm(groups, dim) if norm else Identity()
        self.activation = nn.SiLU()
        self.project = nn.Conv2d(dim, dim_out, 3, padding = 1)

    def forward(self, x, scale_shift = None):
        h = self.groupnorm(x)

        if exists(scale_shift):
            # FiLM-style conditioning; scale is a residual around the identity
            scale, shift = scale_shift
            h = h * (scale + 1) + shift

        return self.project(self.activation(h))
649
+
650
class ResnetBlock(nn.Module):
    """Residual block with optional FiLM time conditioning and optional cross
    attention over conditioning tokens (text / time tokens).

    dim -> dim_out; the residual path uses a 1x1 conv when dims differ.
    """
    def __init__(
        self,
        dim,
        dim_out,
        *,
        cond_dim = None,
        time_cond_dim = None,
        groups = 8,
        linear_attn = False,
        use_gca = False,
        squeeze_excite = False,
        **attn_kwargs
    ):
        super().__init__()

        # projects the pooled time embedding to per-channel (scale, shift) for block2
        self.time_mlp = None

        if exists(time_cond_dim):
            self.time_mlp = nn.Sequential(
                nn.SiLU(),
                nn.Linear(time_cond_dim, dim_out * 2)
            )

        # optional cross attention to `cond` tokens, applied between block1 and block2
        self.cross_attn = None

        if exists(cond_dim):
            attn_klass = CrossAttention if not linear_attn else LinearCrossAttention

            # flatten spatial dims to a token sequence around the attention module
            self.cross_attn = EinopsToAndFrom(
                'b c h w',
                'b (h w) c',
                attn_klass(
                    dim = dim_out,
                    context_dim = cond_dim,
                    **attn_kwargs
                )
            )

        self.block1 = Block(dim, dim_out, groups = groups)
        self.block2 = Block(dim_out, dim_out, groups = groups)

        # global-context gating; Always(1) is a no-op multiplier when disabled
        self.gca = GlobalContext(dim_in = dim_out, dim_out = dim_out) if use_gca else Always(1)

        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else Identity()

        # NOTE(review): `squeeze_excite` is accepted but never used in this block

    def forward(self, x, time_emb = None, cond = None):
        # time_emb: pooled time conditioning; cond: token sequence for cross attention

        scale_shift = None
        if exists(self.time_mlp) and exists(time_emb):
            time_emb = self.time_mlp(time_emb)
            time_emb = rearrange(time_emb, 'b c -> b c 1 1')
            # split into (scale, shift) along channels; consumed by block2
            scale_shift = time_emb.chunk(2, dim = 1)

        h = self.block1(x)

        if exists(self.cross_attn):
            assert exists(cond)
            # residual cross attention over flattened spatial positions
            h = self.cross_attn(h, context = cond) + h

        h = self.block2(h, scale_shift = scale_shift)

        # channel-wise gating from global context (or 1 when disabled)
        h = h * self.gca(h)

        return h + self.res_conv(x)
716
+
717
class CrossAttention(nn.Module):
    """Multi-head cross attention from a query sequence `x` to a `context`
    sequence, with a learned null key/value prepended for classifier-free
    guidance. Optionally uses cosine-similarity (l2-normalized) attention.
    """
    def __init__(
        self,
        dim,
        *,
        context_dim = None,
        dim_head = 64,
        heads = 8,
        norm_context = False,
        cosine_sim_attn = False
    ):
        super().__init__()
        # with cosine-sim attention the usual 1/sqrt(d) scale is replaced by a fixed 16
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1.
        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        context_dim = default(context_dim, dim)

        self.norm = LayerNorm(dim)
        self.norm_context = LayerNorm(context_dim) if norm_context else Identity()

        # learned null key/value pair, shared across heads (shape: 2 x dim_head)
        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            LayerNorm(dim)
        )

    def forward(self, x, context, mask = None):
        # x: (b, n, dim) queries; context: (b, m, context_dim); mask: (b, m) bool
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)
        context = self.norm_context(context)

        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))

        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = self.heads)

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b h 1 d', h = self.heads, b = b)

        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        q = q * self.scale

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # similarities

        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.cosine_sim_scale

        # masking

        max_neg_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            # pad mask with True at position 0 so the null key is always attendable
            mask = F.pad(mask, (1, 0), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # softmax in float32 for numerical stability, then cast back
        attn = sim.softmax(dim = -1, dtype = torch.float32)
        attn = attn.to(sim.dtype)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
793
+
794
class LinearCrossAttention(CrossAttention):
    """Linear-complexity variant of CrossAttention (softmax over feature dims
    instead of pairwise similarities). Reuses the parent's projections; heads
    are folded into the batch dimension.
    """
    def forward(self, x, context, mask = None):
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)
        context = self.norm_context(context)

        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))

        # heads folded into batch: shapes become ((b h), n, d)
        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> (b h) n d', h = self.heads)

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> (b h) 1 d', h = self.heads, b = b)

        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        # masking

        max_neg_value = -torch.finfo(x.dtype).max

        if exists(mask):
            # pad with True so the prepended null key/value survive the mask
            mask = F.pad(mask, (1, 0), value = True)
            mask = rearrange(mask, 'b n -> b n 1')
            # NOTE(review): mask is (b, n+1, 1) while k/v are ((b*h), n+1, d);
            # broadcasting looks like it only works when b == 1 or heads == 1 — verify
            k = k.masked_fill(~mask, max_neg_value)
            v = v.masked_fill(~mask, 0.)

        # linear attention

        # softmax q over features, k over sequence — the linear-attention kernel trick
        q = q.softmax(dim = -1)
        k = k.softmax(dim = -2)

        q = q * self.scale

        context = einsum('b n d, b n e -> b d e', k, v)
        out = einsum('b n d, b d e -> b n e', q, context)
        out = rearrange(out, '(b h) n d -> b n (h d)', h = self.heads)
        return self.to_out(out)
833
+
834
class LinearAttention(nn.Module):
    """Linear-complexity self attention over a 2D feature map, with depthwise
    3x3 convs after each q/k/v projection and optional extra key/values derived
    from a `context` token sequence (e.g. text embeddings).
    """
    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 8,
        dropout = 0.05,
        context_dim = None,
        **kwargs
    ):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        inner_dim = dim_head * heads
        self.norm = ChanLayerNorm(dim)

        self.nonlin = nn.SiLU()

        # each projection: dropout -> 1x1 conv -> depthwise 3x3 conv
        self.to_q = nn.Sequential(
            nn.Dropout(dropout),
            nn.Conv2d(dim, inner_dim, 1, bias = False),
            nn.Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        self.to_k = nn.Sequential(
            nn.Dropout(dropout),
            nn.Conv2d(dim, inner_dim, 1, bias = False),
            nn.Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        self.to_v = nn.Sequential(
            nn.Dropout(dropout),
            nn.Conv2d(dim, inner_dim, 1, bias = False),
            nn.Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        # projects context tokens to extra (key, value) pairs, when context_dim given
        self.to_context = nn.Sequential(nn.LayerNorm(context_dim), nn.Linear(context_dim, inner_dim * 2, bias = False)) if exists(context_dim) else None

        self.to_out = nn.Sequential(
            nn.Conv2d(inner_dim, dim, 1, bias = False),
            ChanLayerNorm(dim)
        )

    def forward(self, fmap, context = None):
        # fmap: (b, dim, x, y); context: optional (b, n, context_dim) tokens
        h, x, y = self.heads, *fmap.shape[-2:]

        fmap = self.norm(fmap)
        q, k, v = map(lambda fn: fn(fmap), (self.to_q, self.to_k, self.to_v))
        # heads folded into batch; spatial positions flattened to a sequence
        q, k, v = rearrange_many((q, k, v), 'b (h c) x y -> (b h) (x y) c', h = h)

        if exists(context):
            assert exists(self.to_context)
            # append context-derived keys/values to the spatial ones
            ck, cv = self.to_context(context).chunk(2, dim = -1)
            ck, cv = rearrange_many((ck, cv), 'b n (h d) -> (b h) n d', h = h)
            k = torch.cat((k, ck), dim = -2)
            v = torch.cat((v, cv), dim = -2)

        # linear-attention kernel trick: softmax q over features, k over sequence
        q = q.softmax(dim = -1)
        k = k.softmax(dim = -2)

        q = q * self.scale

        context = einsum('b n d, b n e -> b d e', k, v)
        out = einsum('b n d, b d e -> b n e', q, context)
        out = rearrange(out, '(b h) (x y) d -> b (h d) x y', h = h, x = x, y = y)

        out = self.nonlin(out)
        return self.to_out(out)
902
+
903
class GlobalContext(nn.Module):
    """Attention-esque squeeze-excitation: a single-head spatial attention map
    pools the feature map to one vector per channel, which a small bottleneck
    MLP turns into per-channel sigmoid gates (shape (b, dim_out, 1, 1))."""

    def __init__(
        self,
        *,
        dim_in,
        dim_out
    ):
        super().__init__()
        # produces one spatial attention logit per position
        self.to_k = nn.Conv2d(dim_in, 1, 1)
        inner_dim = max(3, dim_out // 2)

        self.net = nn.Sequential(
            nn.Conv2d(dim_in, inner_dim, 1),
            nn.SiLU(),
            nn.Conv2d(inner_dim, dim_out, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        scores = self.to_k(x)
        # flatten spatial dims of both the feature map and the attention logits
        x, scores = rearrange_many((x, scores), 'b n ... -> b n (...)')
        # attention-weighted spatial pooling -> (b, c, 1)
        pooled = einsum('b i n, b c n -> b c i', scores.softmax(dim = -1), x)
        pooled = rearrange(pooled, '... -> ... 1')
        return self.net(pooled)
929
+
930
def FeedForward(dim, mult = 2):
    """Token-wise feedforward: LayerNorm -> Linear (dim -> dim*mult) -> GELU
    -> LayerNorm -> Linear back to dim. No biases on the linears."""
    inner_dim = int(dim * mult)

    layers = (
        LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias = False),
        nn.GELU(),
        LayerNorm(inner_dim),
        nn.Linear(inner_dim, dim, bias = False),
    )
    return nn.Sequential(*layers)
939
+
940
def ChanFeedForward(dim, mult = 2):
    """Channel-wise feedforward over 2D maps, built from 1x1 convs.

    in paper, it seems for self attention layers they did feedforwards with
    twice channel width
    """
    inner_dim = int(dim * mult)

    layers = (
        ChanLayerNorm(dim),
        nn.Conv2d(dim, inner_dim, 1, bias = False),
        nn.GELU(),
        ChanLayerNorm(inner_dim),
        nn.Conv2d(inner_dim, dim, 1, bias = False),
    )
    return nn.Sequential(*layers)
949
+
950
class TransformerBlock(nn.Module):
    """Stack of `depth` residual (full self-attention, channel feedforward)
    pairs over a 2D feature map. EinopsToAndFrom flattens 'b c h w' to
    'b (h w) c' around the attention so it operates on spatial tokens.
    """
    def __init__(
        self,
        dim,
        *,
        depth = 1,
        heads = 8,
        dim_head = 32,
        ff_mult = 2,
        context_dim = None,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                EinopsToAndFrom('b c h w', 'b (h w) c', Attention(dim = dim, heads = heads, dim_head = dim_head, context_dim = context_dim, cosine_sim_attn = cosine_sim_attn)),
                ChanFeedForward(dim = dim, mult = ff_mult)
            ]))

    def forward(self, x, context = None):
        # residual attention then residual feedforward, repeated `depth` times
        for attn, ff in self.layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x
976
+
977
class LinearAttentionTransformerBlock(nn.Module):
    """Like TransformerBlock but with linear-complexity attention; extra
    keyword arguments are accepted and ignored."""
    def __init__(
        self,
        dim,
        *,
        depth = 1,
        heads = 8,
        dim_head = 32,
        ff_mult = 2,
        context_dim = None,
        **kwargs
    ):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.ModuleList([
                LinearAttention(dim = dim, heads = heads, dim_head = dim_head, context_dim = context_dim),
                ChanFeedForward(dim = dim, mult = ff_mult),
            ])
            for _ in range(depth)
        ])

    def forward(self, x, context = None):
        # residual attention then residual feedforward per layer
        for attn, ff in self.layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x
1003
+
1004
class CrossEmbedLayer(nn.Module):
    """Multi-scale convolutional embedding: several parallel convs with
    different kernel sizes (same stride) whose outputs are concatenated along
    channels. Larger kernels get fewer channels (halving per scale)."""
    def __init__(
        self,
        dim_in,
        kernel_sizes,
        dim_out = None,
        stride = 2
    ):
        super().__init__()
        # kernel and stride parity must match so padding keeps outputs aligned
        assert all((k % 2) == (stride % 2) for k in kernel_sizes)
        dim_out = default(dim_out, dim_in)

        kernel_sizes = sorted(kernel_sizes)
        num_scales = len(kernel_sizes)

        # channel budget halves per scale; the last scale absorbs the remainder
        dim_scales = [int(dim_out / (2 ** i)) for i in range(1, num_scales)]
        dim_scales = [*dim_scales, dim_out - sum(dim_scales)]

        self.convs = nn.ModuleList([
            nn.Conv2d(dim_in, dim_scale, kernel, stride = stride, padding = (kernel - stride) // 2)
            for kernel, dim_scale in zip(kernel_sizes, dim_scales)
        ])

    def forward(self, x):
        return torch.cat([conv(x) for conv in self.convs], dim = 1)
1030
+
1031
class UpsampleCombiner(nn.Module):
    """Optionally combines intermediate upsample-path feature maps with the
    final feature map (resized to match, conv-projected, channel-concatenated).
    When disabled, acts as identity and reports dim_out == dim.
    """
    def __init__(
        self,
        dim,
        *,
        enabled = False,
        dim_ins = tuple(),
        dim_outs = tuple()
    ):
        super().__init__()
        dim_outs = cast_tuple(dim_outs, len(dim_ins))
        assert len(dim_ins) == len(dim_outs)

        self.enabled = enabled

        if not self.enabled:
            # pass-through: downstream layers size themselves off self.dim_out
            self.dim_out = dim
            return

        self.fmap_convs = nn.ModuleList([Block(dim_in, dim_out) for dim_in, dim_out in zip(dim_ins, dim_outs)])
        self.dim_out = dim + (sum(dim_outs) if len(dim_outs) > 0 else 0)

    def forward(self, x, fmaps = None):
        target_size = x.shape[-1]

        fmaps = default(fmaps, tuple())

        if not self.enabled or len(fmaps) == 0 or len(self.fmap_convs) == 0:
            return x

        # resize each saved fmap to x's spatial size, project, then concat on channels
        fmaps = [resize_image_to(fmap, target_size) for fmap in fmaps]
        outs = [conv(fmap) for fmap, conv in zip(fmaps, self.fmap_convs)]
        return torch.cat((x, *outs), dim = 1)
1064
+
1065
+ class Unet(nn.Module):
1066
    def __init__(
        self,
        *,
        dim,
        image_embed_dim = 1024,
        text_embed_dim = get_encoded_dim(DEFAULT_T5_NAME),
        num_resnet_blocks = 1,
        cond_dim = None,
        num_image_tokens = 4,
        num_time_tokens = 2,
        learned_sinu_pos_emb_dim = 16,
        out_dim = None,
        dim_mults=(1, 2, 4, 8),
        cond_images_channels = 0,
        channels = 3,
        channels_out = None,
        attn_dim_head = 64,
        attn_heads = 8,
        ff_mult = 2.,
        lowres_cond = False, # for cascading diffusion - https://cascaded-diffusion.github.io/
        layer_attns = True,
        layer_attns_depth = 1,
        layer_attns_add_text_cond = True, # whether to condition the self-attention blocks with the text embeddings, as described in Appendix D.3.1
        attend_at_middle = True, # whether to have a layer of attention at the bottleneck (can turn off for higher resolution in cascading DDPM, before bringing in efficient attention)
        layer_cross_attns = True,
        use_linear_attn = False,
        use_linear_cross_attn = False,
        cond_on_text = True,
        max_text_len = 256,
        init_dim = None,
        resnet_groups = 8,
        init_conv_kernel_size = 7, # kernel size of initial conv, if not using cross embed
        init_cross_embed = True,
        init_cross_embed_kernel_sizes = (3, 7, 15),
        cross_embed_downsample = False,
        cross_embed_downsample_kernel_sizes = (2, 4),
        attn_pool_text = True,
        attn_pool_num_latents = 32,
        dropout = 0.,
        memory_efficient = False,
        init_conv_to_final_conv_residual = False,
        use_global_context_attn = True,
        scale_skip_connection = True,
        final_resnet_block = True,
        final_conv_kernel_size = 3,
        cosine_sim_attn = False,
        self_cond = False,
        combine_upsample_fmaps = False, # combine feature maps from all upsample blocks, used in unet squared successfully
        pixel_shuffle_upsample = True # may address checkboard artifacts
    ):
        """Build the Imagen U-net: initial conv, time/text conditioning heads,
        down path, bottleneck, up path, and final projection to channels_out.

        NOTE(review): image_embed_dim, num_image_tokens, out_dim, dropout and
        layer_attns_add_text_cond are accepted but not referenced anywhere in
        this constructor — confirm against the rest of the class.
        """
        super().__init__()

        # guide researchers

        assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8'

        if dim < 128:
            print_once('The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/')

        # save locals to take care of some hyperparameters for cascading DDPM
        # (captured kwargs are reused by cast_model_parameters / to_config_and_state_dict)

        self._locals = locals()
        self._locals.pop('self', None)
        self._locals.pop('__class__', None)

        # determine dimensions

        self.channels = channels
        self.channels_out = default(channels_out, channels)

        # (1) in cascading diffusion, one concats the low resolution image, blurred, for conditioning the higher resolution synthesis
        # (2) in self conditioning, one appends the predict x0 (x_start)
        init_channels = channels * (1 + int(lowres_cond) + int(self_cond))
        init_dim = default(init_dim, dim)

        self.self_cond = self_cond

        # optional image conditioning

        self.has_cond_image = cond_images_channels > 0
        self.cond_images_channels = cond_images_channels

        init_channels += cond_images_channels

        # initial convolution

        self.init_conv = CrossEmbedLayer(init_channels, dim_out = init_dim, kernel_sizes = init_cross_embed_kernel_sizes, stride = 1) if init_cross_embed else nn.Conv2d(init_channels, init_dim, init_conv_kernel_size, padding = init_conv_kernel_size // 2)

        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))

        # time conditioning

        cond_dim = default(cond_dim, dim)
        time_cond_dim = dim * 4 * (2 if lowres_cond else 1)

        # embedding time for log(snr) noise from continuous version

        sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim)
        sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1

        self.to_time_hiddens = nn.Sequential(
            sinu_pos_emb,
            nn.Linear(sinu_pos_emb_input_dim, time_cond_dim),
            nn.SiLU()
        )

        self.to_time_cond = nn.Sequential(
            nn.Linear(time_cond_dim, time_cond_dim)
        )

        # project to time tokens as well as time hiddens

        self.to_time_tokens = nn.Sequential(
            nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
            Rearrange('b (r d) -> b r d', r = num_time_tokens)
        )

        # low res aug noise conditioning

        self.lowres_cond = lowres_cond

        if lowres_cond:
            self.to_lowres_time_hiddens = nn.Sequential(
                LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim),
                nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim),
                nn.SiLU()
            )

            self.to_lowres_time_cond = nn.Sequential(
                nn.Linear(time_cond_dim, time_cond_dim)
            )

            self.to_lowres_time_tokens = nn.Sequential(
                nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
                Rearrange('b (r d) -> b r d', r = num_time_tokens)
            )

        # normalizations

        self.norm_cond = nn.LayerNorm(cond_dim)

        # text encoding conditioning (optional)

        self.text_to_cond = None

        if cond_on_text:
            assert exists(text_embed_dim), 'text_embed_dim must be given to the unet if cond_on_text is True'
            self.text_to_cond = nn.Linear(text_embed_dim, cond_dim)

        # finer control over whether to condition on text encodings

        self.cond_on_text = cond_on_text

        # attention pooling

        self.attn_pool = PerceiverResampler(dim = cond_dim, depth = 2, dim_head = attn_dim_head, heads = attn_heads, num_latents = attn_pool_num_latents, cosine_sim_attn = cosine_sim_attn) if attn_pool_text else None

        # for classifier free guidance

        self.max_text_len = max_text_len

        # learned null embeddings substituted when text conditioning is dropped
        self.null_text_embed = nn.Parameter(torch.randn(1, max_text_len, cond_dim))
        self.null_text_hidden = nn.Parameter(torch.randn(1, time_cond_dim))

        # for non-attention based text conditioning at all points in the network where time is also conditioned

        self.to_text_non_attn_cond = None

        if cond_on_text:
            self.to_text_non_attn_cond = nn.Sequential(
                nn.LayerNorm(cond_dim),
                nn.Linear(cond_dim, time_cond_dim),
                nn.SiLU(),
                nn.Linear(time_cond_dim, time_cond_dim)
            )

        # attention related params

        attn_kwargs = dict(heads = attn_heads, dim_head = attn_dim_head, cosine_sim_attn = cosine_sim_attn)

        num_layers = len(in_out)

        # resnet block klass

        # per-resolution hyperparameters; scalars broadcast to one value per layer
        num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers)
        resnet_groups = cast_tuple(resnet_groups, num_layers)

        resnet_klass = partial(ResnetBlock, **attn_kwargs)

        layer_attns = cast_tuple(layer_attns, num_layers)
        layer_attns_depth = cast_tuple(layer_attns_depth, num_layers)
        layer_cross_attns = cast_tuple(layer_cross_attns, num_layers)

        use_linear_attn = cast_tuple(use_linear_attn, num_layers)
        use_linear_cross_attn = cast_tuple(use_linear_cross_attn, num_layers)

        assert all([layers == num_layers for layers in list(map(len, (resnet_groups, layer_attns, layer_cross_attns)))])

        # downsample klass

        downsample_klass = Downsample

        if cross_embed_downsample:
            downsample_klass = partial(CrossEmbedLayer, kernel_sizes = cross_embed_downsample_kernel_sizes)

        # initial resnet block (for memory efficient unet)

        self.init_resnet_block = resnet_klass(init_dim, init_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[0], use_gca = use_global_context_attn) if memory_efficient else None

        # scale for resnet skip connections

        self.skip_connect_scale = 1. if not scale_skip_connection else (2 ** -0.5)

        # layers

        self.downs = nn.ModuleList([])
        self.ups = nn.ModuleList([])
        num_resolutions = len(in_out)

        layer_params = [num_resnet_blocks, resnet_groups, layer_attns, layer_attns_depth, layer_cross_attns, use_linear_attn, use_linear_cross_attn]
        reversed_layer_params = list(map(reversed, layer_params))

        # downsampling layers

        skip_connect_dims = [] # keep track of skip connection dimensions

        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn) in enumerate(zip(in_out, *layer_params)):
            is_last = ind >= (num_resolutions - 1)

            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None

            if layer_attn:
                transformer_block_klass = TransformerBlock
            elif layer_use_linear_attn:
                transformer_block_klass = LinearAttentionTransformerBlock
            else:
                transformer_block_klass = Identity

            current_dim = dim_in

            # whether to pre-downsample, from memory efficient unet

            pre_downsample = None

            if memory_efficient:
                pre_downsample = downsample_klass(dim_in, dim_out)
                current_dim = dim_out

            skip_connect_dims.append(current_dim)

            # whether to do post-downsample, for non-memory efficient unet

            post_downsample = None
            if not memory_efficient:
                # last resolution keeps spatial size: parallel 3x3 + 1x1 convs instead of a downsample
                post_downsample = downsample_klass(current_dim, dim_out) if not is_last else Parallel(nn.Conv2d(dim_in, dim_out, 3, padding = 1), nn.Conv2d(dim_in, dim_out, 1))

            self.downs.append(nn.ModuleList([
                pre_downsample,
                resnet_klass(current_dim, current_dim, cond_dim = layer_cond_dim, linear_attn = layer_use_linear_cross_attn, time_cond_dim = time_cond_dim, groups = groups),
                nn.ModuleList([ResnetBlock(current_dim, current_dim, time_cond_dim = time_cond_dim, groups = groups, use_gca = use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
                transformer_block_klass(dim = current_dim, depth = layer_attn_depth, ff_mult = ff_mult, context_dim = cond_dim, **attn_kwargs),
                post_downsample
            ]))

        # middle layers

        mid_dim = dims[-1]

        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])
        self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])

        # upsample klass

        upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample

        # upsampling layers

        upsample_fmap_dims = []

        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn) in enumerate(zip(reversed(in_out), *reversed_layer_params)):
            is_last = ind == (len(in_out) - 1)

            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None

            if layer_attn:
                transformer_block_klass = TransformerBlock
            elif layer_use_linear_attn:
                transformer_block_klass = LinearAttentionTransformerBlock
            else:
                transformer_block_klass = Identity

            skip_connect_dim = skip_connect_dims.pop()

            upsample_fmap_dims.append(dim_out)

            self.ups.append(nn.ModuleList([
                resnet_klass(dim_out + skip_connect_dim, dim_out, cond_dim = layer_cond_dim, linear_attn = layer_use_linear_cross_attn, time_cond_dim = time_cond_dim, groups = groups),
                nn.ModuleList([ResnetBlock(dim_out + skip_connect_dim, dim_out, time_cond_dim = time_cond_dim, groups = groups, use_gca = use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
                transformer_block_klass(dim = dim_out, depth = layer_attn_depth, ff_mult = ff_mult, context_dim = cond_dim, **attn_kwargs),
                upsample_klass(dim_out, dim_in) if not is_last or memory_efficient else Identity()
            ]))

        # whether to combine feature maps from all upsample blocks before final resnet block out

        self.upsample_combiner = UpsampleCombiner(
            dim = dim,
            enabled = combine_upsample_fmaps,
            dim_ins = upsample_fmap_dims,
            dim_outs = dim
        )

        # whether to do a final residual from initial conv to the final resnet block out

        self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual
        final_conv_dim = self.upsample_combiner.dim_out + (dim if init_conv_to_final_conv_residual else 0)

        # final optional resnet block and convolution out

        self.final_res_block = ResnetBlock(final_conv_dim, dim, time_cond_dim = time_cond_dim, groups = resnet_groups[0], use_gca = True) if final_resnet_block else None

        final_conv_dim_in = dim if final_resnet_block else final_conv_dim
        final_conv_dim_in += (channels if lowres_cond else 0)

        self.final_conv = nn.Conv2d(final_conv_dim_in, self.channels_out, final_conv_kernel_size, padding = final_conv_kernel_size // 2)

        # zero-init the last conv so the network starts as (near) identity in the residual sense
        zero_init_(self.final_conv)
1394
+
1395
+ # if the current settings for the unet are not correct
1396
+ # for cascading DDPM, then reinit the unet with the right settings
1397
    def cast_model_parameters(
        self,
        *,
        lowres_cond,
        text_embed_dim,
        channels,
        channels_out,
        cond_on_text
    ):
        """Return self if the given cascading-DDPM settings already match,
        otherwise construct a fresh unet of the same class with the saved
        constructor kwargs (self._locals) overridden by the given values.
        """
        if lowres_cond == self.lowres_cond and \
            channels == self.channels and \
            cond_on_text == self.cond_on_text and \
            text_embed_dim == self._locals['text_embed_dim'] and \
            channels_out == self.channels_out:
            return self

        updated_kwargs = dict(
            lowres_cond = lowres_cond,
            text_embed_dim = text_embed_dim,
            channels = channels,
            channels_out = channels_out,
            cond_on_text = cond_on_text
        )

        # note: the reinitialized unet does NOT inherit this one's weights
        return self.__class__(**{**self._locals, **updated_kwargs})
1422
+
1423
+ # methods for returning the full unet config as well as its parameter state
1424
+
1425
    def to_config_and_state_dict(self):
        """Return (constructor kwargs captured in __init__, parameter state dict)."""
        return self._locals, self.state_dict()
1427
+
1428
+ # class method for rehydrating the unet from its config and state dict
1429
+
1430
    @classmethod
    def from_config_and_state_dict(klass, config, state_dict):
        """Rebuild a unet from (config, state_dict) as produced by
        `to_config_and_state_dict`, loading the weights into a fresh instance."""
        unet = klass(**config)
        unet.load_state_dict(state_dict)
        return unet
1435
+
1436
+ # methods for persisting unet to disk
1437
+
1438
    def persist_to_file(self, path):
        """Save this unet's constructor config and weights to `path` via
        torch.save, creating the parent directory if needed."""
        path = Path(path)
        path.parents[0].mkdir(exist_ok = True, parents = True)

        config, state_dict = self.to_config_and_state_dict()
        pkg = dict(config = config, state_dict = state_dict)
        torch.save(pkg, str(path))
1445
+
1446
+ # class method for rehydrating the unet from file saved with `persist_to_file`
1447
+
1448
+ @classmethod
1449
+ def hydrate_from_file(klass, path):
1450
+ path = Path(path)
1451
+ assert path.exists()
1452
+ pkg = torch.load(str(path))
1453
+
1454
+ assert 'config' in pkg and 'state_dict' in pkg
1455
+ config, state_dict = pkg['config'], pkg['state_dict']
1456
+
1457
+ return Unet.from_config_and_state_dict(config, state_dict)
1458
+
1459
+ # forward with classifier free guidance
1460
+
1461
    def forward_with_cond_scale(
        self,
        *args,
        cond_scale = 1.,
        **kwargs
    ):
        """Classifier-free guidance: blend conditional and unconditional
        predictions as null + (cond - null) * cond_scale.

        cond_scale == 1 returns the plain conditional forward pass (no extra
        unconditional pass is run).
        """
        logits = self.forward(*args, **kwargs)

        if cond_scale == 1:
            return logits

        # second pass with conditioning fully dropped gives the "null" prediction
        null_logits = self.forward(*args, cond_drop_prob = 1., **kwargs)
        return null_logits + (logits - null_logits) * cond_scale
1474
+
1475
    def forward(
        self,
        x,
        time,
        *,
        lowres_cond_img = None,
        lowres_noise_times = None,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        self_cond = None,
        cond_drop_prob = 0.
    ):
        """Denoising forward pass of the unet.

        Args:
            x: noised input batch; channel-concatenated below with self-conditioning,
               low-res conditioning and conditioning images before the first conv.
            time: diffusion time conditioning, fed through `to_time_hiddens`.
            lowres_cond_img: low-resolution image from the previous cascade stage
                (required iff `self.lowres_cond`).
            lowres_noise_times: augmentation-noise level of `lowres_cond_img`
                (required iff `self.lowres_cond`).
            text_embeds / text_mask: text conditioning tokens and their mask.
            cond_images: extra image conditioning (required iff `self.has_cond_image`).
            self_cond: previous x0 estimate for self-conditioning.
            cond_drop_prob: probability of dropping text conditioning per batch
                element (classifier-free guidance training).

        Returns:
            output of `self.final_conv` (the unet prediction).
        """
        batch_size, device = x.shape[0], x.device

        # condition on self

        if self.self_cond:
            # when no previous x0 estimate exists, condition on zeros
            self_cond = default(self_cond, lambda: torch.zeros_like(x))
            x = torch.cat((x, self_cond), dim = 1)

        # add low resolution conditioning, if present

        assert not (self.lowres_cond and not exists(lowres_cond_img)), 'low resolution conditioning image must be present'
        assert not (self.lowres_cond and not exists(lowres_noise_times)), 'low resolution conditioning noise time must be present'

        if exists(lowres_cond_img):
            x = torch.cat((x, lowres_cond_img), dim = 1)

        # condition on input image

        assert not (self.has_cond_image ^ exists(cond_images)), 'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa'

        if exists(cond_images):
            assert cond_images.shape[1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet'
            cond_images = resize_image_to(cond_images, x.shape[-1])
            x = torch.cat((cond_images, x), dim = 1)

        # initial convolution

        x = self.init_conv(x)

        # init conv residual

        if self.init_conv_to_final_conv_residual:
            # keep a copy for the top-most residual concatenated back in at the end
            init_conv_residual = x.clone()

        # time conditioning

        time_hiddens = self.to_time_hiddens(time)

        # derive time tokens

        time_tokens = self.to_time_tokens(time_hiddens)
        t = self.to_time_cond(time_hiddens)

        # add lowres time conditioning to time hiddens
        # and add lowres time tokens along sequence dimension for attention

        if self.lowres_cond:
            lowres_time_hiddens = self.to_lowres_time_hiddens(lowres_noise_times)
            lowres_time_tokens = self.to_lowres_time_tokens(lowres_time_hiddens)
            lowres_t = self.to_lowres_time_cond(lowres_time_hiddens)

            t = t + lowres_t
            time_tokens = torch.cat((time_tokens, lowres_time_tokens), dim = -2)

        # text conditioning

        text_tokens = None

        if exists(text_embeds) and self.cond_on_text:

            # conditional dropout
            # per-batch-element bernoulli mask: True = keep the text conditioning

            text_keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device = device)

            text_keep_mask_embed = rearrange(text_keep_mask, 'b -> b 1 1')
            text_keep_mask_hidden = rearrange(text_keep_mask, 'b -> b 1')

            # calculate text embeds

            text_tokens = self.text_to_cond(text_embeds)

            # truncate / pad text tokens to exactly `max_text_len`

            text_tokens = text_tokens[:, :self.max_text_len]

            if exists(text_mask):
                text_mask = text_mask[:, :self.max_text_len]

            text_tokens_len = text_tokens.shape[1]
            remainder = self.max_text_len - text_tokens_len

            if remainder > 0:
                text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))

            if exists(text_mask):
                if remainder > 0:
                    text_mask = F.pad(text_mask, (0, remainder), value = False)

                # combine padding mask with the conditional-dropout keep mask
                text_mask = rearrange(text_mask, 'b n -> b n 1')
                text_keep_mask_embed = text_mask & text_keep_mask_embed

            null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working

            # dropped / padded positions are replaced with the learned null embedding

            text_tokens = torch.where(
                text_keep_mask_embed,
                text_tokens,
                null_text_embed
            )

            if exists(self.attn_pool):
                text_tokens = self.attn_pool(text_tokens)

            # extra non-attention conditioning by projecting and then summing text embeddings to time
            # termed as text hiddens

            mean_pooled_text_tokens = text_tokens.mean(dim = -2)

            text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens)

            null_text_hidden = self.null_text_hidden.to(t.dtype)

            text_hiddens = torch.where(
                text_keep_mask_hidden,
                text_hiddens,
                null_text_hidden
            )

            t = t + text_hiddens

        # main conditioning tokens (c)

        c = time_tokens if not exists(text_tokens) else torch.cat((time_tokens, text_tokens), dim = -2)

        # normalize conditioning tokens

        c = self.norm_cond(c)

        # initial resnet block (for memory efficient unet)

        if exists(self.init_resnet_block):
            x = self.init_resnet_block(x, t)

        # go through the layers of the unet, down and up

        hiddens = []

        for pre_downsample, init_block, resnet_blocks, attn_block, post_downsample in self.downs:
            if exists(pre_downsample):
                x = pre_downsample(x)

            x = init_block(x, t, c)

            for resnet_block in resnet_blocks:
                x = resnet_block(x, t)
                hiddens.append(x)

            x = attn_block(x, c)
            hiddens.append(x)

            if exists(post_downsample):
                x = post_downsample(x)

        x = self.mid_block1(x, t, c)

        if exists(self.mid_attn):
            x = self.mid_attn(x)

        x = self.mid_block2(x, t, c)

        # skip connections pop hiddens in reverse (stack order), scaled by `skip_connect_scale`

        add_skip_connection = lambda x: torch.cat((x, hiddens.pop() * self.skip_connect_scale), dim = 1)

        up_hiddens = []

        for init_block, resnet_blocks, attn_block, upsample in self.ups:
            x = add_skip_connection(x)
            x = init_block(x, t, c)

            for resnet_block in resnet_blocks:
                x = add_skip_connection(x)
                x = resnet_block(x, t)

            x = attn_block(x, c)
            up_hiddens.append(x.contiguous())
            x = upsample(x)

        # whether to combine all feature maps from upsample blocks

        x = self.upsample_combiner(x, up_hiddens)

        # final top-most residual if needed

        if self.init_conv_to_final_conv_residual:
            x = torch.cat((x, init_conv_residual), dim = 1)

        if exists(self.final_res_block):
            x = self.final_res_block(x, t)

        if exists(lowres_cond_img):
            x = torch.cat((x, lowres_cond_img), dim = 1)

        return self.final_conv(x)
1677
+
1678
# null unet

class NullUnet(nn.Module):
    """Identity placeholder unet.

    Stands in for a real unet in a cascade slot that is not being trained
    or sampled. Carries a single dummy parameter so that optimizers and
    device-moving code expecting at least one parameter keep working.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.lowres_cond = False
        self.dummy_parameter = nn.Parameter(torch.zeros(1))

    def cast_model_parameters(self, *args, **kwargs):
        # nothing to recast - return self unchanged
        return self

    def forward(self, x, *args, **kwargs):
        # identity: pass the input straight through, ignoring all conditioning
        return x
1691
+
1692
# predefined unets, with configs lining up with hyperparameters in appendix of paper

class BaseUnet64(Unet):
    """Base 64x64 unet preset (hyperparameters from the Imagen paper appendix)."""

    def __init__(self, *args, **kwargs):
        preset = dict(
            dim = 512,
            dim_mults = (1, 2, 3, 4),
            num_resnet_blocks = 3,
            layer_attns = (False, True, True, True),
            layer_cross_attns = (False, True, True, True),
            attn_heads = 8,
            ff_mult = 2.,
            memory_efficient = False
        )
        # explicit caller kwargs take precedence over the preset
        preset.update(kwargs)
        super().__init__(*args, **preset)
1707
+
1708
class SRUnet256(Unet):
    """64 -> 256 super-resolution unet preset (memory-efficient variant)."""

    def __init__(self, *args, **kwargs):
        preset = dict(
            dim = 128,
            dim_mults = (1, 2, 4, 8),
            num_resnet_blocks = (2, 4, 8, 8),
            layer_attns = (False, False, False, True),
            layer_cross_attns = (False, False, False, True),
            attn_heads = 8,
            ff_mult = 2.,
            memory_efficient = True
        )
        # explicit caller kwargs take precedence over the preset
        preset.update(kwargs)
        super().__init__(*args, **preset)
1721
+
1722
class SRUnet1024(Unet):
    """256 -> 1024 super-resolution unet preset (no self-attention at any stage)."""

    def __init__(self, *args, **kwargs):
        preset = dict(
            dim = 128,
            dim_mults = (1, 2, 4, 8),
            num_resnet_blocks = (2, 4, 8, 8),
            layer_attns = False,
            layer_cross_attns = (False, False, False, True),
            attn_heads = 8,
            ff_mult = 2.,
            memory_efficient = True
        )
        # explicit caller kwargs take precedence over the preset
        preset.update(kwargs)
        super().__init__(*args, **preset)
1735
+
1736
+ # main imagen ddpm class, which is a cascading DDPM from Ho et al.
1737
+
1738
+ class Imagen(nn.Module):
1739
    def __init__(
        self,
        unets,
        *,
        image_sizes,                                # for cascading ddpm, image size at each stage
        text_encoder_name = DEFAULT_T5_NAME,
        text_embed_dim = None,
        channels = 3,
        timesteps = 1000,
        sample_timesteps=100,
        cond_drop_prob = 0.1,
        loss_type = 'l2',
        noise_schedules = 'cosine',
        pred_objectives = 'noise',
        random_crop_sizes = None,
        lowres_noise_schedule = 'linear',
        lowres_sample_noise_level = 0.2,            # in the paper, they present a new trick where they noise the lowres conditioning image, and at sample time, fix it to a certain level (0.1 or 0.3) - the unets are also made to be conditioned on this noise level
        per_sample_random_aug_noise_level = False,  # unclear when conditioning on augmentation noise level, whether each batch element receives a random aug noise value - turning off due to @marunine's find
        condition_on_text = True,
        auto_normalize_img = True,                  # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
        p2_loss_weight_gamma = 0.5,                 # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time
        p2_loss_weight_k = 1,
        dynamic_thresholding = True,
        dynamic_thresholding_percentile = 0.95,     # unsure what this was based on perusal of paper
        only_train_unet_number = None
    ):
        """Cascading DDPM (Imagen, Ho et al.).

        Per-unet hyperparameters (`timesteps`, `sample_timesteps`,
        `noise_schedules`, `pred_objectives`, `random_crop_sizes`,
        `dynamic_thresholding`, `p2_loss_weight_gamma`) accept either a
        scalar, broadcast to all unets, or a tuple with one entry per unet.
        The first unet is cast to be unconditioned on a low-res image; all
        subsequent unets are conditioned on the previous stage's output.
        """
        super().__init__()

        # loss

        if loss_type == 'l1':
            loss_fn = F.l1_loss
        elif loss_type == 'l2':
            loss_fn = F.mse_loss
        elif loss_type == 'huber':
            loss_fn = F.smooth_l1_loss
        else:
            raise NotImplementedError()

        self.loss_type = loss_type
        self.loss_fn = loss_fn

        # conditioning hparams

        self.condition_on_text = condition_on_text
        self.unconditional = not condition_on_text

        # channels

        self.channels = channels

        # automatically take care of ensuring that first unet is unconditional
        # while the rest of the unets are conditioned on the low resolution image produced by previous unet

        unets = cast_tuple(unets)
        num_unets = len(unets)

        # determine noise schedules per unet

        timesteps = cast_tuple(timesteps, num_unets)
        sample_timesteps = cast_tuple(sample_timesteps, num_unets)

        # make sure noise schedule defaults to 'cosine', 'cosine', and then 'linear' for rest of super-resoluting unets

        noise_schedules = cast_tuple(noise_schedules)
        noise_schedules = pad_tuple_to_length(noise_schedules, 2, 'cosine')
        noise_schedules = pad_tuple_to_length(noise_schedules, num_unets, 'linear')

        # construct noise schedulers
        # two parallel sets: full-resolution schedulers for training, and
        # (possibly shorter) schedulers used only at sampling time

        noise_scheduler_klass = GaussianDiffusionContinuousTimes
        self.noise_schedulers = nn.ModuleList([])

        for timestep, noise_schedule in zip(timesteps, noise_schedules):
            noise_scheduler = noise_scheduler_klass(noise_schedule = noise_schedule, timesteps = timestep)
            self.noise_schedulers.append(noise_scheduler)

        self.noise_schedulers_sample = nn.ModuleList([])

        for sample_timestep, noise_schedule in zip(sample_timesteps, noise_schedules):
            noise_scheduler_sample = noise_scheduler_klass(noise_schedule=noise_schedule, timesteps=sample_timestep)
            self.noise_schedulers_sample.append(noise_scheduler_sample)

        # randomly cropping for upsampler training

        self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets)
        assert not exists(first(self.random_crop_sizes)), 'you should not need to randomly crop image during training for base unet, only for upsamplers - so pass in `random_crop_sizes = (None, 128, 256)` as example'

        # lowres augmentation noise schedule

        self.lowres_noise_schedule = GaussianDiffusionContinuousTimes(noise_schedule = lowres_noise_schedule)

        # ddpm objectives - predicting noise by default

        self.pred_objectives = cast_tuple(pred_objectives, num_unets)

        # get text encoder

        self.text_encoder_name = text_encoder_name
        self.text_embed_dim = default(text_embed_dim, lambda: get_encoded_dim(text_encoder_name))

        self.encode_text = partial(t5_encode_text, name = text_encoder_name)

        # construct unets

        self.unets = nn.ModuleList([])

        self.unet_being_trained_index = -1 # keeps track of which unet is being trained at the moment
        self.only_train_unet_number = only_train_unet_number

        for ind, one_unet in enumerate(unets):
            assert isinstance(one_unet, (Unet, Unet3D, NullUnet))
            is_first = ind == 0

            # recast each unet to the conditioning configuration this cascade requires
            one_unet = one_unet.cast_model_parameters(
                lowres_cond = not is_first,
                cond_on_text = self.condition_on_text,
                text_embed_dim = self.text_embed_dim if self.condition_on_text else None,
                channels = self.channels,
                channels_out = self.channels
            )

            self.unets.append(one_unet)

        # unet image sizes

        image_sizes = cast_tuple(image_sizes)
        self.image_sizes = image_sizes

        assert num_unets == len(image_sizes), f'you did not supply the correct number of u-nets ({len(unets)}) for resolutions {image_sizes}'

        self.sample_channels = cast_tuple(self.channels, num_unets)

        # determine whether we are training on images or video

        is_video = any([isinstance(unet, Unet3D) for unet in self.unets])
        self.is_video = is_video

        # video tensors carry one extra (frame) dimension, hence the extra '1'
        self.right_pad_dims_to_datatype = partial(rearrange, pattern = ('b -> b 1 1 1' if not is_video else 'b -> b 1 1 1 1'))
        self.resize_to = resize_video_to if is_video else resize_image_to

        # cascading ddpm related stuff

        lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
        assert lowres_conditions == (False, *((True,) * (num_unets - 1))), 'the first unet must be unconditioned (by low resolution image), and the rest of the unets must have `lowres_cond` set to True'

        self.lowres_sample_noise_level = lowres_sample_noise_level
        self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level

        # classifier free guidance

        self.cond_drop_prob = cond_drop_prob
        self.can_classifier_guidance = cond_drop_prob > 0.

        # normalize and unnormalize image functions

        self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
        self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity
        self.input_image_range = (0. if auto_normalize_img else -1., 1.)

        # dynamic thresholding

        self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets)
        self.dynamic_thresholding_percentile = dynamic_thresholding_percentile

        # p2 loss weight

        self.p2_loss_weight_k = p2_loss_weight_k
        self.p2_loss_weight_gamma = cast_tuple(p2_loss_weight_gamma, num_unets)

        assert all([(gamma_value <= 2) for gamma_value in self.p2_loss_weight_gamma]), 'in paper, they noticed any gamma greater than 2 is harmful'

        # one temp parameter for keeping track of device

        self.register_buffer('_temp', torch.tensor([0.]), persistent = False)

        # default to device of unets passed in

        self.to(next(self.unets.parameters()).device)
1918
+
1919
+ def force_unconditional_(self):
1920
+ self.condition_on_text = False
1921
+ self.unconditional = True
1922
+
1923
+ for unet in self.unets:
1924
+ unet.cond_on_text = False
1925
+
1926
    @property
    def device(self):
        # device is tracked via the non-persistent `_temp` buffer registered in
        # __init__, so it follows the module through .to() / .cuda() moves even
        # while the individual unets are being shuttled between devices
        return self._temp.device
1929
+
1930
+ def get_unet(self, unet_number):
1931
+ assert 0 < unet_number <= len(self.unets)
1932
+ index = unet_number - 1
1933
+
1934
+ if isinstance(self.unets, nn.ModuleList):
1935
+ unets_list = [unet for unet in self.unets]
1936
+ delattr(self, 'unets')
1937
+ self.unets = unets_list
1938
+
1939
+ if index != self.unet_being_trained_index:
1940
+ for unet_index, unet in enumerate(self.unets):
1941
+ unet.to(self.device if unet_index == index else 'cpu')
1942
+
1943
+ self.unet_being_trained_index = index
1944
+ return self.unets[index]
1945
+
1946
+ def reset_unets_all_one_device(self, device = None):
1947
+ device = default(device, self.device)
1948
+ self.unets = nn.ModuleList([*self.unets])
1949
+ self.unets.to(device)
1950
+
1951
+ self.unet_being_trained_index = -1
1952
+
1953
+ @contextmanager
1954
+ def one_unet_in_gpu(self, unet_number = None, unet = None):
1955
+ assert exists(unet_number) ^ exists(unet)
1956
+
1957
+ if exists(unet_number):
1958
+ unet = self.unets[unet_number - 1]
1959
+
1960
+ devices = [module_device(unet) for unet in self.unets]
1961
+ self.unets.cpu()
1962
+ unet.to(self.device)
1963
+
1964
+ yield
1965
+
1966
+ for unet, device in zip(self.unets, devices):
1967
+ unet.to(device)
1968
+
1969
+ # overriding state dict functions
1970
+
1971
+ def state_dict(self, *args, **kwargs):
1972
+ self.reset_unets_all_one_device()
1973
+ return super().state_dict(*args, **kwargs)
1974
+
1975
+ def load_state_dict(self, *args, **kwargs):
1976
+ self.reset_unets_all_one_device()
1977
+ return super().load_state_dict(*args, **kwargs)
1978
+
1979
    # gaussian diffusion methods

    def p_mean_variance(
        self,
        unet,
        x,
        t,
        *,
        noise_scheduler,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        lowres_cond_img = None,
        self_cond = None,
        lowres_noise_times = None,
        cond_scale = 1.,
        model_output = None,
        t_next = None,
        pred_objective = 'noise',
        dynamic_threshold = True
    ):
        """Compute the posterior mean/variance of p(x_{t_next} | x_t).

        Runs the unet (with classifier-free guidance when `cond_scale != 1`),
        converts its prediction into an x0 estimate according to
        `pred_objective`, optionally applies dynamic thresholding, and feeds
        the clamped x0 into the scheduler's q_posterior.

        Returns:
            ((mean, variance, log_variance), x_start) — the tuple shape is
            whatever `noise_scheduler.q_posterior` returns, plus the x0 estimate.
        """
        assert not (cond_scale != 1. and not self.can_classifier_guidance), 'imagen was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)'

        # `model_output` lets a caller reuse a prediction already computed elsewhere
        pred = default(model_output, lambda: unet.forward_with_cond_scale(x, noise_scheduler.get_condition(t), text_embeds = text_embeds, text_mask = text_mask, cond_images = cond_images, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, self_cond = self_cond, lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_noise_times)))

        if pred_objective == 'noise':
            x_start = noise_scheduler.predict_start_from_noise(x, t = t, noise = pred)
        elif pred_objective == 'x_start':
            x_start = pred
        else:
            raise ValueError(f'unknown objective {pred_objective}')

        if dynamic_threshold:
            # following pseudocode in appendix
            # s is the dynamic threshold, determined by percentile of absolute values of reconstructed sample per batch element
            s = torch.quantile(
                rearrange(x_start, 'b ... -> b (...)').abs(),
                self.dynamic_thresholding_percentile,
                dim = -1
            )

            # never shrink below the static [-1, 1] range
            s.clamp_(min = 1.)
            s = right_pad_dims_to(x_start, s)
            x_start = x_start.clamp(-s, s) / s
        else:
            x_start.clamp_(-1., 1.)

        mean_and_variance = noise_scheduler.q_posterior(x_start = x_start, x_t = x, t = t, t_next = t_next)
        return mean_and_variance, x_start
2028
+
2029
    @torch.no_grad()
    def p_sample(
        self,
        unet,
        x,
        t,
        *,
        noise_scheduler,
        t_next = None,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        cond_scale = 1.,
        self_cond = None,
        lowres_cond_img = None,
        lowres_noise_times = None,
        pred_objective = 'noise',
        dynamic_threshold = True
    ):
        """Draw one ancestral sampling step: x_t -> x_{t_next}.

        Returns:
            (pred, x_start): the sample for the next timestep and the model's
            current x0 estimate (used by the caller for self-conditioning).
        """
        b, *_, device = *x.shape, x.device
        (model_mean, _, model_log_variance), x_start = self.p_mean_variance(unet, x = x, t = t, t_next = t_next, noise_scheduler = noise_scheduler, text_embeds = text_embeds, text_mask = text_mask, cond_images = cond_images, cond_scale = cond_scale, lowres_cond_img = lowres_cond_img, self_cond = self_cond, lowres_noise_times = lowres_noise_times, pred_objective = pred_objective, dynamic_threshold = dynamic_threshold)
        noise = torch.randn_like(x)
        # no noise when t == 0
        # continuous-time schedulers signal the final step via t_next == 0;
        # discrete ones via t == 0
        is_last_sampling_timestep = (t_next == 0) if isinstance(noise_scheduler, GaussianDiffusionContinuousTimes) else (t == 0)
        nonzero_mask = (1 - is_last_sampling_timestep.float()).reshape(b, *((1,) * (len(x.shape) - 1)))
        pred = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
        return pred, x_start
2056
+
2057
    @torch.no_grad()
    def p_sample_loop(
        self,
        unet,
        shape,
        *,
        noise_scheduler,
        lowres_cond_img = None,
        lowres_noise_times = None,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        inpaint_images = None,
        inpaint_masks = None,
        inpaint_resample_times = 5,
        init_images = None,
        skip_steps = None,
        cond_scale = 1,
        pred_objective = 'noise',
        dynamic_threshold = True,
        use_tqdm = True
    ):
        """Full reverse-diffusion sampling loop for a single unet.

        Starts from gaussian noise of `shape` (optionally offset by
        `init_images`) and iterates `p_sample` over the scheduler's sampling
        timesteps. Supports RePaint-style inpainting: at each step the known
        region is re-noised to the current timestep and pasted in, with
        `inpaint_resample_times` resampling passes per step.

        Returns:
            the final image batch, passed through `self.unnormalize_img`.
        """
        device = self.device

        batch = shape[0]
        img = torch.randn(shape, device = device)

        # for initialization with an image or video

        if exists(init_images):
            img += init_images

        # keep track of x0, for self conditioning

        x_start = None

        # prepare inpainting

        has_inpainting = exists(inpaint_images) and exists(inpaint_masks)
        resample_times = inpaint_resample_times if has_inpainting else 1

        if has_inpainting:
            inpaint_images = self.normalize_img(inpaint_images)
            inpaint_images = self.resize_to(inpaint_images, shape[-1])
            inpaint_masks = self.resize_to(rearrange(inpaint_masks, 'b ... -> b 1 ...').float(), shape[-1]).bool()

        # time

        timesteps = noise_scheduler.get_sampling_timesteps(batch, device = device)

        # whether to skip any steps

        skip_steps = default(skip_steps, 0)
        timesteps = timesteps[skip_steps:]

        for times, times_next in tqdm(timesteps, desc = 'sampling loop time step', total = len(timesteps), disable = not use_tqdm):
            is_last_timestep = times_next == 0

            # inner resampling loop (RePaint); a single pass when not inpainting
            for r in reversed(range(resample_times)):
                is_last_resample_step = r == 0

                if has_inpainting:
                    # paste in the known region, noised to the current timestep
                    noised_inpaint_images, _ = noise_scheduler.q_sample(inpaint_images, t = times)
                    img = img * ~inpaint_masks + noised_inpaint_images * inpaint_masks

                self_cond = x_start if unet.self_cond else None

                img, x_start = self.p_sample(
                    unet,
                    img,
                    times,
                    t_next = times_next,
                    text_embeds = text_embeds,
                    text_mask = text_mask,
                    cond_images = cond_images,
                    cond_scale = cond_scale,
                    self_cond = self_cond,
                    lowres_cond_img = lowres_cond_img,
                    lowres_noise_times = lowres_noise_times,
                    noise_scheduler = noise_scheduler,
                    pred_objective = pred_objective,
                    dynamic_threshold = dynamic_threshold
                )

                if has_inpainting and not (is_last_resample_step or torch.all(is_last_timestep)):
                    # walk the sample back from t_next to t for another resampling pass
                    renoised_img = noise_scheduler.q_sample_from_to(img, times_next, times)

                    img = torch.where(
                        self.right_pad_dims_to_datatype(is_last_timestep),
                        img,
                        renoised_img
                    )

        img.clamp_(-1., 1.)

        # final inpainting

        if has_inpainting:
            img = img * ~inpaint_masks + inpaint_images * inpaint_masks

        unnormalize_img = self.unnormalize_img(img)
        return unnormalize_img
2159
+
2160
    @torch.no_grad()
    @eval_decorator
    def sample(
        self,
        texts: List[str] = None,
        text_masks = None,
        text_embeds = None,
        video_frames = None,
        cond_images = None,
        inpaint_images = None,
        inpaint_masks = None,
        inpaint_resample_times = 5,
        init_images = None,
        skip_steps = None,
        batch_size = 1,
        cond_scale = 1.,
        lowres_sample_noise_level = None,
        start_at_unet_number = 1,
        start_image_or_video = None,
        stop_at_unet_number = None,
        return_all_unet_outputs = False,
        return_pil_images = False,
        device = None,
        use_tqdm = True
    ):
        """Sample images (or video tensors) from the full cascade.

        Text conditioning may be given as raw `texts` (encoded here with the
        configured T5 encoder) or precomputed `text_embeds`/`text_masks`.
        Each stage after the first is conditioned on a noised, resized copy of
        the previous stage's output. `cond_scale`, `init_images` and
        `skip_steps` may be scalars or per-unet tuples.

        Returns:
            the last unet's output (or all outputs when
            `return_all_unet_outputs`), as tensors or PIL images when
            `return_pil_images` is set.
        """
        device = default(device, self.device)
        self.reset_unets_all_one_device(device = device)

        cond_images = maybe(cast_uint8_images_to_float)(cond_images)

        if exists(texts) and not exists(text_embeds) and not self.unconditional:
            assert all([*map(len, texts)]), 'text cannot be empty'

            # text encoding is kept in full precision
            with autocast(enabled = False):
                text_embeds, text_masks = self.encode_text(texts, return_attn_mask = True)

            text_embeds, text_masks = map(lambda t: t.to(device), (text_embeds, text_masks))

        if not self.unconditional:
            assert exists(text_embeds), 'text must be passed in if the network was not trained without text `condition_on_text` must be set to `False` when training'

            text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim = -1))
            batch_size = text_embeds.shape[0]

        if exists(inpaint_images):
            if self.unconditional:
                if batch_size == 1: # assume researcher wants to broadcast along inpainted images
                    batch_size = inpaint_images.shape[0]

            assert inpaint_images.shape[0] == batch_size, 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=<int>)``'
            assert not (self.condition_on_text and inpaint_images.shape[0] != text_embeds.shape[0]), 'number of inpainting images must be equal to the number of text to be conditioned on'

        assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into imagen if specified'
        assert not (not self.condition_on_text and exists(text_embeds)), 'imagen specified not to be conditioned on text, yet it is presented'
        assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'

        assert not (exists(inpaint_images) ^ exists(inpaint_masks)), 'inpaint images and masks must be both passed in to do inpainting'

        outputs = []

        is_cuda = next(self.parameters()).is_cuda
        device = next(self.parameters()).device

        lowres_sample_noise_level = default(lowres_sample_noise_level, self.lowres_sample_noise_level)

        num_unets = len(self.unets)

        # condition scaling

        cond_scale = cast_tuple(cond_scale, num_unets)

        # add frame dimension for video

        assert not (self.is_video and not exists(video_frames)), 'video_frames must be passed in on sample time if training on video'

        frame_dims = (video_frames,) if self.is_video else tuple()

        # for initial image and skipping steps

        init_images = cast_tuple(init_images, num_unets)
        init_images = [maybe(self.normalize_img)(init_image) for init_image in init_images]

        skip_steps = cast_tuple(skip_steps, num_unets)

        # handle starting at a unet greater than 1, for training only-upscaler training

        if start_at_unet_number > 1:
            assert start_at_unet_number <= num_unets, 'must start a unet that is less than the total number of unets'
            assert not exists(stop_at_unet_number) or start_at_unet_number <= stop_at_unet_number
            assert exists(start_image_or_video), 'starting image or video must be supplied if only doing upscaling'

            prev_image_size = self.image_sizes[start_at_unet_number - 2]
            img = self.resize_to(start_image_or_video, prev_image_size)

        # go through each unet in cascade

        for unet_number, unet, channel, image_size, noise_scheduler, pred_objective, dynamic_threshold, unet_cond_scale, unet_init_images, unet_skip_steps in tqdm(zip(range(1, num_unets + 1), self.unets, self.sample_channels, self.image_sizes, self.noise_schedulers_sample, self.pred_objectives, self.dynamic_thresholding, cond_scale, init_images, skip_steps), disable = not use_tqdm):

            if unet_number < start_at_unet_number:
                continue

            assert not isinstance(unet, NullUnet), 'one cannot sample from null / placeholder unets'

            # on cuda, keep only the active unet on the gpu
            context = self.one_unet_in_gpu(unet = unet) if is_cuda else nullcontext()

            with context:
                lowres_cond_img = lowres_noise_times = None
                shape = (batch_size, channel, *frame_dims, *image_size)

                if unet.lowres_cond:
                    # noise the previous stage's output to the fixed sampling noise level
                    lowres_noise_times = self.lowres_noise_schedule.get_times(batch_size, lowres_sample_noise_level, device = device)

                    lowres_cond_img = self.resize_to(img, image_size)

                    lowres_cond_img = self.normalize_img(lowres_cond_img)
                    lowres_cond_img, _ = self.lowres_noise_schedule.q_sample(x_start = lowres_cond_img, t = lowres_noise_times, noise = torch.randn_like(lowres_cond_img))

                if exists(unet_init_images):
                    unet_init_images = self.resize_to(unet_init_images, image_size)

                shape = (batch_size, self.channels, *frame_dims, *image_size)

                img = self.p_sample_loop(
                    unet,
                    shape,
                    text_embeds = text_embeds,
                    text_mask = text_masks,
                    cond_images = cond_images,
                    inpaint_images = inpaint_images,
                    inpaint_masks = inpaint_masks,
                    inpaint_resample_times = inpaint_resample_times,
                    init_images = unet_init_images,
                    skip_steps = unet_skip_steps,
                    cond_scale = unet_cond_scale,
                    lowres_cond_img = lowres_cond_img,
                    lowres_noise_times = lowres_noise_times,
                    noise_scheduler = noise_scheduler,
                    pred_objective = pred_objective,
                    dynamic_threshold = dynamic_threshold,
                    use_tqdm = use_tqdm
                )

                outputs.append(img)

            if exists(stop_at_unet_number) and stop_at_unet_number == unet_number:
                break

        output_index = -1 if not return_all_unet_outputs else slice(None) # either return last unet output or all unet outputs

        if not return_pil_images:
            return outputs[output_index]

        if not return_all_unet_outputs:
            outputs = outputs[-1:]

        assert not self.is_video, 'converting sampled video tensor to video file is not supported yet'

        pil_images = list(map(lambda img: list(map(T.ToPILImage(), img.unbind(dim = 0))), outputs))

        return pil_images[output_index] # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png)
2320
+
2321
    def p_losses(
        self,
        unet: Union[Unet, Unet3D, NullUnet, DistributedDataParallel],
        x_start,
        times,
        *,
        noise_scheduler,
        lowres_cond_img = None,
        lowres_aug_times = None,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        noise = None,
        times_next = None,
        pred_objective = 'noise',
        p2_loss_weight_gamma = 0.,
        random_crop_size = None
    ):
        """Single diffusion training step for one unet: noise `x_start` at `times`,
        predict with `unet`, and return the (optionally p2-reweighted) mean loss.

        `pred_objective` selects the regression target: 'noise' (epsilon) or
        'x_start'. `times_next` is accepted but unused in this implementation.
        """
        is_video = x_start.ndim == 5

        noise = default(noise, lambda: torch.randn_like(x_start))

        # normalize to [-1, 1]

        x_start = self.normalize_img(x_start)
        lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)

        # random cropping during training
        # for upsamplers

        if exists(random_crop_size):
            if is_video:
                # fold frames into the batch so the 2d crop applies per-frame
                frames = x_start.shape[2]
                x_start, lowres_cond_img, noise = rearrange_many((x_start, lowres_cond_img, noise), 'b c f h w -> (b f) c h w')

            aug = K.RandomCrop(random_crop_size, p = 1.)

            # make sure low res conditioner and image both get augmented the same way
            # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop
            # NOTE(review): this path assumes lowres_cond_img is not None when
            # random_crop_size is set (upsampler-only feature) — confirm callers
            x_start = aug(x_start)
            lowres_cond_img = aug(lowres_cond_img, params = aug._params)
            noise = aug(noise, params = aug._params)

            if is_video:
                x_start, lowres_cond_img, noise = rearrange_many((x_start, lowres_cond_img, noise), '(b f) c h w -> b c f h w', f = frames)

        # get x_t

        x_noisy, log_snr = noise_scheduler.q_sample(x_start = x_start, t = times, noise = noise)

        # also noise the lowres conditioning image
        # at sample time, they then fix the noise level of 0.1 - 0.3

        lowres_cond_img_noisy = None
        if exists(lowres_cond_img):
            lowres_aug_times = default(lowres_aug_times, times)
            lowres_cond_img_noisy, _ = self.lowres_noise_schedule.q_sample(x_start = lowres_cond_img, t = lowres_aug_times, noise = torch.randn_like(lowres_cond_img))

        # time condition

        noise_cond = noise_scheduler.get_condition(times)

        # unet kwargs

        unet_kwargs = dict(
            text_embeds = text_embeds,
            text_mask = text_mask,
            cond_images = cond_images,
            lowres_noise_times = self.lowres_noise_schedule.get_condition(lowres_aug_times),
            lowres_cond_img = lowres_cond_img_noisy,
            cond_drop_prob = self.cond_drop_prob,
        )

        # self condition if needed

        # Because 'unet' can be an instance of DistributedDataParallel coming from the
        # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to
        # access the member 'module' of the wrapped unet instance.
        self_cond = unet.module.self_cond if isinstance(unet, DistributedDataParallel) else unet.self_cond

        # with 50% probability, condition the unet on its own (detached) x_start estimate
        if self_cond and random() < 0.5:
            with torch.no_grad():
                pred = unet.forward(
                    x_noisy,
                    noise_cond,
                    **unet_kwargs
                ).detach()

            x_start = noise_scheduler.predict_start_from_noise(x_noisy, t = times, noise = pred) if pred_objective == 'noise' else pred

            unet_kwargs = {**unet_kwargs, 'self_cond': x_start}

        # get prediction

        pred = unet.forward(
            x_noisy,
            noise_cond,
            **unet_kwargs
        )

        # prediction objective

        if pred_objective == 'noise':
            target = noise
        elif pred_objective == 'x_start':
            target = x_start
        else:
            raise ValueError(f'unknown objective {pred_objective}')

        # losses

        losses = self.loss_fn(pred, target, reduction = 'none')
        losses = reduce(losses, 'b ... -> b', 'mean')

        # p2 loss reweighting

        if p2_loss_weight_gamma > 0:
            loss_weight = (self.p2_loss_weight_k + log_snr.exp()) ** -p2_loss_weight_gamma
            losses = losses * loss_weight

        return losses.mean()
2442
+
2443
+ def forward(
2444
+ self,
2445
+ images,
2446
+ unet: Union[Unet, Unet3D, NullUnet, DistributedDataParallel] = None,
2447
+ texts: List[str] = None,
2448
+ text_embeds = None,
2449
+ text_masks = None,
2450
+ unet_number = None,
2451
+ cond_images = None
2452
+ ):
2453
+ # assert images.shape[-1] == images.shape[-2], f'the images you pass in must be a square, but received dimensions of {images.shape[2]}, {images.shape[-1]}'
2454
+ assert not (len(self.unets) > 1 and not exists(unet_number)), f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)'
2455
+ unet_number = default(unet_number, 1)
2456
+ assert not exists(self.only_train_unet_number) or self.only_train_unet_number == unet_number, 'you can only train on unet #{self.only_train_unet_number}'
2457
+
2458
+ images = cast_uint8_images_to_float(images)
2459
+ cond_images = maybe(cast_uint8_images_to_float)(cond_images)
2460
+
2461
+ assert is_float_dtype(images.dtype), f'images tensor needs to be floats but {images.dtype} dtype found instead'
2462
+
2463
+ unet_index = unet_number - 1
2464
+
2465
+ unet = default(unet, lambda: self.get_unet(unet_number))
2466
+
2467
+ assert not isinstance(unet, NullUnet), 'null unet cannot and should not be trained'
2468
+
2469
+ noise_scheduler = self.noise_schedulers[unet_index]
2470
+ p2_loss_weight_gamma = self.p2_loss_weight_gamma[unet_index]
2471
+ pred_objective = self.pred_objectives[unet_index]
2472
+ target_image_size = self.image_sizes[unet_index]
2473
+ random_crop_size = self.random_crop_sizes[unet_index]
2474
+ prev_image_size = self.image_sizes[unet_index - 1] if unet_index > 0 else None
2475
+
2476
+ b, c, *_, h, w, device, is_video = *images.shape, images.device, images.ndim == 5
2477
+
2478
+ check_shape(images, 'b c ...', c = self.channels)
2479
+ assert h >= target_image_size[0] and w >= target_image_size[1]
2480
+
2481
+ frames = images.shape[2] if is_video else None
2482
+
2483
+ times = noise_scheduler.sample_random_times(b, device = device)
2484
+
2485
+ if exists(texts) and not exists(text_embeds) and not self.unconditional:
2486
+ assert all([*map(len, texts)]), 'text cannot be empty'
2487
+ assert len(texts) == len(images), 'number of text captions does not match up with the number of images given'
2488
+
2489
+ with autocast(enabled = False):
2490
+ text_embeds, text_masks = self.encode_text(texts, return_attn_mask = True)
2491
+
2492
+ text_embeds, text_masks = map(lambda t: t.to(images.device), (text_embeds, text_masks))
2493
+
2494
+ if not self.unconditional:
2495
+ text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim = -1))
2496
+
2497
+ assert not (self.condition_on_text and not exists(text_embeds)), 'text or text encodings must be passed into decoder if specified'
2498
+ assert not (not self.condition_on_text and exists(text_embeds)), 'decoder specified not to be conditioned on text, yet it is presented'
2499
+
2500
+ assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'
2501
+
2502
+ lowres_cond_img = lowres_aug_times = None
2503
+ if exists(prev_image_size):
2504
+ lowres_cond_img = self.resize_to(images, prev_image_size, clamp_range = self.input_image_range)
2505
+ lowres_cond_img = self.resize_to(lowres_cond_img, target_image_size, clamp_range = self.input_image_range)
2506
+
2507
+ if self.per_sample_random_aug_noise_level:
2508
+ lowres_aug_times = self.lowres_noise_schedule.sample_random_times(b, device = device)
2509
+ else:
2510
+ lowres_aug_time = self.lowres_noise_schedule.sample_random_times(1, device = device)
2511
+ lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b = b)
2512
+
2513
+ images = self.resize_to(images, target_image_size)
2514
+
2515
+ return self.p_losses(unet, images, times, text_embeds = text_embeds, text_mask = text_masks, cond_images = cond_images, noise_scheduler = noise_scheduler, lowres_cond_img = lowres_cond_img, lowres_aug_times = lowres_aug_times, pred_objective = pred_objective, p2_loss_weight_gamma = p2_loss_weight_gamma, random_crop_size = random_crop_size)
imagen_pytorch/imagen_video/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from imagen_pytorch.imagen_video.imagen_video import Unet3D
imagen_pytorch/imagen_video/imagen_video.py ADDED
@@ -0,0 +1,1662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import copy
3
+ from typing import List
4
+ from tqdm.auto import tqdm
5
+ from functools import partial, wraps
6
+ from contextlib import contextmanager, nullcontext
7
+ from collections import namedtuple
8
+ from pathlib import Path
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch import nn, einsum
13
+
14
+ from einops import rearrange, repeat, reduce
15
+ from einops.layers.torch import Rearrange, Reduce
16
+ from einops_exts import rearrange_many, repeat_many, check_shape
17
+ from einops_exts.torch import EinopsToAndFrom
18
+
19
+ from imagen_pytorch.t5 import t5_encode_text, get_encoded_dim, DEFAULT_T5_NAME
20
+
21
+ # helper functions
22
+
23
def exists(val):
    """True when *val* is not the None sentinel."""
    return val is not None
25
+
26
def identity(t, *args, **kwargs):
    """Return *t* unchanged, ignoring any extra positional/keyword arguments."""
    return t
28
+
29
def first(arr, d = None):
    """First element of *arr*, or the fallback *d* when *arr* is empty."""
    return arr[0] if len(arr) else d
33
+
34
def maybe(fn):
    """Wrap *fn* so that a None argument passes straight through untouched."""
    @wraps(fn)
    def inner(x):
        return fn(x) if x is not None else x
    return inner
41
+
42
def once(fn):
    """Wrap *fn* so only the first call executes; later calls return None."""
    called = False

    @wraps(fn)
    def inner(x):
        nonlocal called
        if not called:
            called = True
            return fn(x)

    return inner
52
+
53
# print() that fires only on the very first call — used for one-time warnings
print_once = once(print)
54
+
55
def default(val, d):
    """Return *val* unless it is None, in which case return *d*
    (calling it first when *d* is callable, for lazy defaults)."""
    if val is not None:
        return val
    return d() if callable(d) else d
59
+
60
def cast_tuple(val, length = None):
    """Coerce *val* into a tuple.

    Lists become tuples; scalars are replicated `length` times (once when
    length is None). When *length* is given, the result length is asserted.
    """
    if isinstance(val, list):
        val = tuple(val)

    if not isinstance(val, tuple):
        val = (val,) * (1 if length is None else length)

    if length is not None:
        assert len(val) == length

    return val
70
+
71
def cast_uint8_images_to_float(images):
    """Map uint8 images in [0, 255] to floats in [0, 1]; float inputs pass through unchanged."""
    if images.dtype == torch.uint8:
        return images / 255
    return images
75
+
76
def module_device(module):
    """Device of the module's first parameter (assumes the module has parameters)."""
    return next(iter(module.parameters())).device
78
+
79
def zero_init_(m):
    """In-place zero the weight (and bias, when present) of module *m*."""
    nn.init.zeros_(m.weight)
    if m.bias is not None:
        nn.init.zeros_(m.bias)
83
+
84
def eval_decorator(fn):
    """Run *fn* with the model switched to eval mode, restoring the previous
    training flag afterwards."""
    def inner(model, *args, **kwargs):
        prev_mode = model.training
        model.eval()
        result = fn(model, *args, **kwargs)
        model.train(prev_mode)
        return result
    return inner
92
+
93
def pad_tuple_to_length(t, length, fillvalue = None):
    """Right-pad tuple *t* with *fillvalue* up to *length*; no-op when already long enough."""
    if len(t) >= length:
        return t
    return t + (fillvalue,) * (length - len(t))
98
+
99
+ # helper classes
100
+
101
class Identity(nn.Module):
    """No-op module: returns its input unchanged, swallowing any extra
    constructor or forward arguments."""

    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x
107
+
108
+ # tensor helpers
109
+
110
def log(t, eps: float = 1e-12):
    """Numerically safe natural log: clamps *t* below at *eps* first."""
    return t.clamp(min = eps).log()
112
+
113
def l2norm(t):
    """L2-normalize *t* along its final dimension."""
    return F.normalize(t, dim = -1)
115
+
116
def right_pad_dims_to(x, t):
    """Append singleton dims to *t* until it has as many dims as *x*
    (for broadcasting); returns *t* untouched when it is already wide enough."""
    extra = x.ndim - t.ndim
    if extra <= 0:
        return t
    return t.view(*t.shape, *((1,) * extra))
121
+
122
def masked_mean(t, *, dim, mask = None):
    """Mean of (b, n, d) tensor *t* over *dim*; with a (b, n) bool *mask*,
    masked-out positions are excluded from both numerator and denominator."""
    if mask is None:
        return t.mean(dim = dim)

    denom = mask.sum(dim = dim, keepdim = True)
    zeroed = t.masked_fill(~mask.unsqueeze(-1), 0.)

    # clamp guards against a fully-masked row dividing by zero
    return zeroed.sum(dim = dim) / denom.clamp(min = 1e-5)
131
+
132
def resize_video_to(
    video,
    target_image_size,
    clamp_range = None
):
    """Nearest-neighbour spatial resize of a (b, c, f, h, w) video.

    Frames are folded into the batch so interpolation is purely spatial.
    NOTE(review): the early-exit compares only the width (shape[-1]) against
    target_image_size — assumes square frames / scalar target; confirm callers.
    """
    orig_video_size = video.shape[-1]

    if orig_video_size == target_image_size:
        return video


    frames = video.shape[2]
    video = rearrange(video, 'b c f h w -> (b f) c h w')

    out = F.interpolate(video, target_image_size, mode = 'nearest')

    if exists(clamp_range):
        out = out.clamp(*clamp_range)

    out = rearrange(out, '(b f) c h w -> b c f h w', f = frames)

    return out
154
+
155
+ # classifier free guidance functions
156
+
157
def prob_mask_like(shape, prob, device):
    """Boolean tensor of *shape* whose elements are independently True with
    probability *prob* (fast paths for the deterministic 0 / 1 cases)."""
    if prob == 1:
        return torch.ones(shape, device = device, dtype = torch.bool)
    if prob == 0:
        return torch.zeros(shape, device = device, dtype = torch.bool)
    return torch.zeros(shape, device = device).float().uniform_(0, 1) < prob
164
+
165
+ # norms and residuals
166
+
167
class LayerNorm(nn.Module):
    """Bias-free layer norm over the last dimension with a learned gain.

    When `stable` is set, the input is first divided by its (detached)
    per-row max for extra numerical stability.
    """

    def __init__(self, dim, stable = False):
        super().__init__()
        self.stable = stable
        self.g = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        if self.stable:
            x = x / x.amax(dim = -1, keepdim = True).detach()

        # looser epsilon for half precision
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim = -1, unbiased = False, keepdim = True)
        return (x - mean) * torch.rsqrt(var + eps) * self.g
181
+
182
class ChanLayerNorm(nn.Module):
    """Layer norm over the channel axis (dim 1) of (b, c, f, h, w) feature maps,
    with a learned per-channel gain and optional max-stabilized variant."""

    def __init__(self, dim, stable = False):
        super().__init__()
        self.stable = stable
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1, 1))

    def forward(self, x):
        if self.stable:
            x = x / x.amax(dim = 1, keepdim = True).detach()

        # looser epsilon for half precision
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        mean = x.mean(dim = 1, keepdim = True)
        var = x.var(dim = 1, unbiased = False, keepdim = True)
        return (x - mean) * torch.rsqrt(var + eps) * self.g
196
+
197
class Always():
    """Callable that ignores its arguments and always returns the stored value."""

    def __init__(self, val):
        self.val = val

    def __call__(self, *args, **kwargs):
        return self.val
203
+
204
class Residual(nn.Module):
    """Skip connection wrapper: forward(x) = fn(x, **kwargs) + x."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        out = self.fn(x, **kwargs)
        return out + x
211
+
212
class Parallel(nn.Module):
    """Apply every branch module to the same input and sum the results."""

    def __init__(self, *fns):
        super().__init__()
        self.fns = nn.ModuleList(fns)

    def forward(self, x):
        return sum(fn(x) for fn in self.fns)
220
+
221
+ # attention pooling
222
+
223
class PerceiverAttention(nn.Module):
    """One cross-attention step of the Perceiver resampler: learned latents
    attend over the input sequence concatenated with themselves.

    Args:
        dim: feature dim of both sequence and latents.
        dim_head / heads: attention head geometry.
        cosine_sim_attn: l2-normalize q/k and use a fixed scale of 16 instead
            of the usual 1/sqrt(dim_head).
    """
    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1
        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = nn.LayerNorm(dim)
        self.norm_latents = nn.LayerNorm(dim)

        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            nn.LayerNorm(dim)
        )

    def forward(self, x, latents, mask = None):
        # x: (b, n, dim) sequence; latents: (b, m, dim); returns (b, m, dim)
        x = self.norm(x)
        latents = self.norm_latents(latents)

        b, h = x.shape[0], self.heads

        q = self.to_q(latents)

        # the paper differs from Perceiver in which they also concat the key / values derived from the latents to be attended to
        kv_input = torch.cat((x, latents), dim = -2)
        k, v = self.to_kv(kv_input).chunk(2, dim = -1)

        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = h)

        q = q * self.scale

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # similarities and masking

        sim = einsum('... i d, ... j d -> ... i j', q, k) * self.cosine_sim_scale

        if exists(mask):
            max_neg_value = -torch.finfo(sim.dtype).max
            # latent positions are always attendable, so extend the mask with True
            mask = F.pad(mask, (0, latents.shape[-2]), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # attention

        attn = sim.softmax(dim = -1)

        out = einsum('... i j, ... j d -> ... i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h = h)
        return self.to_out(out)
289
+
290
class PerceiverResampler(nn.Module):
    """Compress a variable-length embedding sequence into a fixed number of
    latent tokens via stacked PerceiverAttention + feed-forward blocks.

    NOTE(review): depends on `FeedForward`, defined elsewhere in this file.
    """
    def __init__(
        self,
        *,
        dim,
        depth,
        dim_head = 64,
        heads = 8,
        num_latents = 64,
        num_latents_mean_pooled = 4, # number of latents derived from mean pooled representation of the sequence
        max_seq_len = 512,
        ff_mult = 4,
        cosine_sim_attn = False
    ):
        super().__init__()
        # learned absolute positions for the input sequence (capped at max_seq_len)
        self.pos_emb = nn.Embedding(max_seq_len, dim)

        self.latents = nn.Parameter(torch.randn(num_latents, dim))

        self.to_latents_from_mean_pooled_seq = None

        if num_latents_mean_pooled > 0:
            self.to_latents_from_mean_pooled_seq = nn.Sequential(
                LayerNorm(dim),
                nn.Linear(dim, dim * num_latents_mean_pooled),
                Rearrange('b (n d) -> b n d', n = num_latents_mean_pooled)
            )

        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PerceiverAttention(dim = dim, dim_head = dim_head, heads = heads, cosine_sim_attn = cosine_sim_attn),
                FeedForward(dim = dim, mult = ff_mult)
            ]))

    def forward(self, x, mask = None):
        # x: (b, n, dim) -> (b, num_latents (+ mean-pooled extras), dim)
        n, device = x.shape[1], x.device
        pos_emb = self.pos_emb(torch.arange(n, device = device))

        x_with_pos = x + pos_emb

        latents = repeat(self.latents, 'n d -> b n d', b = x.shape[0])

        if exists(self.to_latents_from_mean_pooled_seq):
            # NOTE(review): an all-True mask is passed here, so padding tokens are
            # included in the mean pool — confirm this is intentional
            meanpooled_seq = masked_mean(x, dim = 1, mask = torch.ones(x.shape[:2], device = x.device, dtype = torch.bool))
            meanpooled_latents = self.to_latents_from_mean_pooled_seq(meanpooled_seq)
            latents = torch.cat((meanpooled_latents, latents), dim = -2)

        for attn, ff in self.layers:
            # residual attention + feed-forward
            latents = attn(x_with_pos, latents, mask = mask) + latents
            latents = ff(latents) + latents

        return latents
343
+
344
+ # attention
345
+
346
class Attention(nn.Module):
    """Self attention with per-head queries and single-headed key/values shared
    across heads (the k/v projection outputs only `dim_head * 2`, and the
    similarity einsum `b h i d, b j d` carries no head axis for k).

    A learned null key/value is always prepended, and optional context tokens
    (e.g. text) can be projected in as extra key/values.
    """
    def __init__(
        self,
        dim,
        *,
        dim_head = 64,
        heads = 8,
        causal = False,
        context_dim = None,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1.
        self.causal = causal

        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        self.norm = LayerNorm(dim)

        # learned per-head bias used for the null position when attn_bias is given
        self.null_attn_bias = nn.Parameter(torch.randn(heads))

        # learned null key / value, always attendable
        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        # single-headed k/v (shared across query heads)
        self.to_kv = nn.Linear(dim, dim_head * 2, bias = False)

        self.to_context = nn.Sequential(nn.LayerNorm(context_dim), nn.Linear(context_dim, dim_head * 2)) if exists(context_dim) else None

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            LayerNorm(dim)
        )

    def forward(self, x, context = None, mask = None, attn_bias = None):
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)
        q, k, v = (self.to_q(x), *self.to_kv(x).chunk(2, dim = -1))

        q = rearrange(q, 'b n (h d) -> b h n d', h = self.heads)
        q = q * self.scale

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b 1 d', b = b)
        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        # add text conditioning, if present

        if exists(context):
            assert exists(self.to_context)
            ck, cv = self.to_context(context).chunk(2, dim = -1)
            k = torch.cat((ck, k), dim = -2)
            v = torch.cat((cv, v), dim = -2)

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # calculate query / key similarities

        sim = einsum('b h i d, b j d -> b h i j', q, k) * self.cosine_sim_scale

        # relative positional encoding (T5 style)

        if exists(attn_bias):
            # give the null position its learned bias, then add to similarities
            null_attn_bias = repeat(self.null_attn_bias, 'h -> h n 1', n = n)
            attn_bias = torch.cat((null_attn_bias, attn_bias), dim = -1)
            sim = sim + attn_bias

        # masking

        max_neg_value = -torch.finfo(sim.dtype).max

        if self.causal:
            i, j = sim.shape[-2:]
            causal_mask = torch.ones((i, j), device = device, dtype = torch.bool).triu(j - i + 1)
            sim = sim.masked_fill(causal_mask, max_neg_value)

        if exists(mask):
            # pad with True for the prepended null kv position
            # NOTE(review): the mask is not padded for context tokens — confirm
            # masks are only used when context is None
            mask = F.pad(mask, (1, 0), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # attention

        attn = sim.softmax(dim = -1)

        # aggregate values

        out = einsum('b h i j, b j d -> b h i d', attn, v)

        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
445
+
446
+ # pseudo conv2d that uses conv3d but with kernel size of 1 across frames dimension
447
+
448
def Conv2d(dim_in, dim_out, kernel, stride = 1, padding = 0, **kwargs):
    """Pseudo-2d convolution for video: an nn.Conv3d whose kernel/stride are 1
    and padding is 0 along the frames dimension, so only height/width mix."""
    def to_pair(v):
        # accept scalar, list, or 2-tuple; always yield a validated 2-tuple
        if isinstance(v, list):
            v = tuple(v)
        if not isinstance(v, tuple):
            v = (v, v)
        assert len(v) == 2
        return v

    kernel, stride, padding = map(to_pair, (kernel, stride, padding))

    # prepend the no-op temporal components
    kernel = (1, *kernel)
    stride = (1, *stride)
    padding = (0, *padding)

    return nn.Conv3d(dim_in, dim_out, kernel, stride = stride, padding = padding, **kwargs)
463
+
464
class Pad(nn.Module):
    """Module form of F.pad with a fixed padding spec and fill value."""

    def __init__(self, padding, value = 0.):
        super().__init__()
        self.padding = padding
        self.value = value

    def forward(self, x):
        return F.pad(x, self.padding, value = self.value)
472
+
473
+ # decoder
474
+
475
def Upsample(dim, dim_out = None):
    """Nearest-neighbour 2x upsample followed by a 3x3 pseudo-2d conv.

    NOTE(review): nn.Upsample(scale_factor = 2) on 5-d input scales the frame
    axis as well as height/width — confirm temporal doubling is intended.
    """
    dim_out = default(dim_out, dim)

    return nn.Sequential(
        nn.Upsample(scale_factor = 2, mode = 'nearest'),
        Conv2d(dim, dim_out, 3, padding = 1)
    )
482
+
483
class PixelShuffleUpsample(nn.Module):
    """2x spatial upsample: 1x1 pseudo-2d conv -> SiLU -> PixelShuffle per frame.

    The conv is initialized so all four shuffled sub-pixels start identical —
    said to reduce checkerboard artifacts in pixel-shuffle upsampling.
    """
    def __init__(self, dim, dim_out = None):
        super().__init__()
        dim_out = default(dim_out, dim)
        conv = Conv2d(dim, dim_out * 4, 1)

        self.net = nn.Sequential(
            conv,
            nn.SiLU()
        )

        self.pixel_shuffle = nn.PixelShuffle(2)

        self.init_conv_(conv)

    def init_conv_(self, conv):
        # kaiming-init a quarter of the output channels, then tile 4x so each
        # shuffled sub-pixel shares the same starting weights
        o, i, f, h, w = conv.weight.shape
        conv_weight = torch.empty(o // 4, i, f, h, w)
        nn.init.kaiming_uniform_(conv_weight)
        conv_weight = repeat(conv_weight, 'o ... -> (o 4) ...')

        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def forward(self, x):
        out = self.net(x)
        # PixelShuffle is 2d-only, so fold frames into the batch first
        frames = x.shape[2]
        out = rearrange(out, 'b c f h w -> (b f) c h w')
        out = self.pixel_shuffle(out)
        return rearrange(out, '(b f) c h w -> b c f h w', f = frames)
513
+
514
def Downsample(dim, dim_out = None):
    """Strided pseudo-2d conv (kernel 4, stride 2, pad 1) halving height and
    width; the frame axis is untouched (see Conv2d above)."""
    dim_out = default(dim_out, dim)
    return Conv2d(dim, dim_out, 4, 2, 1)
517
+
518
class SinusoidalPosEmb(nn.Module):
    """Classic transformer sinusoidal embedding of a 1-d tensor of positions:
    maps (b,) -> (b, dim) as [sin(x * freqs), cos(x * freqs)]."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        # geometric frequency ladder from 1 down to 1/10000
        decay = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(torch.arange(half_dim, device = x.device) * -decay)
        angles = x[:, None] * freqs[None, :]
        return torch.cat((angles.sin(), angles.cos()), dim = -1)
529
+
530
class LearnedSinusoidalPosEmb(nn.Module):
    """Sinusoidal embedding with learned frequencies; the raw input is also
    concatenated, so a (b,) input maps to (b, dim + 1)."""

    def __init__(self, dim):
        super().__init__()
        assert (dim % 2) == 0
        self.weights = nn.Parameter(torch.randn(dim // 2))

    def forward(self, x):
        x = x[:, None]
        angles = x * self.weights[None, :] * 2 * math.pi
        # layout: [x, sin(angles), cos(angles)]
        return torch.cat((x, angles.sin(), angles.cos()), dim = -1)
543
+
544
class Block(nn.Module):
    """Basic conv unit: GroupNorm -> optional FiLM (scale, shift) -> SiLU ->
    3x3 pseudo-2d conv."""
    def __init__(
        self,
        dim,
        dim_out,
        groups = 8,
        norm = True
    ):
        super().__init__()
        self.groupnorm = nn.GroupNorm(groups, dim) if norm else Identity()
        self.activation = nn.SiLU()
        self.project = Conv2d(dim, dim_out, 3, padding = 1)

    def forward(self, x, scale_shift = None):
        x = self.groupnorm(x)

        if exists(scale_shift):
            # FiLM conditioning; +1 keeps the no-op at zero scale
            scale, shift = scale_shift
            x = x * (scale + 1) + shift

        x = self.activation(x)
        return self.project(x)
566
+
567
class ResnetBlock(nn.Module):
    """Residual block: two `Block`s with optional FiLM time conditioning
    (applied in the second block), optional cross attention over `cond`
    tokens, and optional global context gating.

    NOTE(review): `squeeze_excite` is accepted but never used in this body.
    """
    def __init__(
        self,
        dim,
        dim_out,
        *,
        cond_dim = None,
        time_cond_dim = None,
        groups = 8,
        linear_attn = False,
        use_gca = False,
        squeeze_excite = False,
        **attn_kwargs
    ):
        super().__init__()

        self.time_mlp = None

        if exists(time_cond_dim):
            # project time embedding to a (scale, shift) pair for block2
            self.time_mlp = nn.Sequential(
                nn.SiLU(),
                nn.Linear(time_cond_dim, dim_out * 2)
            )

        self.cross_attn = None

        if exists(cond_dim):
            attn_klass = CrossAttention if not linear_attn else LinearCrossAttention

            # queries are the flattened (frame, height, width) positions
            self.cross_attn = EinopsToAndFrom(
                'b c f h w',
                'b (f h w) c',
                attn_klass(
                    dim = dim_out,
                    context_dim = cond_dim,
                    **attn_kwargs
                )
            )

        self.block1 = Block(dim, dim_out, groups = groups)
        self.block2 = Block(dim_out, dim_out, groups = groups)

        self.gca = GlobalContext(dim_in = dim_out, dim_out = dim_out) if use_gca else Always(1)

        self.res_conv = Conv2d(dim, dim_out, 1) if dim != dim_out else Identity()


    def forward(self, x, time_emb = None, cond = None):
        # x: (b, c, f, h, w)

        scale_shift = None
        if exists(self.time_mlp) and exists(time_emb):
            time_emb = self.time_mlp(time_emb)
            time_emb = rearrange(time_emb, 'b c -> b c 1 1 1')
            scale_shift = time_emb.chunk(2, dim = 1)

        h = self.block1(x)

        if exists(self.cross_attn):
            assert exists(cond)
            h = self.cross_attn(h, context = cond) + h

        # time conditioning is only injected in the second block
        h = self.block2(h, scale_shift = scale_shift)

        h = h * self.gca(h)

        return h + self.res_conv(x)
633
+
634
class CrossAttention(nn.Module):
    """Standard multi-head cross attention from x to context tokens, with a
    learned null key/value prepended so fully-masked rows remain well-defined."""
    def __init__(
        self,
        dim,
        *,
        context_dim = None,
        dim_head = 64,
        heads = 8,
        norm_context = False,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.scale = dim_head ** -0.5 if not cosine_sim_attn else 1.
        self.cosine_sim_attn = cosine_sim_attn
        self.cosine_sim_scale = 16 if cosine_sim_attn else 1

        self.heads = heads
        inner_dim = dim_head * heads

        context_dim = default(context_dim, dim)

        self.norm = LayerNorm(dim)
        self.norm_context = LayerNorm(context_dim) if norm_context else Identity()

        self.null_kv = nn.Parameter(torch.randn(2, dim_head))
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim, bias = False),
            LayerNorm(dim)
        )

    def forward(self, x, context, mask = None):
        # x: (b, n, dim); context: (b, m, context_dim); mask: (b, m) bool
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)
        context = self.norm_context(context)

        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))

        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> b h n d', h = self.heads)

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> b h 1 d', h = self.heads, b = b)

        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        q = q * self.scale

        # cosine sim attention

        if self.cosine_sim_attn:
            q, k = map(l2norm, (q, k))

        # similarities

        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.cosine_sim_scale

        # masking

        max_neg_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            # pad with True for the prepended null kv position
            mask = F.pad(mask, (1, 0), value = True)
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim = sim.masked_fill(~mask, max_neg_value)

        # softmax computed in fp32 for stability under mixed precision
        attn = sim.softmax(dim = -1, dtype = torch.float32)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
709
+
710
class LinearCrossAttention(CrossAttention):
    """Linear-complexity variant of CrossAttention: softmaxes q over features
    and k over sequence positions instead of forming pairwise similarities.
    Reuses the parent's parameters; heads are folded into the batch axis."""
    def forward(self, x, context, mask = None):
        b, n, device = *x.shape[:2], x.device

        x = self.norm(x)
        context = self.norm_context(context)

        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))

        q, k, v = rearrange_many((q, k, v), 'b n (h d) -> (b h) n d', h = self.heads)

        # add null key / value for classifier free guidance in prior net

        nk, nv = repeat_many(self.null_kv.unbind(dim = -2), 'd -> (b h) 1 d', h = self.heads, b = b)

        k = torch.cat((nk, k), dim = -2)
        v = torch.cat((nv, v), dim = -2)

        # masking

        max_neg_value = -torch.finfo(x.dtype).max

        if exists(mask):
            mask = F.pad(mask, (1, 0), value = True)
            # NOTE(review): mask is reshaped to (b, j, 1) while k/v are
            # ((b h), j, d); this broadcast only works when heads == 1 or
            # batch == 1 — confirm how masks reach this path
            mask = rearrange(mask, 'b n -> b n 1')
            k = k.masked_fill(~mask, max_neg_value)
            v = v.masked_fill(~mask, 0.)

        # linear attention

        q = q.softmax(dim = -1)
        k = k.softmax(dim = -2)

        q = q * self.scale

        context = einsum('b n d, b n e -> b d e', k, v)
        out = einsum('b n d, b d e -> b n e', q, context)
        out = rearrange(out, '(b h) n d -> b n (h d)', h = self.heads)
        return self.to_out(out)
749
+
750
class LinearAttention(nn.Module):
    """ Linear (kernelized) self-attention over 2d feature maps, with optional
    extra context tokens concatenated into the keys/values.

    Input/output are channel-first feature maps (b, c, x, y). """

    def __init__(
        self,
        dim,
        dim_head = 32,
        heads = 8,
        dropout = 0.05,
        context_dim = None,
        **kwargs
    ):
        super().__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        inner_dim = dim_head * heads
        self.norm = ChanLayerNorm(dim)

        self.nonlin = nn.SiLU()

        # q / k / v projections: pointwise conv followed by a depthwise 3x3 conv
        # (the depthwise conv injects local spatial context into each projection)
        self.to_q = nn.Sequential(
            nn.Dropout(dropout),
            Conv2d(dim, inner_dim, 1, bias = False),
            Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        self.to_k = nn.Sequential(
            nn.Dropout(dropout),
            Conv2d(dim, inner_dim, 1, bias = False),
            Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        self.to_v = nn.Sequential(
            nn.Dropout(dropout),
            Conv2d(dim, inner_dim, 1, bias = False),
            Conv2d(inner_dim, inner_dim, 3, bias = False, padding = 1, groups = inner_dim)
        )

        # optional projection of context tokens to extra key/value pairs
        self.to_context = nn.Sequential(nn.LayerNorm(context_dim), nn.Linear(context_dim, inner_dim * 2, bias = False)) if exists(context_dim) else None

        self.to_out = nn.Sequential(
            Conv2d(inner_dim, dim, 1, bias = False),
            ChanLayerNorm(dim)
        )

    def forward(self, fmap, context = None):
        h, x, y = self.heads, *fmap.shape[-2:]

        fmap = self.norm(fmap)
        q, k, v = map(lambda fn: fn(fmap), (self.to_q, self.to_k, self.to_v))
        # flatten spatial dims and fold heads into batch
        q, k, v = rearrange_many((q, k, v), 'b (h c) x y -> (b h) (x y) c', h = h)

        if exists(context):
            assert exists(self.to_context)
            ck, cv = self.to_context(context).chunk(2, dim = -1)
            ck, cv = rearrange_many((ck, cv), 'b n (h d) -> (b h) n d', h = h)
            # append context-derived keys/values after the spatial ones
            k = torch.cat((k, ck), dim = -2)
            v = torch.cat((v, cv), dim = -2)

        # linear attention: softmax q over features, k over sequence
        q = q.softmax(dim = -1)
        k = k.softmax(dim = -2)

        q = q * self.scale

        context = einsum('b n d, b n e -> b d e', k, v)
        out = einsum('b n d, b d e -> b n e', q, context)
        out = rearrange(out, '(b h) (x y) d -> b (h d) x y', h = h, x = x, y = y)

        out = self.nonlin(out)
        return self.to_out(out)
818
+
819
class GlobalContext(nn.Module):
    """ basically a superior form of squeeze-excitation that is attention-esque """

    def __init__(
        self,
        *,
        dim_in,
        dim_out
    ):
        super().__init__()
        # single-channel key map used as spatial attention weights
        self.to_k = Conv2d(dim_in, 1, 1)
        hidden_dim = max(3, dim_out // 2)

        # bottleneck MLP (as 1x1 convs) producing per-channel gates in (0, 1)
        self.net = nn.Sequential(
            Conv2d(dim_in, hidden_dim, 1),
            nn.SiLU(),
            Conv2d(hidden_dim, dim_out, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        context = self.to_k(x)
        # flatten all spatial (or spatio-temporal) dims
        x, context = rearrange_many((x, context), 'b n ... -> b n (...)')
        # attention-weighted global pooling of x by the softmaxed key map
        out = einsum('b i n, b c n -> b c i', context.softmax(dim = -1), x)
        # restore two trailing singleton spatial dims for the 1x1 convs
        out = rearrange(out, '... -> ... 1 1')
        return self.net(out)
845
+
846
def FeedForward(dim, mult = 2):
    """ Pre-norm two-layer MLP over token embeddings: LayerNorm -> Linear (expand
    by `mult`) -> GELU -> LayerNorm -> Linear (project back to `dim`). """
    inner_dim = int(dim * mult)
    layers = (
        LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias = False),
        nn.GELU(),
        LayerNorm(inner_dim),
        nn.Linear(inner_dim, dim, bias = False),
    )
    return nn.Sequential(*layers)
855
+
856
def ChanFeedForward(dim, mult = 2):
    """ Channel-wise feedforward for 2d feature maps, built from 1x1 convs.
    In the paper, self-attention layers appear to use feedforwards with twice
    the channel width (mult = 2). """
    inner_dim = int(dim * mult)
    layers = (
        ChanLayerNorm(dim),
        Conv2d(dim, inner_dim, 1, bias = False),
        nn.GELU(),
        ChanLayerNorm(inner_dim),
        Conv2d(inner_dim, dim, 1, bias = False),
    )
    return nn.Sequential(*layers)
865
+
866
class TransformerBlock(nn.Module):
    """ Stack of full self-attention + channelwise feedforward layers, each with
    a residual connection. Attention is applied over all spatio-temporal
    positions of a video feature map (b, c, f, h, w) via EinopsToAndFrom. """

    def __init__(
        self,
        dim,
        *,
        depth = 1,
        heads = 8,
        dim_head = 32,
        ff_mult = 2,
        context_dim = None,
        cosine_sim_attn = False
    ):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                # flatten (f h w) into one sequence for attention, then restore shape
                EinopsToAndFrom('b c f h w', 'b (f h w) c', Attention(dim = dim, heads = heads, dim_head = dim_head, context_dim = context_dim, cosine_sim_attn = cosine_sim_attn)),
                ChanFeedForward(dim = dim, mult = ff_mult)
            ]))

    def forward(self, x, context = None):
        # context (e.g. text tokens) is passed into each attention layer
        for attn, ff in self.layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x
892
+
893
class LinearAttentionTransformerBlock(nn.Module):
    """ Same residual attention + feedforward stack as TransformerBlock, but with
    LinearAttention (linear complexity in the number of spatial positions)
    instead of full attention. Operates on channel-first feature maps. """

    def __init__(
        self,
        dim,
        *,
        depth = 1,
        heads = 8,
        dim_head = 32,
        ff_mult = 2,
        context_dim = None,
        **kwargs
    ):
        super().__init__()
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                LinearAttention(dim = dim, heads = heads, dim_head = dim_head, context_dim = context_dim),
                ChanFeedForward(dim = dim, mult = ff_mult)
            ]))

    def forward(self, x, context = None):
        for attn, ff in self.layers:
            x = attn(x, context = context) + x
            x = ff(x) + x
        return x
919
+
920
class CrossEmbedLayer(nn.Module):
    """ Multi-scale embedding layer: several parallel convolutions with different
    kernel sizes (all sharing one stride), whose outputs are concatenated along
    the channel dimension. Channel budget is halved at each successive scale,
    with the remainder assigned to the largest kernel. """

    def __init__(
        self,
        dim_in,
        kernel_sizes,
        dim_out = None,
        stride = 2
    ):
        super().__init__()
        # each kernel must share the stride's parity so the padding keeps outputs aligned
        assert all((kernel % 2) == (stride % 2) for kernel in kernel_sizes)
        dim_out = default(dim_out, dim_in)

        kernel_sizes = sorted(kernel_sizes)
        num_scales = len(kernel_sizes)

        # channel dimension per scale: dim_out/2, dim_out/4, ... and the remainder last
        dim_scales = [int(dim_out / (2 ** i)) for i in range(1, num_scales)]
        dim_scales.append(dim_out - sum(dim_scales))

        self.convs = nn.ModuleList([
            Conv2d(dim_in, dim_scale, kernel, stride = stride, padding = (kernel - stride) // 2)
            for kernel, dim_scale in zip(kernel_sizes, dim_scales)
        ])

    def forward(self, x):
        # concat conv outputs channel-wise, in ascending kernel-size order
        return torch.cat([conv(x) for conv in self.convs], dim = 1)
946
+
947
class UpsampleCombiner(nn.Module):
    """ Optionally combines feature maps gathered from all upsampling stages with
    the final feature map (resized to match, projected through a Block, then
    concatenated along channels). When disabled, acts as identity on x.

    `self.dim_out` exposes the resulting channel count so downstream layers
    can be sized accordingly. """

    def __init__(
        self,
        dim,
        *,
        enabled = False,
        dim_ins = tuple(),
        dim_outs = tuple()
    ):
        super().__init__()
        dim_outs = cast_tuple(dim_outs, len(dim_ins))
        assert len(dim_ins) == len(dim_outs)

        self.enabled = enabled

        if not self.enabled:
            # disabled: output channels are unchanged, no convs created
            self.dim_out = dim
            return

        self.fmap_convs = nn.ModuleList([Block(dim_in, dim_out) for dim_in, dim_out in zip(dim_ins, dim_outs)])
        self.dim_out = dim + (sum(dim_outs) if len(dim_outs) > 0 else 0)

    def forward(self, x, fmaps = None):
        target_size = x.shape[-1]

        fmaps = default(fmaps, tuple())

        if not self.enabled or len(fmaps) == 0 or len(self.fmap_convs) == 0:
            return x

        # resize every collected feature map to x's spatial size before projecting
        fmaps = [resize_video_to(fmap, target_size) for fmap in fmaps]
        outs = [conv(fmap) for fmap, conv in zip(fmaps, self.fmap_convs)]
        return torch.cat((x, *outs), dim = 1)
980
+
981
class DynamicPositionBias(nn.Module):
    """ MLP-generated relative position bias for attention: maps each relative
    offset (a scalar in [-(n-1), n-1]) through a small MLP to per-head bias
    values, returning a (heads, n, n) bias matrix. """

    def __init__(
        self,
        dim,
        *,
        heads,
        depth
    ):
        super().__init__()
        self.mlp = nn.ModuleList([])

        # input layer: scalar relative position -> dim
        self.mlp.append(nn.Sequential(
            nn.Linear(1, dim),
            LayerNorm(dim),
            nn.SiLU()
        ))

        # hidden layers
        for _ in range(max(depth - 1, 0)):
            self.mlp.append(nn.Sequential(
                nn.Linear(dim, dim),
                LayerNorm(dim),
                nn.SiLU()
            ))

        # output layer: one bias value per attention head
        self.mlp.append(nn.Linear(dim, heads))

    def forward(self, n, device, dtype):
        i = torch.arange(n, device = device)
        j = torch.arange(n, device = device)

        # pairwise relative offsets, shifted into [0, 2n-2] to index the MLP outputs
        indices = rearrange(i, 'i -> i 1') - rearrange(j, 'j -> 1 j')
        indices += (n - 1)

        # all 2n-1 possible relative positions, fed through the MLP once
        pos = torch.arange(-n + 1, n, device = device, dtype = dtype)
        pos = rearrange(pos, '... -> ... 1')

        for layer in self.mlp:
            pos = layer(pos)

        # gather per-pair biases and move heads to the leading dim
        bias = pos[indices]
        bias = rearrange(bias, 'i j h -> h i j')
        return bias
1023
+
1024
class Unet3D(nn.Module):
    """ 3D (video) U-Net denoiser for Imagen-style cascaded diffusion.

    Operates on video tensors of shape (batch, channels, frames, height, width).
    Spatial processing mirrors the 2d Unet (resnet blocks, optional attention /
    linear attention transformer blocks, cross attention on text tokens), with
    temporal attention + temporal position-encoding convs interleaved at every
    resolution so information flows across frames.

    Conditioning inputs:
    - `time` (noise level), embedded via learned sinusoidal embeddings
    - optional text embeddings (classifier-free-guidance ready, with learned
      null embeddings for dropped conditioning)
    - optional low-resolution image + its augmentation noise times (cascading)
    - optional conditioning images concatenated to the input

    Fix vs. previous revision: `hydrate_from_file` previously rehydrated
    through the 2d `Unet` class; it now constructs `klass` (i.e. Unet3D or a
    subclass) so checkpoints restore with the correct architecture.
    """

    def __init__(
        self,
        *,
        dim,
        image_embed_dim = 1024,
        text_embed_dim = get_encoded_dim(DEFAULT_T5_NAME),
        num_resnet_blocks = 1,
        cond_dim = None,
        num_image_tokens = 4,
        num_time_tokens = 2,
        learned_sinu_pos_emb_dim = 16,
        out_dim = None,
        dim_mults = (1, 2, 4, 8),
        cond_images_channels = 0,
        channels = 3,
        channels_out = None,
        attn_dim_head = 64,
        attn_heads = 8,
        ff_mult = 2.,
        lowres_cond = False, # for cascading diffusion - https://cascaded-diffusion.github.io/
        layer_attns = False,
        layer_attns_depth = 1,
        layer_attns_add_text_cond = True, # whether to condition the self-attention blocks with the text embeddings, as described in Appendix D.3.1
        attend_at_middle = True, # whether to have a layer of attention at the bottleneck (can turn off for higher resolution in cascading DDPM, before bringing in efficient attention)
        time_rel_pos_bias_depth = 2,
        time_causal_attn = True,
        layer_cross_attns = True,
        use_linear_attn = False,
        use_linear_cross_attn = False,
        cond_on_text = True,
        max_text_len = 256,
        init_dim = None,
        resnet_groups = 8,
        init_conv_kernel_size = 7, # kernel size of initial conv, if not using cross embed
        init_cross_embed = True,
        init_cross_embed_kernel_sizes = (3, 7, 15),
        cross_embed_downsample = False,
        cross_embed_downsample_kernel_sizes = (2, 4),
        attn_pool_text = True,
        attn_pool_num_latents = 32,
        dropout = 0.,
        memory_efficient = False,
        init_conv_to_final_conv_residual = False,
        use_global_context_attn = True,
        scale_skip_connection = True,
        final_resnet_block = True,
        final_conv_kernel_size = 3,
        cosine_sim_attn = False,
        self_cond = False,
        combine_upsample_fmaps = False, # combine feature maps from all upsample blocks, used in unet squared successfully
        pixel_shuffle_upsample = True # may address checkboard artifacts
    ):
        super().__init__()

        # guide researchers

        assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8'

        if dim < 128:
            print_once('The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/')

        # save locals to take care of some hyperparameters for cascading DDPM

        self._locals = locals()
        self._locals.pop('self', None)
        self._locals.pop('__class__', None)

        self.self_cond = self_cond

        # determine dimensions

        self.channels = channels
        self.channels_out = default(channels_out, channels)

        # (1) in cascading diffusion, one concats the low resolution image, blurred, for conditioning the higher resolution synthesis
        # (2) in self conditioning, one appends the predict x0 (x_start)
        init_channels = channels * (1 + int(lowres_cond) + int(self_cond))
        init_dim = default(init_dim, dim)

        # optional image conditioning

        self.has_cond_image = cond_images_channels > 0
        self.cond_images_channels = cond_images_channels

        init_channels += cond_images_channels

        # initial convolution

        self.init_conv = CrossEmbedLayer(init_channels, dim_out = init_dim, kernel_sizes = init_cross_embed_kernel_sizes, stride = 1) if init_cross_embed else Conv2d(init_channels, init_dim, init_conv_kernel_size, padding = init_conv_kernel_size // 2)

        dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
        in_out = list(zip(dims[:-1], dims[1:]))

        # time conditioning

        cond_dim = default(cond_dim, dim)
        time_cond_dim = dim * 4 * (2 if lowres_cond else 1)

        # embedding time for log(snr) noise from continuous version

        sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim)
        sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1

        self.to_time_hiddens = nn.Sequential(
            sinu_pos_emb,
            nn.Linear(sinu_pos_emb_input_dim, time_cond_dim),
            nn.SiLU()
        )

        self.to_time_cond = nn.Sequential(
            nn.Linear(time_cond_dim, time_cond_dim)
        )

        # project to time tokens as well as time hiddens

        self.to_time_tokens = nn.Sequential(
            nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
            Rearrange('b (r d) -> b r d', r = num_time_tokens)
        )

        # low res aug noise conditioning

        self.lowres_cond = lowres_cond

        if lowres_cond:
            self.to_lowres_time_hiddens = nn.Sequential(
                LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim),
                nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim),
                nn.SiLU()
            )

            self.to_lowres_time_cond = nn.Sequential(
                nn.Linear(time_cond_dim, time_cond_dim)
            )

            self.to_lowres_time_tokens = nn.Sequential(
                nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
                Rearrange('b (r d) -> b r d', r = num_time_tokens)
            )

        # normalizations

        self.norm_cond = nn.LayerNorm(cond_dim)

        # text encoding conditioning (optional)

        self.text_to_cond = None

        if cond_on_text:
            assert exists(text_embed_dim), 'text_embed_dim must be given to the unet if cond_on_text is True'
            self.text_to_cond = nn.Linear(text_embed_dim, cond_dim)

        # finer control over whether to condition on text encodings

        self.cond_on_text = cond_on_text

        # attention pooling

        self.attn_pool = PerceiverResampler(dim = cond_dim, depth = 2, dim_head = attn_dim_head, heads = attn_heads, num_latents = attn_pool_num_latents, cosine_sim_attn = cosine_sim_attn) if attn_pool_text else None

        # for classifier free guidance

        self.max_text_len = max_text_len

        self.null_text_embed = nn.Parameter(torch.randn(1, max_text_len, cond_dim))
        self.null_text_hidden = nn.Parameter(torch.randn(1, time_cond_dim))

        # for non-attention based text conditioning at all points in the network where time is also conditioned

        self.to_text_non_attn_cond = None

        if cond_on_text:
            self.to_text_non_attn_cond = nn.Sequential(
                nn.LayerNorm(cond_dim),
                nn.Linear(cond_dim, time_cond_dim),
                nn.SiLU(),
                nn.Linear(time_cond_dim, time_cond_dim)
            )

        # attention related params

        attn_kwargs = dict(heads = attn_heads, dim_head = attn_dim_head, cosine_sim_attn = cosine_sim_attn)

        num_layers = len(in_out)

        # temporal attention - attention across video frames

        temporal_peg_padding = (0, 0, 0, 0, 2, 0) if time_causal_attn else (0, 0, 0, 0, 1, 1)
        temporal_peg = lambda dim: Residual(nn.Sequential(Pad(temporal_peg_padding), nn.Conv3d(dim, dim, (3, 1, 1), groups = dim)))

        temporal_attn = lambda dim: EinopsToAndFrom('b c f h w', '(b h w) f c', Residual(Attention(dim, **{**attn_kwargs, 'causal': time_causal_attn})))

        # temporal attention relative positional encoding

        self.time_rel_pos_bias = DynamicPositionBias(dim = dim * 2, heads = attn_heads, depth = time_rel_pos_bias_depth)

        # resnet block klass

        num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers)
        resnet_groups = cast_tuple(resnet_groups, num_layers)

        resnet_klass = partial(ResnetBlock, **attn_kwargs)

        layer_attns = cast_tuple(layer_attns, num_layers)
        layer_attns_depth = cast_tuple(layer_attns_depth, num_layers)
        layer_cross_attns = cast_tuple(layer_cross_attns, num_layers)

        assert all([layers == num_layers for layers in list(map(len, (resnet_groups, layer_attns, layer_cross_attns)))])

        # downsample klass

        downsample_klass = Downsample

        if cross_embed_downsample:
            downsample_klass = partial(CrossEmbedLayer, kernel_sizes = cross_embed_downsample_kernel_sizes)

        # initial resnet block (for memory efficient unet)

        self.init_resnet_block = resnet_klass(init_dim, init_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[0], use_gca = use_global_context_attn) if memory_efficient else None

        self.init_temporal_peg = temporal_peg(init_dim)
        self.init_temporal_attn = temporal_attn(init_dim)

        # scale for resnet skip connections

        self.skip_connect_scale = 1. if not scale_skip_connection else (2 ** -0.5)

        # layers

        self.downs = nn.ModuleList([])
        self.ups = nn.ModuleList([])
        num_resolutions = len(in_out)

        layer_params = [num_resnet_blocks, resnet_groups, layer_attns, layer_attns_depth, layer_cross_attns]
        reversed_layer_params = list(map(reversed, layer_params))

        # downsampling layers

        skip_connect_dims = [] # keep track of skip connection dimensions

        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn) in enumerate(zip(in_out, *layer_params)):
            is_last = ind >= (num_resolutions - 1)

            layer_use_linear_cross_attn = not layer_cross_attn and use_linear_cross_attn
            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None

            transformer_block_klass = TransformerBlock if layer_attn else (LinearAttentionTransformerBlock if use_linear_attn else Identity)

            current_dim = dim_in

            # whether to pre-downsample, from memory efficient unet

            pre_downsample = None

            if memory_efficient:
                pre_downsample = downsample_klass(dim_in, dim_out)
                current_dim = dim_out

            skip_connect_dims.append(current_dim)

            # whether to do post-downsample, for non-memory efficient unet

            post_downsample = None
            if not memory_efficient:
                post_downsample = downsample_klass(current_dim, dim_out) if not is_last else Parallel(Conv2d(dim_in, dim_out, 3, padding = 1), Conv2d(dim_in, dim_out, 1))

            self.downs.append(nn.ModuleList([
                pre_downsample,
                resnet_klass(current_dim, current_dim, cond_dim = layer_cond_dim, linear_attn = layer_use_linear_cross_attn, time_cond_dim = time_cond_dim, groups = groups),
                nn.ModuleList([ResnetBlock(current_dim, current_dim, time_cond_dim = time_cond_dim, groups = groups, use_gca = use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
                transformer_block_klass(dim = current_dim, depth = layer_attn_depth, ff_mult = ff_mult, context_dim = cond_dim, **attn_kwargs),
                temporal_peg(current_dim),
                temporal_attn(current_dim),
                post_downsample
            ]))

        # middle layers

        mid_dim = dims[-1]

        self.mid_block1 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])
        self.mid_attn = EinopsToAndFrom('b c f h w', 'b (f h w) c', Residual(Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
        self.mid_temporal_peg = temporal_peg(mid_dim)
        self.mid_temporal_attn = temporal_attn(mid_dim)
        self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim = cond_dim, time_cond_dim = time_cond_dim, groups = resnet_groups[-1])

        # upsample klass

        upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample

        # upsampling layers

        upsample_fmap_dims = []

        for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn) in enumerate(zip(reversed(in_out), *reversed_layer_params)):
            is_last = ind == (len(in_out) - 1)
            layer_use_linear_cross_attn = not layer_cross_attn and use_linear_cross_attn
            layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None
            transformer_block_klass = TransformerBlock if layer_attn else (LinearAttentionTransformerBlock if use_linear_attn else Identity)

            skip_connect_dim = skip_connect_dims.pop()

            upsample_fmap_dims.append(dim_out)

            self.ups.append(nn.ModuleList([
                resnet_klass(dim_out + skip_connect_dim, dim_out, cond_dim = layer_cond_dim, linear_attn = layer_use_linear_cross_attn, time_cond_dim = time_cond_dim, groups = groups),
                nn.ModuleList([ResnetBlock(dim_out + skip_connect_dim, dim_out, time_cond_dim = time_cond_dim, groups = groups, use_gca = use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
                transformer_block_klass(dim = dim_out, depth = layer_attn_depth, ff_mult = ff_mult, context_dim = cond_dim, **attn_kwargs),
                temporal_peg(dim_out),
                temporal_attn(dim_out),
                upsample_klass(dim_out, dim_in) if not is_last or memory_efficient else Identity()
            ]))

        # whether to combine feature maps from all upsample blocks before final resnet block out

        self.upsample_combiner = UpsampleCombiner(
            dim = dim,
            enabled = combine_upsample_fmaps,
            dim_ins = upsample_fmap_dims,
            dim_outs = dim
        )

        # whether to do a final residual from initial conv to the final resnet block out

        self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual
        final_conv_dim = self.upsample_combiner.dim_out + (dim if init_conv_to_final_conv_residual else 0)

        # final optional resnet block and convolution out

        self.final_res_block = ResnetBlock(final_conv_dim, dim, time_cond_dim = time_cond_dim, groups = resnet_groups[0], use_gca = True) if final_resnet_block else None

        final_conv_dim_in = dim if final_resnet_block else final_conv_dim
        final_conv_dim_in += (channels if lowres_cond else 0)

        self.final_conv = Conv2d(final_conv_dim_in, self.channels_out, final_conv_kernel_size, padding = final_conv_kernel_size // 2)

        # zero-init the output conv so the untrained unet starts as identity-ish noise predictor
        zero_init_(self.final_conv)

    # if the current settings for the unet are not correct
    # for cascading DDPM, then reinit the unet with the right settings
    def cast_model_parameters(
        self,
        *,
        lowres_cond,
        text_embed_dim,
        channels,
        channels_out,
        cond_on_text
    ):
        """ Return self if the given cascade settings already match, otherwise
        construct a fresh unet of the same class with the updated settings. """
        if lowres_cond == self.lowres_cond and \
            channels == self.channels and \
            cond_on_text == self.cond_on_text and \
            text_embed_dim == self._locals['text_embed_dim'] and \
            channels_out == self.channels_out:
            return self

        updated_kwargs = dict(
            lowres_cond = lowres_cond,
            text_embed_dim = text_embed_dim,
            channels = channels,
            channels_out = channels_out,
            cond_on_text = cond_on_text
        )

        return self.__class__(**{**self._locals, **updated_kwargs})

    # methods for returning the full unet config as well as its parameter state

    def to_config_and_state_dict(self):
        """ Return (init kwargs, state_dict) — everything needed to rebuild this unet. """
        return self._locals, self.state_dict()

    # class method for rehydrating the unet from its config and state dict

    @classmethod
    def from_config_and_state_dict(klass, config, state_dict):
        """ Rebuild a unet from the output of `to_config_and_state_dict`. """
        unet = klass(**config)
        unet.load_state_dict(state_dict)
        return unet

    # methods for persisting unet to disk

    def persist_to_file(self, path):
        """ Save config + weights to `path` (parent directory created if needed). """
        path = Path(path)
        path.parents[0].mkdir(exist_ok = True, parents = True)

        config, state_dict = self.to_config_and_state_dict()
        pkg = dict(config = config, state_dict = state_dict)
        torch.save(pkg, str(path))

    # class method for rehydrating the unet from file saved with `persist_to_file`

    @classmethod
    def hydrate_from_file(klass, path):
        """ Load a unet previously saved with `persist_to_file`. """
        path = Path(path)
        assert path.exists()
        pkg = torch.load(str(path))

        assert 'config' in pkg and 'state_dict' in pkg
        config, state_dict = pkg['config'], pkg['state_dict']

        # bugfix: rehydrate through this class (previously went through the 2d `Unet`,
        # which would construct the wrong architecture for a Unet3D checkpoint)
        return klass.from_config_and_state_dict(config, state_dict)

    # forward with classifier free guidance

    def forward_with_cond_scale(
        self,
        *args,
        cond_scale = 1.,
        **kwargs
    ):
        """ Classifier-free guidance: logits = null + (cond - null) * cond_scale.
        With cond_scale == 1 this is a plain conditional forward pass. """
        logits = self.forward(*args, **kwargs)

        if cond_scale == 1:
            return logits

        null_logits = self.forward(*args, cond_drop_prob = 1., **kwargs)
        return null_logits + (logits - null_logits) * cond_scale

    def forward(
        self,
        x,
        time,
        *,
        lowres_cond_img = None,
        lowres_noise_times = None,
        text_embeds = None,
        text_mask = None,
        cond_images = None,
        self_cond = None,
        cond_drop_prob = 0.
    ):
        """ Denoise `x` (batch, channels, frames, height, width) at noise level `time`.

        Optional conditioning: low-res video + its noise times (cascading),
        text embeddings with mask (classifier-free guidance via cond_drop_prob),
        conditioning images, and self-conditioning input. Returns the predicted
        output with `self.channels_out` channels. """
        assert x.ndim == 5, 'input to 3d unet must have 5 dimensions (batch, channels, time, height, width)'

        batch_size, frames, device, dtype = x.shape[0], x.shape[2], x.device, x.dtype

        # add self conditioning if needed

        if self.self_cond:
            self_cond = default(self_cond, lambda: torch.zeros_like(x))
            x = torch.cat((x, self_cond), dim = 1)

        # add low resolution conditioning, if present

        assert not (self.lowres_cond and not exists(lowres_cond_img)), 'low resolution conditioning image must be present'
        assert not (self.lowres_cond and not exists(lowres_noise_times)), 'low resolution conditioning noise time must be present'

        if exists(lowres_cond_img):
            x = torch.cat((x, lowres_cond_img), dim = 1)

        # condition on input image

        assert not (self.has_cond_image ^ exists(cond_images)), 'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa'

        if exists(cond_images):
            assert cond_images.shape[1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet'
            cond_images = resize_video_to(cond_images, x.shape[-1])
            x = torch.cat((cond_images, x), dim = 1)

        # get time relative positions

        time_attn_bias = self.time_rel_pos_bias(frames, device = device, dtype = dtype)

        # initial convolution

        x = self.init_conv(x)

        x = self.init_temporal_peg(x)
        x = self.init_temporal_attn(x, attn_bias = time_attn_bias)

        # init conv residual

        if self.init_conv_to_final_conv_residual:
            init_conv_residual = x.clone()

        # time conditioning

        time_hiddens = self.to_time_hiddens(time)

        # derive time tokens

        time_tokens = self.to_time_tokens(time_hiddens)
        t = self.to_time_cond(time_hiddens)

        # add lowres time conditioning to time hiddens
        # and add lowres time tokens along sequence dimension for attention

        if self.lowres_cond:
            lowres_time_hiddens = self.to_lowres_time_hiddens(lowres_noise_times)
            lowres_time_tokens = self.to_lowres_time_tokens(lowres_time_hiddens)
            lowres_t = self.to_lowres_time_cond(lowres_time_hiddens)

            t = t + lowres_t
            time_tokens = torch.cat((time_tokens, lowres_time_tokens), dim = -2)

        # text conditioning

        text_tokens = None

        if exists(text_embeds) and self.cond_on_text:

            # conditional dropout

            text_keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device = device)

            text_keep_mask_embed = rearrange(text_keep_mask, 'b -> b 1 1')
            text_keep_mask_hidden = rearrange(text_keep_mask, 'b -> b 1')

            # calculate text embeds

            text_tokens = self.text_to_cond(text_embeds)

            # truncate / pad text tokens to the fixed max_text_len
            text_tokens = text_tokens[:, :self.max_text_len]

            if exists(text_mask):
                text_mask = text_mask[:, :self.max_text_len]

            text_tokens_len = text_tokens.shape[1]
            remainder = self.max_text_len - text_tokens_len

            if remainder > 0:
                text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))

            if exists(text_mask):
                if remainder > 0:
                    text_mask = F.pad(text_mask, (0, remainder), value = False)

                text_mask = rearrange(text_mask, 'b n -> b n 1')
                text_keep_mask_embed = text_mask & text_keep_mask_embed

            null_text_embed = self.null_text_embed.to(text_tokens.dtype) # for some reason pytorch AMP not working

            # replace dropped-out / padded token positions with the learned null embedding
            text_tokens = torch.where(
                text_keep_mask_embed,
                text_tokens,
                null_text_embed
            )

            if exists(self.attn_pool):
                text_tokens = self.attn_pool(text_tokens)

            # extra non-attention conditioning by projecting and then summing text embeddings to time
            # termed as text hiddens

            mean_pooled_text_tokens = text_tokens.mean(dim = -2)

            text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens)

            null_text_hidden = self.null_text_hidden.to(t.dtype)

            text_hiddens = torch.where(
                text_keep_mask_hidden,
                text_hiddens,
                null_text_hidden
            )

            t = t + text_hiddens

        # main conditioning tokens (c)

        c = time_tokens if not exists(text_tokens) else torch.cat((time_tokens, text_tokens), dim = -2)

        # normalize conditioning tokens

        c = self.norm_cond(c)

        # initial resnet block (for memory efficient unet)

        if exists(self.init_resnet_block):
            x = self.init_resnet_block(x, t)

        # go through the layers of the unet, down and up

        hiddens = []

        for pre_downsample, init_block, resnet_blocks, attn_block, temporal_peg, temporal_attn, post_downsample in self.downs:
            if exists(pre_downsample):
                x = pre_downsample(x)

            x = init_block(x, t, c)

            for resnet_block in resnet_blocks:
                x = resnet_block(x, t)
                hiddens.append(x)

            x = attn_block(x, c)
            x = temporal_peg(x)
            x = temporal_attn(x, attn_bias = time_attn_bias)

            hiddens.append(x)

            if exists(post_downsample):
                x = post_downsample(x)

        x = self.mid_block1(x, t, c)

        if exists(self.mid_attn):
            x = self.mid_attn(x)

        x = self.mid_temporal_peg(x)
        x = self.mid_temporal_attn(x, attn_bias = time_attn_bias)

        x = self.mid_block2(x, t, c)

        # skip connections are popped LIFO, scaled per Imagen's skip connection scaling
        add_skip_connection = lambda x: torch.cat((x, hiddens.pop() * self.skip_connect_scale), dim = 1)

        up_hiddens = []

        for init_block, resnet_blocks, attn_block, temporal_peg, temporal_attn, upsample in self.ups:
            x = add_skip_connection(x)
            x = init_block(x, t, c)

            for resnet_block in resnet_blocks:
                x = add_skip_connection(x)
                x = resnet_block(x, t)

            x = attn_block(x, c)
            x = temporal_peg(x)
            x = temporal_attn(x, attn_bias = time_attn_bias)

            up_hiddens.append(x.contiguous())
            x = upsample(x)

        # whether to combine all feature maps from upsample blocks

        x = self.upsample_combiner(x, up_hiddens)

        # final top-most residual if needed

        if self.init_conv_to_final_conv_residual:
            x = torch.cat((x, init_conv_residual), dim = 1)

        if exists(self.final_res_block):
            x = self.final_res_block(x, t)

        if exists(lowres_cond_img):
            x = torch.cat((x, lowres_cond_img), dim = 1)

        return self.final_conv(x)
imagen_pytorch/joint_imagen.py ADDED
@@ -0,0 +1,1942 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from contextlib import contextmanager, nullcontext
3
+ from functools import partial
4
+ from pathlib import Path
5
+ from random import random
6
+ from typing import List, Union
7
+
8
+ import kornia.augmentation as K
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torchvision.transforms as T
12
+ from einops import rearrange, reduce, repeat
13
+ from einops.layers.torch import Rearrange
14
+ from einops_exts import check_shape, rearrange_many
15
+ from einops_exts.torch import EinopsToAndFrom
16
+ from torch import nn
17
+ from torch.cuda.amp import autocast
18
+ from torch.nn.parallel import DistributedDataParallel
19
+ from tqdm.auto import tqdm
20
+ from torch.special import expm1
21
+
22
+ from imagen_pytorch.imagen_pytorch import (
23
+ Attention, CrossEmbedLayer, Downsample, GaussianDiffusionContinuousTimes,
24
+ Identity, LearnedSinusoidalPosEmb, LinearAttentionTransformerBlock,
25
+ NullUnet, Parallel, PerceiverResampler, PixelShuffleUpsample,
26
+ Residual, ResnetBlock, TransformerBlock, Upsample, UpsampleCombiner,
27
+ cast_tuple, cast_uint8_images_to_float, default, eval_decorator,
28
+ exists, first, identity, is_float_dtype, maybe, module_device,
29
+ normalize_neg_one_to_one, pad_tuple_to_length, print_once, prob_mask_like,
30
+ resize_image_to, right_pad_dims_to, unnormalize_zero_to_one, zero_init_,
31
+ beta_linear_log_snr, alpha_cosine_log_snr, log, log_snr_to_alpha_sigma)
32
+ from imagen_pytorch.imagen_video.imagen_video import Unet3D, resize_video_to
33
+ from imagen_pytorch.t5 import DEFAULT_T5_NAME, get_encoded_dim, t5_encode_text
34
+
35
+
36
def log_1_min_a(a):
    """Numerically-guarded log(1 - exp(a)) for log-probability inputs.

    The 1e-40 epsilon keeps the log finite when exp(a) is (numerically) 1.
    """
    one_minus = 1 - a.exp()
    return torch.log(one_minus + 1e-40)
38
+
39
+
40
def log_add_exp(a, b):
    """Numerically stable log(exp(a) + exp(b)) via the max-shift trick."""
    shift = torch.max(a, b)
    summed = torch.exp(a - shift) + torch.exp(b - shift)
    return shift + torch.log(summed)
43
+
44
+
45
def extract(a, t, x_shape):
    """Gather per-batch schedule values a[t] and reshape for broadcasting.

    Returns a tensor of shape (batch, 1, 1, ...) with as many trailing
    singleton dims as needed to broadcast against a tensor of shape x_shape.
    """
    batch = t.shape[0]
    gathered = a.gather(-1, t)
    broadcast_shape = (batch,) + (1,) * (len(x_shape) - 1)
    return gathered.reshape(broadcast_shape)
49
+
50
+
51
def log_categorical(log_x_start, log_prob):
    """Expected log-probability of a (log) one-hot target under log_prob.

    Weights log_prob by the probabilities exp(log_x_start) and reduces over
    the class dimension (dim=1).
    """
    weighted = log_x_start.exp() * log_prob
    return weighted.sum(dim=1)
53
+
54
+
55
def index_to_log_onehot(x, num_classes):
    """Turn integer class indices into clamped log one-hot tensors.

    Accepts (B, 1, H, W) or (B, ...) integer tensors; the class axis of the
    result is placed at dim 1. Zeros in the one-hot are clamped to 1e-30
    before the log so the output stays finite.
    """
    assert x.max().item() < num_classes, f'Error: {x.max().item()} >= {num_classes}'
    # collapse a singleton channel dim: (B, 1, H, W) -> (B, H, W)
    if len(x.size()) == 4 and x.size(1) == 1:
        x = x.squeeze(1)
    onehot = F.one_hot(x, num_classes)
    # move the new trailing class axis to dim 1: (B, ..., C) -> (B, C, ...)
    perm = (0, -1) + tuple(range(1, len(x.size())))
    onehot = onehot.permute(perm)
    return torch.log(onehot.float().clamp(min=1e-30))
64
+
65
+
66
def log_onehot_to_index(log_x):
    """Invert index_to_log_onehot: argmax over the class axis (dim 1), keeping the dim."""
    return log_x.argmax(dim=1, keepdim=True)
68
+
69
+
70
def sum_except_batch(x, num_dims=1):
    '''
    Sums all dimensions except the first.

    Args:
        x: Tensor, shape (batch_size, ...)
        num_dims: int, number of batch dims (default=1)

    Returns:
        x_sum: Tensor, shape (batch_size,)
    '''
    flattened = x.reshape(*x.shape[:num_dims], -1)
    return flattened.sum(-1)
82
+
83
+
84
@torch.jit.script
def alpha_cosine_p_log_snr(t, p: float = 0.8, s: float = 0.008):
    """Cosine-p log-SNR schedule: cosine schedule generalized by exponent p.

    With p = 1 this reduces to the standard cosine schedule; the `eps` guard
    keeps the inner log argument strictly positive near t = 0.
    NOTE: `log` here is the project-local clamped-log helper imported from
    imagen_pytorch.imagen_pytorch, not math.log.
    """
    # not sure if this accounts for beta being clipped to 0.999 in discrete version
    return -log((torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** (-2 * p)) - 1, eps=1e-5)
88
+
89
+
90
class MultinomialDiffusion(nn.Module):
    """Categorical (multinomial) diffusion over discrete labels, in log-space.

    Implements the forward corruption q(x_t | x_0) — a mixture of the data
    distribution and a uniform distribution over `num_classes` — and the
    posterior q(x_{t-1} | x_t, x_0) needed for sampling, plus the variational
    loss. Per-step alphas are derived from the same continuous log-SNR
    schedules used by the Gaussian image diffusion, so image and label
    diffusion share a noise schedule.

    All distributions are carried as log-probabilities with the class axis at
    dim 1 (see `index_to_log_onehot`).
    """

    def __init__(self, num_classes, *, noise_schedule, p=1.0, timesteps=1000):
        super().__init__()

        # choose the continuous log-SNR schedule used to derive discrete alphas
        if noise_schedule == "linear":
            self.log_snr = beta_linear_log_snr
        elif noise_schedule == "cosine":
            self.log_snr = alpha_cosine_log_snr
        elif noise_schedule == "cosine_p":
            self.log_snr = partial(alpha_cosine_p_log_snr, p=p)
        else:
            raise ValueError(f'invalid noise schedule {noise_schedule}')

        # cumulative alpha_bar_t = alpha(t)^2 sampled at discrete timesteps,
        # then per-step alphas recovered by dividing consecutive cumprods
        cumprod_alpha = torch.tensor([log_snr_to_alpha_sigma(self.log_snr(
            torch.tensor(t / timesteps)))[0] ** 2 for t in range(timesteps)])
        alphas = cumprod_alpha / F.pad(cumprod_alpha, (1, 0), value=cumprod_alpha[0])[:-1]
        # log-space schedule buffers (move with the module across devices)
        self.register_buffer('log_alpha', torch.log(alphas))
        self.register_buffer('log_1_min_alpha', log_1_min_a(self.log_alpha))
        self.register_buffer('log_cumprod_alpha', torch.cumsum(self.log_alpha, axis=0))
        self.register_buffer('log_1_min_cumprod_alpha', log_1_min_a(self.log_cumprod_alpha))
        self.num_classes = num_classes
        self.num_timesteps = timesteps

    def change_times_dtype(self, t):
        """Map continuous times in (0, 1) to integer timestep indices.

        Times coming from the continuous Gaussian side are floats in (0, 1);
        they are scaled by num_timesteps and floored. Integer-valued floats
        are just cast.
        """
        if t.dtype != torch.int64:
            if ((t < 1) * (t > 0)).any():
                return torch.floor(t * self.num_timesteps).to(torch.int64)
            else:
                return t.to(torch.int64)
        return t

    def get_times(self, batch_size, noise_level, *, device):
        # NOTE(review): raises immediately — the return below is unreachable,
        # kept as a sketch of the intended (continuous-time) behavior.
        raise NotImplementedError
        return torch.full((batch_size,), noise_level, device=device, dtype=torch.float32)

    def sample_random_times(self, batch_size, max_thres=0.999, *, device):
        # NOTE(review): raises immediately — unreachable sketch kept below.
        raise NotImplementedError
        return torch.zeros((batch_size,), device=device).float().uniform_(0, max_thres)

    def get_condition(self, times):
        # NOTE(review): raises immediately — unreachable sketch kept below.
        raise NotImplementedError
        return maybe(self.log_snr)(times)

    def get_sampling_timesteps(self, batch, *, device):
        # NOTE(review): raises immediately — unreachable sketch kept below.
        raise NotImplementedError
        times = torch.linspace(1., 0., self.num_timesteps + 1, device=device)
        times = repeat(times, 't -> b t', b=batch)
        times = torch.stack((times[:, :-1], times[:, 1:]), dim=0)
        times = times.unbind(dim=-1)
        return times

    def q_pred(self, log_x_start, t):
        """log q(x_t | x_0): alpha_bar_t * x_0 + (1 - alpha_bar_t) * uniform."""
        log_cumprod_alpha_t = extract(self.log_cumprod_alpha, t, log_x_start.shape)
        log_1_min_cumprod_alpha = extract(self.log_1_min_cumprod_alpha, t, log_x_start.shape)
        log_probs = log_add_exp(
            log_x_start + log_cumprod_alpha_t,
            log_1_min_cumprod_alpha - math.log(self.num_classes)
        )
        return log_probs

    def log_sample_categorical(self, logits):
        """Draw a one-hot sample from categorical logits via the Gumbel-max trick."""
        uniform = torch.rand_like(logits)
        gumbel_noise = -torch.log(-torch.log(uniform + 1e-30) + 1e-30)
        sample = (gumbel_noise + logits).argmax(dim=1)
        log_sample = index_to_log_onehot(sample, self.num_classes)
        return log_sample

    def q_sample(self, log_x_start, t):
        """Sample x_t ~ q(x_t | x_0) as a log one-hot tensor."""
        t = self.change_times_dtype(t)  # caused by continuous timesteps.
        log_EV_qxt_x0 = self.q_pred(log_x_start, t)
        log_sample = self.log_sample_categorical(log_EV_qxt_x0)
        return log_sample

    def q_pred_one_timestep(self, log_x_t, t):
        """log q(x_t | x_{t-1}) for a single step of the forward process."""
        log_alpha_t = extract(self.log_alpha, t, log_x_t.shape)
        log_1_min_alpha_t = extract(self.log_1_min_alpha, t, log_x_t.shape)
        # alpha_t * E[xt] + (1 - alpha_t) 1 / K
        log_probs = log_add_exp(
            log_x_t + log_alpha_t,
            log_1_min_alpha_t - math.log(self.num_classes)
        )
        return log_probs

    def q_posterior(self, log_x_start, log_x_t, t):
        """log q(x_{t-1} | x_t, x_0), the normalized reverse-process posterior.

        At t == 0 the x_0 prediction itself is substituted, since there is no
        earlier step.
        """
        t = self.change_times_dtype(t)  # caused by continuous timesteps.
        # q(xt-1 | xt, x0) = q(xt | xt-1, x0) * q(xt-1 | x0) / q(xt | x0)
        # where q(xt | xt-1, x0) = q(xt | xt-1).

        # EV_log_qxt_x0 = self.q_pred(log_x_start, t)

        # print('sum exp', EV_log_qxt_x0.exp().sum(1).mean())
        # assert False

        # log_qxt_x0 = (log_x_t.exp() * EV_log_qxt_x0).sum(dim=1)

        t_minus_1 = t - 1
        # Remove negative values, will not be used anyway for final decoder
        t_minus_1 = torch.where(t_minus_1 < 0, torch.zeros_like(t_minus_1), t_minus_1)
        log_EV_qxtmin_x0 = self.q_pred(log_x_start, t_minus_1)

        num_axes = (1,) * (len(log_x_start.size()) - 1)
        t_broadcast = t.view(-1, *num_axes) * torch.ones_like(log_x_start)
        log_EV_qxtmin_x0 = torch.where(t_broadcast == 0, log_x_start, log_EV_qxtmin_x0)

        # unnormed_logprobs = log_EV_qxtmin_x0 +
        # log q_pred_one_timestep(x_t, t)
        # Note: _NOT_ x_tmin1, which is how the formula is typically used!!!
        # Not very easy to see why this is true. But it is :)
        unnormed_logprobs = log_EV_qxtmin_x0 + self.q_pred_one_timestep(log_x_t, t)

        # normalize over the class axis
        log_EV_xtmin_given_xt_given_xstart = \
            unnormed_logprobs - torch.logsumexp(unnormed_logprobs, dim=1, keepdim=True)

        return log_EV_xtmin_given_xt_given_xstart

    def q_sample_from_to(self, log_x_from, from_t, to_t):
        """Sample x_{to_t} given x_{from_t} by jumping forward in the chain.

        Used when re-noising between arbitrary noise levels; at to_t == 0 the
        deterministic argmax is taken instead of a stochastic sample.
        """
        shape, device, dtype = log_x_from.shape, log_x_from.device, log_x_from.dtype
        batch = shape[0]

        if isinstance(from_t, float):
            from_t = torch.full((batch,), from_t, device=device, dtype=dtype)

        if isinstance(to_t, float):
            to_t = torch.full((batch,), to_t, device=device, dtype=dtype)

        from_t = self.change_times_dtype(from_t)  # caused by continuous timesteps.
        to_t = self.change_times_dtype(to_t)  # caused by continuous timesteps.

        # ratio of cumprod alphas gives the transition kernel between the two levels
        log_cumprod_alpha_to_t = extract(self.log_cumprod_alpha, to_t, log_x_from.shape)
        log_cumprod_alpha_from_t = extract(self.log_cumprod_alpha, from_t, log_x_from.shape)
        log_probs = log_add_exp(
            log_x_from + log_cumprod_alpha_to_t - log_cumprod_alpha_from_t,
            log_1_min_a(log_cumprod_alpha_to_t - log_cumprod_alpha_from_t) - math.log(self.num_classes)
        )

        # at to_t == 0, take the mode; otherwise sample stochastically
        mask = (to_t == torch.zeros_like(to_t)).float()[:, None, None, None]
        log_sample = index_to_log_onehot(log_probs.argmax(dim=1), self.num_classes) * mask \
            + self.log_sample_categorical(log_probs) * (1. - mask)

        return log_sample

    def predict_start_from_noise(self, x_t, t, noise):
        # not applicable to categorical diffusion (no additive noise to invert)
        raise NotImplementedError

    # calculate loss

    def multinomial_kl(self, log_prob1, log_prob2):
        """KL(p1 || p2) for categorical distributions given as log-probs, reduced over dim 1."""
        return (log_prob1.exp() * (log_prob1 - log_prob2)).sum(dim=1)

    def kl_prior(self, log_x_start):
        """KL between q(x_T | x_0) and the uniform prior over classes, per batch element."""
        b = log_x_start.size(0)
        device = log_x_start.device
        ones = torch.ones(b, device=device).long()

        log_qxT_prob = self.q_pred(log_x_start, t=(self.num_timesteps - 1) * ones)
        log_half_prob = -torch.log(self.num_classes * torch.ones_like(log_qxT_prob))

        kl_prior = self.multinomial_kl(log_qxT_prob, log_half_prob)
        return sum_except_batch(kl_prior)

    def loss_fn(self, target_log_lbl, pred_lbl, t, log_lbl):
        """Variational-bound loss for the label diffusion, in bits per dimension.

        Uses the KL term for t > 0 and the decoder NLL at t == 0, importance
        weighted by the (uniform) timestep probability, plus the prior KL.
        """
        t = self.change_times_dtype(t)
        pt = torch.ones_like(t).float() / self.num_timesteps

        kl = self.multinomial_kl(target_log_lbl, pred_lbl)
        kl = sum_except_batch(kl)

        decoder_nll = -log_categorical(log_lbl, pred_lbl)
        decoder_nll = sum_except_batch(decoder_nll)
        # at t == 0 the discrete decoder NLL replaces the KL term
        mask = (t == torch.zeros_like(t)).float()
        kl = mask * decoder_nll + (1. - mask) * kl

        kl_prior = self.kl_prior(log_lbl)
        vb_loss = kl / pt + kl_prior

        # normalize to bits per dimension
        loss = vb_loss / (math.log(2) * pred_lbl.shape[1:].numel())
        return loss
267
+
268
+
269
class LabelEmbedding(nn.Module):
    """Embed an integer label map (B, 1, H, W) into a dense feature map (B, C, H, W)."""

    def __init__(self, num_classes, channels):
        super().__init__()
        self.emb_layer = nn.Embedding(num_classes, channels)

    def forward(self, x):
        assert x.dim() == 4, f'x.shape should be (B, 1, H, W) but {x.shape}'
        assert x.size(1) == 1, f'x.shape should be (B, 1, H, W) but {x.shape}'
        # (B, 1, H, W) -> (B, H, W) -> embed -> (B, H, W, C) -> (B, C, H, W)
        indices = x.long().squeeze(1)
        embedded = self.emb_layer(indices)
        return embedded.permute(0, 3, 1, 2)
280
+
281
+
282
+ class JointUnet(nn.Module):
283
+ def __init__(
284
+ self,
285
+ *,
286
+ dim,
287
+ num_classes,
288
+ image_embed_dim=1024,
289
+ text_embed_dim=get_encoded_dim(DEFAULT_T5_NAME),
290
+ num_resnet_blocks=1,
291
+ cond_dim=None,
292
+ num_image_tokens=4,
293
+ num_time_tokens=2,
294
+ learned_sinu_pos_emb_dim=16,
295
+ out_dim=None,
296
+ dim_mults=(1, 2, 4, 8),
297
+ cond_images_channels=0,
298
+ channels=3,
299
+ channels_lbl=3,
300
+ channels_out=None,
301
+ attn_dim_head=64,
302
+ attn_heads=8,
303
+ ff_mult=2.,
304
+ lowres_cond=False, # for cascading diffusion - https://cascaded-diffusion.github.io/
305
+ layer_attns=True,
306
+ layer_attns_depth=1,
307
+ # whether to condition the self-attention blocks with the text embeddings, as described in Appendix D.3.1
308
+ layer_attns_add_text_cond=True,
309
+ # whether to have a layer of attention at the bottleneck (can turn off for higher resolution in cascading DDPM, before bringing in efficient attention)
310
+ attend_at_middle=True,
311
+ layer_cross_attns=True,
312
+ use_linear_attn=False,
313
+ use_linear_cross_attn=False,
314
+ cond_on_text=True,
315
+ max_text_len=256,
316
+ init_dim=None,
317
+ resnet_groups=8,
318
+ init_conv_kernel_size=7, # kernel size of initial conv, if not using cross embed
319
+ init_cross_embed=True,
320
+ init_cross_embed_kernel_sizes=(3, 7, 15),
321
+ cross_embed_downsample=False,
322
+ cross_embed_downsample_kernel_sizes=(2, 4),
323
+ attn_pool_text=True,
324
+ attn_pool_num_latents=32,
325
+ dropout=0.,
326
+ memory_efficient=False,
327
+ init_conv_to_final_conv_residual=False,
328
+ use_global_context_attn=True,
329
+ scale_skip_connection=True,
330
+ final_resnet_block=True,
331
+ final_conv_kernel_size=3,
332
+ cosine_sim_attn=False,
333
+ self_cond=False,
334
+ combine_upsample_fmaps=False, # combine feature maps from all upsample blocks, used in unet squared successfully
335
+ pixel_shuffle_upsample=True # may address checkboard artifacts
336
+ ):
337
+ super().__init__()
338
+
339
+ # guide researchers
340
+
341
+ assert attn_heads > 1, 'you need to have more than 1 attention head, ideally at least 4 or 8'
342
+
343
+ if dim < 128:
344
+ print_once('The base dimension of your u-net should ideally be no smaller than 128, as recommended by a professional DDPM trainer https://nonint.com/2022/05/04/friends-dont-let-friends-train-small-diffusion-models/')
345
+
346
+ # save locals to take care of some hyperparameters for cascading DDPM
347
+
348
+ self._locals = locals()
349
+ self._locals.pop('self', None)
350
+ self._locals.pop('__class__', None)
351
+
352
+ # determine dimensions
353
+
354
+ self.channels = channels
355
+ self.channels_out = default(channels_out, channels)
356
+
357
+ # label embedding
358
+
359
+ self.num_classes = num_classes
360
+ self.init_emb_seg = LabelEmbedding(self.num_classes, channels_lbl)
361
+ self.init_emb_seg_lowres = LabelEmbedding(self.num_classes, channels_lbl) if lowres_cond else None
362
+
363
+ # (1) in cascading diffusion, one concats the low resolution image, blurred, for conditioning the higher resolution synthesis
364
+ # (2) in self conditioning, one appends the predict x0 (x_start)
365
+ # (3) in joint diffusion, label condition appends on image.
366
+ init_channels = (channels + channels_lbl) * (1 + int(lowres_cond) + int(self_cond)) # Joint Imagen
367
+ init_dim = default(init_dim, dim)
368
+
369
+ self.self_cond = self_cond
370
+ if self_cond:
371
+ self.self_cond_lbl_emb = LabelEmbedding(self.num_classes, channels_lbl)
372
+
373
+ # optional image conditioning
374
+
375
+ self.has_cond_image = cond_images_channels > 0
376
+ self.cond_images_channels = cond_images_channels
377
+
378
+ init_channels += cond_images_channels
379
+
380
+ # initial convolution
381
+
382
+ self.init_conv = CrossEmbedLayer(init_channels, dim_out=init_dim, kernel_sizes=init_cross_embed_kernel_sizes, stride=1) if init_cross_embed else nn.Conv2d(
383
+ init_channels, init_dim, init_conv_kernel_size, padding=init_conv_kernel_size // 2)
384
+
385
+ dims = [init_dim, *map(lambda m: dim * m, dim_mults)]
386
+ in_out = list(zip(dims[:-1], dims[1:]))
387
+
388
+ # time conditioning
389
+
390
+ cond_dim = default(cond_dim, dim)
391
+ time_cond_dim = dim * 4 * (2 if lowres_cond else 1)
392
+
393
+ # embedding time for log(snr) noise from continuous version
394
+
395
+ sinu_pos_emb = LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim)
396
+ sinu_pos_emb_input_dim = learned_sinu_pos_emb_dim + 1
397
+
398
+ self.to_time_hiddens = nn.Sequential(
399
+ sinu_pos_emb,
400
+ nn.Linear(sinu_pos_emb_input_dim, time_cond_dim),
401
+ nn.SiLU()
402
+ )
403
+
404
+ self.to_time_cond = nn.Sequential(
405
+ nn.Linear(time_cond_dim, time_cond_dim)
406
+ )
407
+
408
+ # project to time tokens as well as time hiddens
409
+
410
+ self.to_time_tokens = nn.Sequential(
411
+ nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
412
+ Rearrange('b (r d) -> b r d', r=num_time_tokens)
413
+ )
414
+
415
+ # low res aug noise conditioning
416
+
417
+ self.lowres_cond = lowres_cond
418
+
419
+ if lowres_cond:
420
+ self.to_lowres_time_hiddens = nn.Sequential(
421
+ LearnedSinusoidalPosEmb(learned_sinu_pos_emb_dim),
422
+ nn.Linear(learned_sinu_pos_emb_dim + 1, time_cond_dim),
423
+ nn.SiLU()
424
+ )
425
+
426
+ self.to_lowres_time_cond = nn.Sequential(
427
+ nn.Linear(time_cond_dim, time_cond_dim)
428
+ )
429
+
430
+ self.to_lowres_time_tokens = nn.Sequential(
431
+ nn.Linear(time_cond_dim, cond_dim * num_time_tokens),
432
+ Rearrange('b (r d) -> b r d', r=num_time_tokens)
433
+ )
434
+
435
+ # normalizations
436
+
437
+ self.norm_cond = nn.LayerNorm(cond_dim)
438
+
439
+ # text encoding conditioning (optional)
440
+
441
+ self.text_to_cond = None
442
+
443
+ if cond_on_text:
444
+ assert exists(text_embed_dim), 'text_embed_dim must be given to the unet if cond_on_text is True'
445
+ self.text_to_cond = nn.Linear(text_embed_dim, cond_dim)
446
+
447
+ # finer control over whether to condition on text encodings
448
+
449
+ self.cond_on_text = cond_on_text
450
+
451
+ # attention pooling
452
+
453
+ self.attn_pool = PerceiverResampler(dim=cond_dim, depth=2, dim_head=attn_dim_head, heads=attn_heads,
454
+ num_latents=attn_pool_num_latents, cosine_sim_attn=cosine_sim_attn) if attn_pool_text else None
455
+
456
+ # for classifier free guidance
457
+
458
+ self.max_text_len = max_text_len
459
+
460
+ self.null_text_embed = nn.Parameter(torch.randn(1, max_text_len, cond_dim))
461
+ self.null_text_hidden = nn.Parameter(torch.randn(1, time_cond_dim))
462
+
463
+ # for non-attention based text conditioning at all points in the network where time is also conditioned
464
+
465
+ self.to_text_non_attn_cond = None
466
+
467
+ if cond_on_text:
468
+ self.to_text_non_attn_cond = nn.Sequential(
469
+ nn.LayerNorm(cond_dim),
470
+ nn.Linear(cond_dim, time_cond_dim),
471
+ nn.SiLU(),
472
+ nn.Linear(time_cond_dim, time_cond_dim)
473
+ )
474
+
475
+ # attention related params
476
+
477
+ attn_kwargs = dict(heads=attn_heads, dim_head=attn_dim_head, cosine_sim_attn=cosine_sim_attn)
478
+
479
+ num_layers = len(in_out)
480
+
481
+ # resnet block klass
482
+
483
+ num_resnet_blocks = cast_tuple(num_resnet_blocks, num_layers)
484
+ resnet_groups = cast_tuple(resnet_groups, num_layers)
485
+
486
+ resnet_klass = partial(ResnetBlock, **attn_kwargs)
487
+
488
+ layer_attns = cast_tuple(layer_attns, num_layers)
489
+ layer_attns_depth = cast_tuple(layer_attns_depth, num_layers)
490
+ layer_cross_attns = cast_tuple(layer_cross_attns, num_layers)
491
+
492
+ use_linear_attn = cast_tuple(use_linear_attn, num_layers)
493
+ use_linear_cross_attn = cast_tuple(use_linear_cross_attn, num_layers)
494
+
495
+ assert all([layers == num_layers for layers in list(map(len, (resnet_groups, layer_attns, layer_cross_attns)))])
496
+
497
+ # downsample klass
498
+
499
+ downsample_klass = Downsample
500
+
501
+ if cross_embed_downsample:
502
+ downsample_klass = partial(CrossEmbedLayer, kernel_sizes=cross_embed_downsample_kernel_sizes)
503
+
504
+ # initial resnet block (for memory efficient unet)
505
+
506
+ self.init_resnet_block = resnet_klass(init_dim, init_dim, time_cond_dim=time_cond_dim,
507
+ groups=resnet_groups[0], use_gca=use_global_context_attn) if memory_efficient else None
508
+
509
+ # scale for resnet skip connections
510
+
511
+ self.skip_connect_scale = 1. if not scale_skip_connection else (2 ** -0.5)
512
+
513
+ # layers
514
+
515
+ self.downs = nn.ModuleList([])
516
+ self.ups = nn.ModuleList([])
517
+ num_resolutions = len(in_out)
518
+
519
+ layer_params = [num_resnet_blocks, resnet_groups, layer_attns,
520
+ layer_attns_depth, layer_cross_attns, use_linear_attn, use_linear_cross_attn]
521
+ reversed_layer_params = list(map(reversed, layer_params))
522
+
523
+ # downsampling layers
524
+
525
+ skip_connect_dims = [] # keep track of skip connection dimensions
526
+
527
+ for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn) in enumerate(zip(in_out, *layer_params)):
528
+ is_last = ind >= (num_resolutions - 1)
529
+
530
+ layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None
531
+
532
+ if layer_attn:
533
+ transformer_block_klass = TransformerBlock
534
+ elif layer_use_linear_attn:
535
+ transformer_block_klass = LinearAttentionTransformerBlock
536
+ else:
537
+ transformer_block_klass = Identity
538
+
539
+ current_dim = dim_in
540
+
541
+ # whether to pre-downsample, from memory efficient unet
542
+
543
+ pre_downsample = None
544
+
545
+ if memory_efficient:
546
+ pre_downsample = downsample_klass(dim_in, dim_out)
547
+ current_dim = dim_out
548
+
549
+ skip_connect_dims.append(current_dim)
550
+
551
+ # whether to do post-downsample, for non-memory efficient unet
552
+
553
+ post_downsample = None
554
+ if not memory_efficient:
555
+ post_downsample = downsample_klass(current_dim, dim_out) if not is_last else Parallel(
556
+ nn.Conv2d(dim_in, dim_out, 3, padding=1), nn.Conv2d(dim_in, dim_out, 1))
557
+
558
+ self.downs.append(nn.ModuleList([
559
+ pre_downsample,
560
+ resnet_klass(current_dim, current_dim, cond_dim=layer_cond_dim,
561
+ linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups),
562
+ nn.ModuleList([ResnetBlock(current_dim, current_dim, time_cond_dim=time_cond_dim,
563
+ groups=groups, use_gca=use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
564
+ transformer_block_klass(dim=current_dim, depth=layer_attn_depth,
565
+ ff_mult=ff_mult, context_dim=cond_dim, **attn_kwargs),
566
+ post_downsample
567
+ ]))
568
+
569
+ # middle layers
570
+
571
+ mid_dim = dims[-1]
572
+
573
+ self.mid_block1 = ResnetBlock(mid_dim, mid_dim, cond_dim=cond_dim,
574
+ time_cond_dim=time_cond_dim, groups=resnet_groups[-1])
575
+ self.mid_attn = EinopsToAndFrom('b c h w', 'b (h w) c', Residual(
576
+ Attention(mid_dim, **attn_kwargs))) if attend_at_middle else None
577
+ self.mid_block2 = ResnetBlock(mid_dim, mid_dim, cond_dim=cond_dim,
578
+ time_cond_dim=time_cond_dim, groups=resnet_groups[-1])
579
+
580
+ # upsample klass
581
+
582
+ upsample_klass = Upsample if not pixel_shuffle_upsample else PixelShuffleUpsample
583
+
584
+ # upsampling layers
585
+
586
+ upsample_fmap_dims = []
587
+
588
+ for ind, ((dim_in, dim_out), layer_num_resnet_blocks, groups, layer_attn, layer_attn_depth, layer_cross_attn, layer_use_linear_attn, layer_use_linear_cross_attn) in enumerate(zip(reversed(in_out), *reversed_layer_params)):
589
+ is_last = ind == (len(in_out) - 1)
590
+
591
+ layer_cond_dim = cond_dim if layer_cross_attn or layer_use_linear_cross_attn else None
592
+
593
+ if layer_attn:
594
+ transformer_block_klass = TransformerBlock
595
+ elif layer_use_linear_attn:
596
+ transformer_block_klass = LinearAttentionTransformerBlock
597
+ else:
598
+ transformer_block_klass = Identity
599
+
600
+ skip_connect_dim = skip_connect_dims.pop()
601
+
602
+ upsample_fmap_dims.append(dim_out)
603
+
604
+ self.ups.append(nn.ModuleList([
605
+ resnet_klass(dim_out + skip_connect_dim, dim_out, cond_dim=layer_cond_dim,
606
+ linear_attn=layer_use_linear_cross_attn, time_cond_dim=time_cond_dim, groups=groups),
607
+ nn.ModuleList([ResnetBlock(dim_out + skip_connect_dim, dim_out, time_cond_dim=time_cond_dim,
608
+ groups=groups, use_gca=use_global_context_attn) for _ in range(layer_num_resnet_blocks)]),
609
+ transformer_block_klass(dim=dim_out, depth=layer_attn_depth, ff_mult=ff_mult,
610
+ context_dim=cond_dim, **attn_kwargs),
611
+ upsample_klass(dim_out, dim_in) if not is_last or memory_efficient else Identity(),
612
+ ]))
613
+
614
+ # whether to combine feature maps from all upsample blocks before final resnet block out
615
+
616
+ self.upsample_combiner = UpsampleCombiner(
617
+ dim=dim,
618
+ enabled=combine_upsample_fmaps,
619
+ dim_ins=upsample_fmap_dims,
620
+ dim_outs=dim
621
+ )
622
+
623
+ # whether to do a final residual from initial conv to the final resnet block out
624
+
625
+ self.init_conv_to_final_conv_residual = init_conv_to_final_conv_residual
626
+ final_conv_dim = self.upsample_combiner.dim_out + (dim if init_conv_to_final_conv_residual else 0)
627
+
628
+ # final optional resnet block and convolution out
629
+
630
+ self.final_res_block = ResnetBlock(final_conv_dim, dim, time_cond_dim=time_cond_dim,
631
+ groups=resnet_groups[0], use_gca=True) if final_resnet_block else None
632
+
633
+ final_conv_dim_in = dim if final_resnet_block else final_conv_dim
634
+ final_conv_dim_in += (channels + channels_lbl) if lowres_cond else 0
635
+
636
+ self.final_conv = nn.Conv2d(final_conv_dim_in, self.channels_out,
637
+ final_conv_kernel_size, padding=final_conv_kernel_size // 2)
638
+ self.final_conv_seg = nn.Conv2d(final_conv_dim_in, self.num_classes,
639
+ final_conv_kernel_size, padding=final_conv_kernel_size // 2)
640
+
641
+ zero_init_(self.final_conv)
642
+ zero_init_(self.final_conv_seg)
643
+
644
+ # if the current settings for the unet are not correct
645
+ # for cascading DDPM, then reinit the unet with the right settings
646
+ def cast_model_parameters(
647
+ self,
648
+ *,
649
+ lowres_cond,
650
+ text_embed_dim,
651
+ channels,
652
+ channels_out,
653
+ cond_on_text
654
+ ):
655
+ if lowres_cond == self.lowres_cond and \
656
+ channels == self.channels and \
657
+ cond_on_text == self.cond_on_text and \
658
+ text_embed_dim == self._locals['text_embed_dim'] and \
659
+ channels_out == self.channels_out:
660
+ return self
661
+
662
+ updated_kwargs = dict(
663
+ lowres_cond=lowres_cond,
664
+ text_embed_dim=text_embed_dim,
665
+ channels=channels,
666
+ channels_out=channels_out,
667
+ cond_on_text=cond_on_text
668
+ )
669
+
670
+ return self.__class__(**{**self._locals, **updated_kwargs})
671
+
672
+ # methods for returning the full unet config as well as its parameter state
673
+
674
+ def to_config_and_state_dict(self):
675
+ return self._locals, self.state_dict()
676
+
677
+ # class method for rehydrating the unet from its config and state dict
678
+
679
+ @classmethod
680
+ def from_config_and_state_dict(klass, config, state_dict):
681
+ unet = klass(**config)
682
+ unet.load_state_dict(state_dict)
683
+ return unet
684
+
685
+ # methods for persisting unet to disk
686
+
687
+ def persist_to_file(self, path):
688
+ path = Path(path)
689
+ path.parents[0].mkdir(exist_ok=True, parents=True)
690
+
691
+ config, state_dict = self.to_config_and_state_dict()
692
+ pkg = dict(config=config, state_dict=state_dict)
693
+ torch.save(pkg, str(path))
694
+
695
+ # class method for rehydrating the unet from file saved with `persist_to_file`
696
+
697
+ @classmethod
698
+ def hydrate_from_file(klass, path):
699
+ path = Path(path)
700
+ assert path.exists()
701
+ pkg = torch.load(str(path))
702
+
703
+ assert 'config' in pkg and 'state_dict' in pkg
704
+ config, state_dict = pkg['config'], pkg['state_dict']
705
+
706
+ return JointUnet.from_config_and_state_dict(config, state_dict)
707
+
708
+ # forward with classifier free guidance
709
+
710
+ def forward_with_cond_scale(
711
+ self,
712
+ *args,
713
+ cond_scale=1.,
714
+ **kwargs
715
+ ):
716
+ logits, logits_seg = self.forward(*args, **kwargs)
717
+
718
+ if cond_scale == 1:
719
+ return logits, logits_seg
720
+
721
+ null_logits, null_logits_seg = self.forward(*args, cond_drop_prob=1., **kwargs)
722
+
723
+ cond_logits = null_logits + (logits - null_logits) * cond_scale
724
+
725
+ # TODO: CFG of categorical is not clear.
726
+ cond_logits_seg = null_logits_seg + (logits_seg - null_logits_seg) * cond_scale
727
+
728
+ return cond_logits, cond_logits_seg
729
+
730
def forward(
    self,
    x,
    lbl,
    time,
    *,
    lowres_cond_img=None,
    lowres_cond_lbl=None,
    lowres_noise_times=None,
    text_embeds=None,
    text_mask=None,
    cond_images=None,
    self_cond=None,
    self_cond_lbl=None,
    cond_drop_prob=0.
):
    """Joint denoising forward pass over an image `x` and a segmentation label map `lbl`.

    The label map is embedded and concatenated channel-wise onto the image, then the
    pair flows through one shared U-Net. Returns a tuple
    (image prediction from `final_conv`, segmentation logits from `final_conv_seg`).

    `cond_drop_prob` is the probability of dropping text conditioning per batch element
    (used for classifier-free guidance training).
    NOTE(review): `lbl` is assumed to be an integer label map broadcastable through
    `init_emb_seg` to the image's spatial size — confirm against callers.
    """
    batch_size, device = x.shape[0], x.device

    # joint imagen: embed the discrete label map and stack it onto the image channels

    lbl = self.init_emb_seg(lbl.long())
    x = torch.cat((x, lbl), dim=1)

    # condition on self (self-conditioning on previous x0 / label predictions)

    if self.self_cond:
        self_cond = default(self_cond, lambda: torch.zeros_like(x))
        if self_cond_lbl is None:
            self_cond_lbl = torch.zeros_like(lbl)
        else:
            self_cond_lbl = self.self_cond_lbl_emb(self_cond_lbl.long())
        x = torch.cat((x, self_cond, self_cond_lbl), dim=1)

    # add low resolution conditioning, if present

    assert not (self.lowres_cond and not exists(lowres_cond_img)
                ), 'low resolution conditioning image must be present'
    assert not (self.lowres_cond and not exists(lowres_noise_times)
                ), 'low resolution conditioning noise time must be present'

    if exists(lowres_cond_img) and exists(lowres_cond_lbl):
        lowres_cond_lbl = self.init_emb_seg_lowres(lowres_cond_lbl.long())
        x = torch.cat((x, lowres_cond_img, lowres_cond_lbl), dim=1)

    # condition on input image

    assert not (self.has_cond_image ^ exists(cond_images)), \
        'you either requested to condition on an image on the unet, but the conditioning image is not supplied, or vice versa'

    if exists(cond_images):
        assert cond_images.shape[1] == self.cond_images_channels, 'the number of channels on the conditioning image you are passing in does not match what you specified on initialiation of the unet'
        cond_images = resize_image_to(cond_images, x.shape[-1])
        x = torch.cat((cond_images, x), dim=1)

    # initial convolution

    x = self.init_conv(x)

    # init conv residual (saved for the optional top-most skip at the end)

    if self.init_conv_to_final_conv_residual:
        init_conv_residual = x.clone()

    # time conditioning

    time_hiddens = self.to_time_hiddens(time)

    # derive time tokens (for attention) and the time conditioning vector `t`

    time_tokens = self.to_time_tokens(time_hiddens)
    t = self.to_time_cond(time_hiddens)

    # add lowres time conditioning to time hiddens
    # and add lowres time tokens along sequence dimension for attention

    if self.lowres_cond:
        lowres_time_hiddens = self.to_lowres_time_hiddens(lowres_noise_times)
        lowres_time_tokens = self.to_lowres_time_tokens(lowres_time_hiddens)
        lowres_t = self.to_lowres_time_cond(lowres_time_hiddens)

        t = t + lowres_t
        time_tokens = torch.cat((time_tokens, lowres_time_tokens), dim=-2)

    # text conditioning

    text_tokens = None

    if exists(text_embeds) and self.cond_on_text:

        # conditional dropout: per-sample keep mask for classifier-free guidance

        text_keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device=device)

        text_keep_mask_embed = rearrange(text_keep_mask, 'b -> b 1 1')
        text_keep_mask_hidden = rearrange(text_keep_mask, 'b -> b 1')

        # calculate text embeds

        text_tokens = self.text_to_cond(text_embeds)

        # truncate / pad the token sequence to exactly max_text_len

        text_tokens = text_tokens[:, :self.max_text_len]

        if exists(text_mask):
            text_mask = text_mask[:, :self.max_text_len]

        text_tokens_len = text_tokens.shape[1]
        remainder = self.max_text_len - text_tokens_len

        if remainder > 0:
            text_tokens = F.pad(text_tokens, (0, 0, 0, remainder))

        if exists(text_mask):
            if remainder > 0:
                text_mask = F.pad(text_mask, (0, remainder), value=False)

            text_mask = rearrange(text_mask, 'b n -> b n 1')
            text_keep_mask_embed = text_mask & text_keep_mask_embed

        null_text_embed = self.null_text_embed.to(text_tokens.dtype)  # for some reason pytorch AMP not working

        # dropped samples are replaced by the learned null embedding

        text_tokens = torch.where(
            text_keep_mask_embed,
            text_tokens,
            null_text_embed
        )

        if exists(self.attn_pool):
            text_tokens = self.attn_pool(text_tokens)

        # extra non-attention conditioning by projecting and then summing text embeddings to time
        # termed as text hiddens

        mean_pooled_text_tokens = text_tokens.mean(dim=-2)

        text_hiddens = self.to_text_non_attn_cond(mean_pooled_text_tokens)

        null_text_hidden = self.null_text_hidden.to(t.dtype)

        text_hiddens = torch.where(
            text_keep_mask_hidden,
            text_hiddens,
            null_text_hidden
        )

        t = t + text_hiddens

    # main conditioning tokens (c): time tokens, optionally followed by text tokens

    c = time_tokens if not exists(text_tokens) else torch.cat((time_tokens, text_tokens), dim=-2)

    # normalize conditioning tokens

    c = self.norm_cond(c)

    # initial resnet block (for memory efficient unet)

    if exists(self.init_resnet_block):
        x = self.init_resnet_block(x, t)

    # go through the layers of the unet, down and up

    hiddens = []

    for pre_downsample, init_block, resnet_blocks, attn_block, post_downsample in self.downs:
        if exists(pre_downsample):
            x = pre_downsample(x)

        x = init_block(x, t, c)

        for resnet_block in resnet_blocks:
            x = resnet_block(x, t)
            hiddens.append(x)

        x = attn_block(x, c)
        hiddens.append(x)

        if exists(post_downsample):
            x = post_downsample(x)

    x = self.mid_block1(x, t, c)

    if exists(self.mid_attn):
        x = self.mid_attn(x)

    x = self.mid_block2(x, t, c)

    # pops the matching down-path activation (LIFO) and concatenates it, scaled

    def add_skip_connection(x): return torch.cat((x, hiddens.pop() * self.skip_connect_scale), dim=1)

    up_hiddens = []

    for init_block, resnet_blocks, attn_block, upsample in self.ups:
        x = add_skip_connection(x)
        x = init_block(x, t, c)

        for resnet_block in resnet_blocks:
            x = add_skip_connection(x)
            x = resnet_block(x, t)

        x = attn_block(x, c)
        up_hiddens.append(x.contiguous())

        x = upsample(x)

    # whether to combine all feature maps from upsample blocks

    x = self.upsample_combiner(x, up_hiddens)

    # final top-most residual if needed

    if self.init_conv_to_final_conv_residual:
        x = torch.cat((x, init_conv_residual), dim=1)

    if exists(self.final_res_block):
        x = self.final_res_block(x, t)

    if exists(lowres_cond_img) and exists(lowres_cond_lbl):
        x = torch.cat((x, lowres_cond_img, lowres_cond_lbl), dim=1)

    # two heads: image prediction and segmentation logits

    return self.final_conv(x), self.final_conv_seg(x)
950
# predefined unets, with configs lining up with hyperparameters in appendix of paper


class BaseJointUnet(JointUnet):
    """JointUnet preconfigured with the base-stage hyperparameters; caller kwargs override defaults."""

    def __init__(self, *args, **kwargs):
        base_defaults = dict(
            dim=128,
            dim_mults=(1, 2, 4, 8),
            num_resnet_blocks=(2, 4, 8, 8),
            layer_attns=(False, False, False, True),
            layer_cross_attns=(False, False, False, True),
            attn_heads=8,
            ff_mult=2.,
            memory_efficient=True
        )
        super().__init__(*args, **{**base_defaults, **kwargs})
967
+
968
class SRJointUnet(JointUnet):
    """JointUnet preconfigured for super-resolution stages (no self-attention in any layer);
    caller kwargs override defaults."""

    def __init__(self, *args, **kwargs):
        sr_defaults = dict(
            dim=128,
            dim_mults=(1, 2, 4, 8),
            num_resnet_blocks=(2, 4, 8, 8),
            layer_attns=False,
            layer_cross_attns=(False, False, False, True),
            attn_heads=8,
            ff_mult=2.,
            memory_efficient=True
        )
        super().__init__(*args, **{**sr_defaults, **kwargs})
982
+ # main imagen ddpm class, which is a cascading DDPM from Ho et al.
983
+
984
+
985
+ class JointImagen(nn.Module):
986
def __init__(
    self,
    unets,
    *,
    image_sizes,  # for cascading ddpm, image size at each stage
    num_classes,
    text_encoder_name=DEFAULT_T5_NAME,
    text_embed_dim=None,
    channels=3,
    timesteps=1000,
    sample_timesteps=100,
    cond_drop_prob=0.1,
    loss_type='l2',
    noise_schedules='cosine',
    noise_schedules_lbl='cosine_p',
    cosine_p_lbl=1.0,
    pred_objectives='noise',
    random_crop_sizes=None,
    lowres_noise_schedule='linear',
    # in the paper, they present a new trick where they noise the lowres conditioning image, and at sample time, fix it to a certain level (0.1 or 0.3) - the unets are also made to be conditioned on this noise level
    lowres_sample_noise_level=0.2,
    # unclear when conditioning on augmentation noise level, whether each batch element receives a random aug noise value - turning off due to @marunine's find
    per_sample_random_aug_noise_level=False,
    lowres_max_thres=0.999,
    condition_on_text=True,
    # whether to take care of normalizing the image from [0, 1] to [-1, 1] and back automatically - you can turn this off if you want to pass in the [-1, 1] ranged image yourself from the dataloader
    auto_normalize_img=True,
    # p2 loss weight, from https://arxiv.org/abs/2204.00227 - 0 is equivalent to weight of 1 across time
    p2_loss_weight_gamma=0.5,
    p2_loss_weight_k=1,
    dynamic_thresholding=True,
    dynamic_thresholding_percentile=0.95,  # unsure what this was based on perusal of paper
    only_train_unet_number=None,
):
    """Cascading joint (image + segmentation label) diffusion model.

    Builds, per unet in the cascade: a continuous-time Gaussian scheduler for the
    image, a MultinomialDiffusion scheduler for the label map (both for training
    and for sampling with `sample_timesteps`), plus shared lowres-augmentation
    schedulers. Per-unet hyperparameters (`timesteps`, `pred_objectives`,
    `dynamic_thresholding`, ...) may be given as scalars (broadcast) or tuples.

    `num_classes` is the number of segmentation classes for the categorical
    diffusion. `image_sizes` must be a tuple of (height, width) pairs, one per unet.
    """
    super().__init__()

    # joint: number of segmentation classes for the categorical (multinomial) diffusion

    self.num_classes = num_classes

    # loss

    if loss_type == 'l1':
        loss_fn = F.l1_loss
    elif loss_type == 'l2':
        loss_fn = F.mse_loss
    elif loss_type == 'huber':
        loss_fn = F.smooth_l1_loss
    else:
        raise NotImplementedError()

    self.loss_type = loss_type
    self.loss_fn = loss_fn

    # conditioning hparams

    self.condition_on_text = condition_on_text
    self.unconditional = not condition_on_text

    # channels

    self.channels = channels

    # automatically take care of ensuring that first unet is unconditional
    # while the rest of the unets are conditioned on the low resolution image produced by previous unet

    unets = cast_tuple(unets)
    num_unets = len(unets)

    # determine noise schedules per unet

    timesteps = cast_tuple(timesteps, num_unets)
    sample_timesteps = cast_tuple(sample_timesteps, num_unets)

    # make sure noise schedule defaults to 'cosine', 'cosine', and then 'linear' for rest of super-resoluting unets

    noise_schedules = cast_tuple(noise_schedules)
    noise_schedules = pad_tuple_to_length(noise_schedules, 2, 'cosine')
    noise_schedules = pad_tuple_to_length(noise_schedules, num_unets, 'linear')
    noise_schedules_lbl = cast_tuple(noise_schedules_lbl)
    noise_schedules_lbl = pad_tuple_to_length(noise_schedules_lbl, 2, 'cosine_p')
    noise_schedules_lbl = pad_tuple_to_length(noise_schedules_lbl, num_unets, 'linear')

    # construct noise schedulers (training-time: full `timesteps`)

    noise_scheduler_klass = GaussianDiffusionContinuousTimes
    noise_scheduler_lbl_klass = MultinomialDiffusion
    self.noise_schedulers = nn.ModuleList([])
    self.noise_schedulers_lbl = nn.ModuleList([])

    for timestep, noise_schedule, noise_schedule_lbl in zip(timesteps, noise_schedules, noise_schedules_lbl):
        noise_scheduler = noise_scheduler_klass(noise_schedule=noise_schedule, timesteps=timestep)
        self.noise_schedulers.append(noise_scheduler)
        noise_scheduler_lbl = noise_scheduler_lbl_klass(
            num_classes, noise_schedule=noise_schedule_lbl, timesteps=timestep, p=cosine_p_lbl)
        self.noise_schedulers_lbl.append(noise_scheduler_lbl)

    # sampling-time schedulers: same schedules but with the (shorter) `sample_timesteps`

    self.noise_schedulers_sample = nn.ModuleList([])
    self.noise_schedulers_lbl_sample = nn.ModuleList([])

    for sample_timestep, noise_schedule, noise_schedule_lbl in zip(sample_timesteps, noise_schedules, noise_schedules_lbl):
        noise_scheduler_sample = noise_scheduler_klass(noise_schedule=noise_schedule, timesteps=sample_timestep)
        self.noise_schedulers_sample.append(noise_scheduler_sample)
        noise_scheduler_lbl_sample = noise_scheduler_lbl_klass(
            num_classes, noise_schedule=noise_schedule_lbl, timesteps=sample_timestep, p=cosine_p_lbl)
        self.noise_schedulers_lbl_sample.append(noise_scheduler_lbl_sample)

    # randomly cropping for upsampler training

    self.random_crop_sizes = cast_tuple(random_crop_sizes, num_unets)
    assert all(map(lambda x: x is None or (isinstance(x, (tuple, list)) and len(x) == 2), self.random_crop_sizes))
    assert not exists(first(self.random_crop_sizes)), \
        'you should not need to randomly crop image during training for base unet, only for upsamplers '\
        '- so pass in `random_crop_sizes = (None, 128, 256)` as example'

    # lowres augmentation noise schedule

    self.lowres_noise_schedule = GaussianDiffusionContinuousTimes(noise_schedule=lowres_noise_schedule)
    self.lowres_noise_schedule_lbl = MultinomialDiffusion(
        num_classes, noise_schedule=lowres_noise_schedule, p=cosine_p_lbl)

    # ddpm objectives - predicting noise by default

    self.pred_objectives = cast_tuple(pred_objectives, num_unets)

    # get text encoder

    self.text_encoder_name = text_encoder_name
    self.text_embed_dim = default(text_embed_dim, lambda: get_encoded_dim(text_encoder_name))

    self.encode_text = partial(t5_encode_text, name=text_encoder_name)

    # construct unets

    self.unets = nn.ModuleList([])

    self.unet_being_trained_index = -1  # keeps track of which unet is being trained at the moment
    self.only_train_unet_number = only_train_unet_number

    for ind, one_unet in enumerate(unets):
        assert isinstance(one_unet, (JointUnet, Unet3D, NullUnet))
        is_first = ind == 0

        one_unet = one_unet.cast_model_parameters(
            lowres_cond=not is_first,
            cond_on_text=self.condition_on_text,
            text_embed_dim=self.text_embed_dim if self.condition_on_text else None,
            channels=self.channels,
            channels_out=self.channels
        )

        self.unets.append(one_unet)

    # unet image sizes

    self.image_sizes = cast_tuple(image_sizes)
    assert all(map(lambda x: isinstance(x, (tuple, list)) and len(x) == 2, self.image_sizes))

    assert num_unets == len(self.image_sizes), \
        f'you did not supply the correct number of u-nets ({len(unets)}) for resolutions {self.image_sizes}'

    self.sample_channels = cast_tuple(self.channels, num_unets)

    # determine whether we are training on images or video

    is_video = any([isinstance(unet, Unet3D) for unet in self.unets])
    self.is_video = is_video

    self.right_pad_dims_to_datatype = partial(rearrange, pattern=(
        'b -> b 1 1 1' if not is_video else 'b -> b 1 1 1 1'))
    self.resize_to = resize_video_to if is_video else resize_image_to

    # cascading ddpm related stuff

    lowres_conditions = tuple(map(lambda t: t.lowres_cond, self.unets))
    assert lowres_conditions == (False, *((True,) * (num_unets - 1))), \
        'the first unet must be unconditioned (by low resolution image), ' \
        'and the rest of the unets must have `lowres_cond` set to True'

    self.lowres_sample_noise_level = lowres_sample_noise_level
    self.per_sample_random_aug_noise_level = per_sample_random_aug_noise_level
    self.lowres_max_thres = lowres_max_thres

    # classifier free guidance

    self.cond_drop_prob = cond_drop_prob
    self.can_classifier_guidance = cond_drop_prob > 0.

    # normalize and unnormalize image functions

    self.normalize_img = normalize_neg_one_to_one if auto_normalize_img else identity
    self.unnormalize_img = unnormalize_zero_to_one if auto_normalize_img else identity
    self.input_image_range = (0. if auto_normalize_img else -1., 1.)

    # dynamic thresholding

    self.dynamic_thresholding = cast_tuple(dynamic_thresholding, num_unets)
    self.dynamic_thresholding_percentile = dynamic_thresholding_percentile

    # p2 loss weight

    self.p2_loss_weight_k = p2_loss_weight_k
    self.p2_loss_weight_gamma = cast_tuple(p2_loss_weight_gamma, num_unets)

    assert all([(gamma_value <= 2) for gamma_value in self.p2_loss_weight_gamma]), \
        'in paper, they noticed any gamma greater than 2 is harmful'

    # one temp parameter for keeping track of device

    self.register_buffer('_temp', torch.tensor([0.]), persistent=False)

    # default to device of unets passed in

    self.to(next(self.unets.parameters()).device)
1201
def force_unconditional_(self):
    """In-place switch: disable text conditioning on this model and on every sub-unet."""
    self.condition_on_text = False
    self.unconditional = True

    for one_unet in self.unets:
        one_unet.cond_on_text = False
1208
@property
def device(self):
    """Device of this module, tracked via the non-persistent `_temp` buffer."""
    return self._temp.device
1211
+
1212
def get_unet(self, unet_number):
    """Return unet #`unet_number` (1-indexed), keeping only that unet on self.device
    and parking all the others on cpu."""
    assert 0 < unet_number <= len(self.unets)
    target = unet_number - 1

    # unwrap the ModuleList into a plain python list, so each unet can live on
    # its own device independently of the parent module
    if isinstance(self.unets, nn.ModuleList):
        plain_list = list(self.unets)
        delattr(self, 'unets')
        self.unets = plain_list

    # only shuffle devices when the requested unet changed
    if target != self.unet_being_trained_index:
        for position, one_unet in enumerate(self.unets):
            one_unet.to(self.device if position == target else 'cpu')

    self.unet_being_trained_index = target
    return self.unets[target]
1228
def reset_unets_all_one_device(self, device=None):
    """Gather all unets back into a ModuleList living on one device (defaults to self.device)."""
    target_device = default(device, self.device)
    self.unets = nn.ModuleList(list(self.unets))
    self.unets.to(target_device)

    # -1 means "no single unet is currently pinned for training"
    self.unet_being_trained_index = -1
1235
@contextmanager
def one_unet_in_gpu(self, unet_number=None, unet=None):
    """Context manager: temporarily move one unet (selected by 1-indexed number or
    by reference) onto self.device, with every unet restored to its previous
    device on exit.

    Exactly one of `unet_number` / `unet` must be given.
    """
    assert exists(unet_number) ^ exists(unet)

    if exists(unet_number):
        unet = self.unets[unet_number - 1]

    # remember where each unet currently lives, then park them all on cpu
    devices = [module_device(unet) for unet in self.unets]
    self.unets.cpu()
    unet.to(self.device)

    yield

    # restore the original device placement
    for unet, device in zip(self.unets, devices):
        unet.to(device)
1251
# overriding state dict functions

def state_dict(self, *args, **kwargs):
    """Gather every unet onto one device first, so the returned state dict is complete and consistent."""
    self.reset_unets_all_one_device()
    return super().state_dict(*args, **kwargs)
1257
def load_state_dict(self, *args, **kwargs):
    """Gather every unet onto one device before delegating, so all parameters are present to load into."""
    self.reset_unets_all_one_device()
    return super().load_state_dict(*args, **kwargs)
1261
# gaussian diffusion methods

def p_mean_variance(
    self,
    unet: JointUnet,
    x,
    log_lbl,
    t,
    *,
    noise_scheduler: GaussianDiffusionContinuousTimes,
    noise_scheduler_lbl: MultinomialDiffusion,
    text_embeds=None,
    text_mask=None,
    cond_images=None,
    lowres_cond_img=None,
    lowres_cond_lbl=None,
    self_cond=None,
    self_cond_lbl=None,
    lowres_noise_times=None,
    cond_scale=1.,
    model_output=None,
    t_next=None,
    pred_objective='noise',
    dynamic_threshold=True,
):
    """One reverse-diffusion model evaluation for the joint (image, label) pair.

    Runs the unet (with classifier-free guidance at `cond_scale`), converts the
    image prediction to an x0 estimate per `pred_objective`, optionally applies
    dynamic thresholding, and computes the Gaussian posterior for the image and
    the categorical posterior sample for the label map.

    Returns ((mean, variance, log_variance), sampled log-onehot label,
    x0 estimate, label x0 estimate) — the last is currently always None.
    `log_lbl` is the label map in log-onehot form.
    """
    assert not (cond_scale != 1. and not self.can_classifier_guidance), 'imagen was not trained with conditional dropout, and thus one cannot use classifier free guidance (cond_scale anything other than 1)'
    lbl = log_onehot_to_index(log_lbl)
    pred, pred_lbl = default(model_output, lambda: unet.forward_with_cond_scale(
        x, lbl, noise_scheduler.get_condition(t),
        text_embeds=text_embeds, text_mask=text_mask,
        cond_images=cond_images, cond_scale=cond_scale,
        lowres_cond_img=lowres_cond_img, lowres_cond_lbl=lowres_cond_lbl,
        self_cond=self_cond, self_cond_lbl=self_cond_lbl,
        lowres_noise_times=self.lowres_noise_schedule.get_condition(lowres_noise_times)))
    # label head: logits -> log-probs -> categorical posterior at time t
    pred_lbl = F.log_softmax(pred_lbl, dim=1)
    pred_lbl = noise_scheduler_lbl.q_posterior(pred_lbl, log_lbl, t)

    if pred_objective == 'noise':
        x_start = noise_scheduler.predict_start_from_noise(x, t=t, noise=pred)
        # lbl_start = noise_scheduler_lbl.predict_start_from_noise(log_lbl, t=t, noise=pred_lbl) # TODO ???
    elif pred_objective == 'x_start':
        x_start = pred
        # lbl_start = pred_lbl
    else:
        raise ValueError(f'unknown objective {pred_objective}')
    # label x0 estimate intentionally unused for now (see TODO above)
    lbl_start = None

    if dynamic_threshold:
        # following pseudocode in appendix
        # s is the dynamic threshold, determined by percentile of absolute values of reconstructed sample per batch element
        s = torch.quantile(
            rearrange(x_start, 'b ... -> b (...)').abs(),
            self.dynamic_thresholding_percentile,
            dim=-1
        )

        s.clamp_(min=1.)
        s = right_pad_dims_to(x_start, s)
        x_start = x_start.clamp(-s, s) / s
    else:
        x_start.clamp_(-1., 1.)

    mean_and_variance = noise_scheduler.q_posterior(x_start=x_start, x_t=x, t=t, t_next=t_next)
    log_lbl = noise_scheduler_lbl.log_sample_categorical(pred_lbl)
    return mean_and_variance, log_lbl, x_start, lbl_start
1327
@torch.no_grad()
def p_sample(
    self,
    unet,
    x,
    log_lbl,
    t,
    *,
    noise_scheduler,
    noise_scheduler_lbl,
    t_next=None,
    text_embeds=None,
    text_mask=None,
    cond_images=None,
    cond_scale=1.,
    self_cond=None,
    self_cond_lbl=None,
    lowres_cond_img=None,
    lowres_cond_lbl=None,
    lowres_noise_times=None,
    pred_objective='noise',
    dynamic_threshold=True,
):
    """Single ancestral sampling step: draw (x_{t-1}, label_{t-1}) given (x_t, label_t).

    The label step is already sampled inside `p_mean_variance`; this function adds
    the Gaussian noise for the image step (suppressed at the final timestep).
    Returns (image sample, log-onehot label sample, x0 estimate, label x0 estimate).
    """
    b, *_, device = *x.shape, x.device
    (model_mean, _, model_log_variance), pred_lbl, x_start, lbl_start = self.p_mean_variance(
        unet, x=x, log_lbl=log_lbl, t=t, t_next=t_next,
        noise_scheduler=noise_scheduler, noise_scheduler_lbl=noise_scheduler_lbl,
        text_embeds=text_embeds, text_mask=text_mask,
        cond_images=cond_images, cond_scale=cond_scale,
        lowres_cond_img=lowres_cond_img, lowres_cond_lbl=lowres_cond_lbl,
        self_cond=self_cond, self_cond_lbl=self_cond_lbl,
        lowres_noise_times=lowres_noise_times,
        pred_objective=pred_objective, dynamic_threshold=dynamic_threshold)
    noise = torch.randn_like(x)
    # no noise when t == 0
    # (continuous-time schedulers signal the last step via t_next == 0, discrete ones via t == 0)
    is_last_sampling_timestep = (t_next == 0) if isinstance(
        noise_scheduler, GaussianDiffusionContinuousTimes) else (t == 0)
    nonzero_mask = (1 - is_last_sampling_timestep.float()).reshape(b, *((1,) * (len(x.shape) - 1)))
    pred = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
    return pred, pred_lbl, x_start, lbl_start
1368
@torch.no_grad()
def p_sample_loop(
    self,
    unet,
    shape,
    *,
    noise_scheduler: GaussianDiffusionContinuousTimes,
    noise_scheduler_lbl: MultinomialDiffusion,
    lowres_cond_img=None,
    lowres_cond_lbl=None,
    lowres_noise_times=None,
    text_embeds=None,
    text_mask=None,
    cond_images=None,
    inpaint_images=None,
    inpaint_labels=None,
    inpaint_masks=None,
    inpaint_resample_times=5,
    init_images=None,
    init_labels=None,
    skip_steps=None,
    cond_scale=1,
    pred_objective='noise',
    dynamic_threshold=True,
    use_tqdm=True
):
    """Full reverse-diffusion loop for one unet of the cascade.

    Starts from Gaussian noise (image) and a uniform categorical sample (label map)
    of the given `shape` = (batch, channels, h, w), then iterates `p_sample` over
    the scheduler's timesteps. Supports RePaint-style inpainting: known regions are
    re-noised to the current timestep and pasted in, with `inpaint_resample_times`
    resampling passes per step. `inpaint_masks` carries two mask channels,
    (image mask, label mask).

    Returns (unnormalized image in [0, 1] range, integer label map).
    """
    assert init_labels is None, 'not implemented yet'
    device = self.device

    batch, _, h, w = shape
    img = torch.randn(shape, device=device)
    # label chain starts from the uniform categorical distribution
    uniform_logits = torch.zeros((batch, self.num_classes) + (h, w), device=device)
    log_lbl = noise_scheduler_lbl.log_sample_categorical(uniform_logits)

    # for initialization with an image or video

    if exists(init_images):
        img += init_images
    # TODO init_labels

    # keep track of x0, for self conditioning

    x_start = None
    lbl_start = None

    # prepare inpainting

    has_inpainting = exists(inpaint_images) and exists(inpaint_labels) and exists(inpaint_masks)
    resample_times = inpaint_resample_times if has_inpainting else 1

    if has_inpainting:
        assert inpaint_masks.shape[1] == 2, \
            f'inpaint mask is a tuple of (mask_image, mask_label) but now:\n{inpaint_labels}'
        inpaint_images = self.normalize_img(inpaint_images)
        inpaint_images = self.resize_to(inpaint_images, shape[-2:])

        log_inpaint_labels = index_to_log_onehot(inpaint_labels.long(), self.num_classes)
        log_inpaint_labels = self.resize_to(log_inpaint_labels, shape[-2:])

        inpaint_masks_image = self.resize_to(inpaint_masks[:, [0]], shape[-2:]).bool()
        inpaint_masks_label = self.resize_to(inpaint_masks[:, [1]], shape[-2:]).bool()

    # time (clamp continuous times strictly below 1)

    timesteps = noise_scheduler.get_sampling_timesteps(batch, device=device)
    timesteps = [t * (t < 1.) + (1 - 1e-7) * (t >= 1.) for t in timesteps]

    # whether to skip any steps

    skip_steps = default(skip_steps, 0)
    timesteps = timesteps[skip_steps:]

    for times, times_next in tqdm(timesteps, desc='sampling loop time step', total=len(timesteps), disable=not use_tqdm):
        is_last_timestep = times_next == 0

        for r in reversed(range(resample_times)):
            is_last_resample_step = r == 0

            if has_inpainting:
                # paste in the known regions, noised to the current timestep
                noised_inpaint_images, _ = noise_scheduler.q_sample(inpaint_images, t=times)
                img = img * ~inpaint_masks_image + noised_inpaint_images * inpaint_masks_image
                log_noised_inpaint_labels = noise_scheduler_lbl.q_sample(log_inpaint_labels, t=times)
                log_lbl = log_lbl * ~inpaint_masks_label + log_noised_inpaint_labels * inpaint_masks_label

            self_cond = x_start if unet.self_cond else None
            self_cond_lbl = lbl_start if unet.self_cond else None

            img, log_lbl, x_start, lbl_start = self.p_sample(
                unet,
                img,
                log_lbl,
                times,
                t_next=times_next,
                text_embeds=text_embeds,
                text_mask=text_mask,
                cond_images=cond_images,
                cond_scale=cond_scale,
                self_cond=self_cond,
                self_cond_lbl=self_cond_lbl,
                lowres_cond_img=lowres_cond_img,
                lowres_cond_lbl=lowres_cond_lbl,
                lowres_noise_times=lowres_noise_times,
                noise_scheduler=noise_scheduler,
                noise_scheduler_lbl=noise_scheduler_lbl,
                pred_objective=pred_objective,
                dynamic_threshold=dynamic_threshold,
            )

            if has_inpainting and not (is_last_resample_step or torch.all(is_last_timestep)):
                # RePaint: jump back from t_next to t for the next resample pass
                renoised_img = noise_scheduler.q_sample_from_to(img, times_next, times)
                img = torch.where(
                    self.right_pad_dims_to_datatype(is_last_timestep),
                    img,
                    renoised_img
                )
                renoised_log_lbl = noise_scheduler_lbl.q_sample_from_to(log_lbl, times_next, times)
                log_lbl = torch.where(
                    self.right_pad_dims_to_datatype(is_last_timestep),
                    log_lbl,
                    renoised_log_lbl
                )

        img.clamp_(-1., 1.)

    # final inpainting

    if has_inpainting:
        img = img * ~inpaint_masks_image + inpaint_images * inpaint_masks_image
        log_lbl = log_lbl * ~inpaint_masks_label + log_inpaint_labels * inpaint_masks_label

    unnormalize_img = self.unnormalize_img(img)
    lbl = log_onehot_to_index(log_lbl)
    return unnormalize_img, lbl
1502
+ @torch.no_grad()
1503
+ @eval_decorator
1504
+ def sample(
1505
+ self,
1506
+ texts: List[str] = None,
1507
+ text_masks=None,
1508
+ text_embeds=None,
1509
+ video_frames=None,
1510
+ cond_images=None,
1511
+ inpaint_images=None,
1512
+ inpaint_labels=None,
1513
+ inpaint_masks=None,
1514
+ inpaint_resample_times=5,
1515
+ init_images=None,
1516
+ init_labels=None,
1517
+ skip_steps=None,
1518
+ batch_size=1,
1519
+ cond_scale=1.,
1520
+ lowres_sample_noise_level=None,
1521
+ start_at_unet_number=1,
1522
+ start_image_or_video=None,
1523
+ start_label_or_video=None,
1524
+ stop_at_unet_number=None,
1525
+ return_all_unet_outputs=False,
1526
+ return_pil_images=False,
1527
+ device=None,
1528
+ use_tqdm=True
1529
+ ):
1530
+ device = default(device, self.device)
1531
+ self.reset_unets_all_one_device(device=device)
1532
+
1533
+ cond_images = maybe(cast_uint8_images_to_float)(cond_images)
1534
+
1535
+ if exists(texts) and not exists(text_embeds) and not self.unconditional:
1536
+ assert all([*map(len, texts)]), 'text cannot be empty'
1537
+
1538
+ with autocast(enabled=False):
1539
+ text_embeds, text_masks = self.encode_text(texts, return_attn_mask=True)
1540
+
1541
+ text_embeds, text_masks = map(lambda t: t.to(device), (text_embeds, text_masks))
1542
+
1543
+ if not self.unconditional:
1544
+ assert exists(text_embeds), \
1545
+ 'text must be passed in if the network was not trained without text `condition_on_text` must be set to `False` when training'
1546
+
1547
+ text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim=-1))
1548
+ batch_size = text_embeds.shape[0]
1549
+
1550
+ if exists(inpaint_images) and exists(inpaint_labels):
1551
+ if self.unconditional:
1552
+ if batch_size == 1: # assume researcher wants to broadcast along inpainted images
1553
+ batch_size = inpaint_images.shape[0]
1554
+
1555
+ assert inpaint_images.shape[0] == batch_size, \
1556
+ 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=<int>)``'
1557
+ assert inpaint_labels.shape[0] == batch_size, \
1558
+ 'number of inpainting images must be equal to the specified batch size on sample `sample(batch_size=<int>)``'
1559
+ assert not (self.condition_on_text and inpaint_images.shape[0] != text_embeds.shape[0]), \
1560
+ 'number of inpainting images must be equal to the number of text to be conditioned on'
1561
+ assert not (self.condition_on_text and inpaint_labels.shape[0] != text_embeds.shape[0]), \
1562
+ 'number of inpainting images must be equal to the number of text to be conditioned on'
1563
+
1564
+ assert not (self.condition_on_text and not exists(text_embeds)), \
1565
+ 'text or text encodings must be passed into imagen if specified'
1566
+ assert not (not self.condition_on_text and exists(text_embeds)), \
1567
+ 'imagen specified not to be conditioned on text, yet it is presented'
1568
+ assert not (exists(text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), \
1569
+ f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'
1570
+
1571
+ assert (not (exists(inpaint_images) or exists(inpaint_labels) or exists(inpaint_masks))) \
1572
+ or (exists(inpaint_images) and exists(inpaint_labels) and exists(inpaint_masks)), \
1573
+ 'inpaint images, labels and masks must be both passed in to do inpainting'
1574
+
1575
+ outputs = []
1576
+
1577
+ is_cuda = next(self.parameters()).is_cuda
1578
+ device = next(self.parameters()).device
1579
+
1580
+ lowres_sample_noise_level = default(lowres_sample_noise_level, self.lowres_sample_noise_level)
1581
+
1582
+ num_unets = len(self.unets)
1583
+
1584
+ # condition scaling
1585
+
1586
+ cond_scale = cast_tuple(cond_scale, num_unets)
1587
+
1588
+ # add frame dimension for video
1589
+
1590
+ assert not (self.is_video and not exists(video_frames)
1591
+ ), 'video_frames must be passed in on sample time if training on video'
1592
+
1593
+ frame_dims = (video_frames,) if self.is_video else tuple()
1594
+
1595
+ # for initial image and skipping steps
1596
+
1597
+ init_images = cast_tuple(init_images, num_unets)
1598
+ init_images = [maybe(self.normalize_img)(init_image) for init_image in init_images]
1599
+ init_labels = cast_tuple(init_labels, num_unets)
1600
+
1601
+ skip_steps = cast_tuple(skip_steps, num_unets)
1602
+
1603
+ # handle starting at a unet greater than 1, for training only-upscaler training
1604
+
1605
+ if start_at_unet_number > 1:
1606
+ assert start_at_unet_number <= num_unets, 'must start a unet that is less than the total number of unets'
1607
+ assert not exists(stop_at_unet_number) or start_at_unet_number <= stop_at_unet_number
1608
+ assert exists(start_image_or_video), 'starting image or video must be supplied if only doing upscaling'
1609
+ assert exists(start_label_or_video), 'starting image or video must be supplied if only doing upscaling'
1610
+
1611
+ prev_image_size = self.image_sizes[start_at_unet_number - 2]
1612
+ img = self.resize_to(start_image_or_video, prev_image_size)
1613
+ lbl = self.resize_to(start_label_or_video, prev_image_size)
1614
+
1615
+ # go through each unet in cascade
1616
+
1617
+ for unet_number, unet, channel, image_size, noise_scheduler, noise_scheduler_lbl, pred_objective, \
1618
+ dynamic_threshold, unet_cond_scale, unet_init_images, unet_init_labels, unet_skip_steps \
1619
+ in tqdm(zip(range(1, num_unets + 1), self.unets, self.sample_channels, self.image_sizes,
1620
+ self.noise_schedulers_sample, self.noise_schedulers_lbl_sample, self.pred_objectives,
1621
+ self.dynamic_thresholding, cond_scale, init_images, init_labels, skip_steps),
1622
+ disable=not use_tqdm):
1623
+
1624
+ if unet_number < start_at_unet_number:
1625
+ continue
1626
+
1627
+ assert not isinstance(unet, NullUnet), 'one cannot sample from null / placeholder unets'
1628
+
1629
+ context = self.one_unet_in_gpu(unet=unet) if is_cuda else nullcontext()
1630
+
1631
+ with context:
1632
+ lowres_cond_img = lowres_cond_lbl = lowres_noise_times = None
1633
+ shape = (batch_size, channel, *frame_dims, *image_size)
1634
+
1635
+ if unet.lowres_cond:
1636
+ lowres_noise_times = self.lowres_noise_schedule.get_times(
1637
+ batch_size, lowres_sample_noise_level, device=device)
1638
+
1639
+ lowres_cond_img = self.resize_to(img, image_size)
1640
+ lowres_cond_lbl = self.resize_to(lbl.float(), image_size)
1641
+
1642
+ lowres_cond_img = self.normalize_img(lowres_cond_img)
1643
+ lowres_cond_img, _ = self.lowres_noise_schedule.q_sample(
1644
+ x_start=lowres_cond_img, t=lowres_noise_times, noise=torch.randn_like(lowres_cond_img))
1645
+ lowres_cond_log_lbl = index_to_log_onehot(lowres_cond_lbl.long(), self.num_classes)
1646
+ lowres_cond_log_lbl_noisy = self.lowres_noise_schedule_lbl.q_sample(
1647
+ lowres_cond_log_lbl, t=lowres_noise_times)
1648
+ lowres_cond_lbl_noisy = log_onehot_to_index(lowres_cond_log_lbl_noisy)
1649
+ lowres_cond_lbl = lowres_cond_lbl_noisy # change just naming
1650
+
1651
+ if exists(unet_init_images) and exists(unet_init_labels):
1652
+ unet_init_images = self.resize_to(unet_init_images, image_size)
1653
+ unet_init_labels = self.resize_to(unet_init_labels, image_size)
1654
+
1655
+ shape = (batch_size, self.channels, *frame_dims, *image_size)
1656
+
1657
+ img, lbl = self.p_sample_loop(
1658
+ unet,
1659
+ shape,
1660
+ text_embeds=text_embeds,
1661
+ text_mask=text_masks,
1662
+ cond_images=cond_images,
1663
+ inpaint_images=inpaint_images,
1664
+ inpaint_labels=inpaint_labels,
1665
+ inpaint_masks=inpaint_masks,
1666
+ inpaint_resample_times=inpaint_resample_times,
1667
+ init_images=unet_init_images,
1668
+ init_labels=unet_init_labels,
1669
+ skip_steps=unet_skip_steps,
1670
+ cond_scale=unet_cond_scale,
1671
+ lowres_cond_img=lowres_cond_img,
1672
+ lowres_cond_lbl=lowres_cond_lbl,
1673
+ lowres_noise_times=lowres_noise_times,
1674
+ noise_scheduler=noise_scheduler,
1675
+ noise_scheduler_lbl=noise_scheduler_lbl,
1676
+ pred_objective=pred_objective,
1677
+ dynamic_threshold=dynamic_threshold,
1678
+ use_tqdm=use_tqdm
1679
+ )
1680
+
1681
+ outputs.append((img.cpu(), lbl.cpu()))
1682
+
1683
+ if exists(stop_at_unet_number) and stop_at_unet_number == unet_number:
1684
+ break
1685
+
1686
+ # either return last unet output or all unet outputs
1687
+ output_index = -1 if not return_all_unet_outputs else slice(None)
1688
+
1689
+ if not return_pil_images:
1690
+ return outputs[output_index]
1691
+
1692
+ if not return_all_unet_outputs:
1693
+ outputs = outputs[-1:]
1694
+
1695
+ assert not self.is_video, 'converting sampled video tensor to video file is not supported yet'
1696
+
1697
+ # TODO lbl pil_images
1698
+ pil_images = list(map(lambda img: list(map(T.ToPILImage(), img.unbind(dim=0))), outputs))
1699
+
1700
+ # now you have a bunch of pillow images you can just .save(/where/ever/you/want.png)
1701
+ return pil_images[output_index]
1702
+
1703
+ def p_losses(
1704
+ self,
1705
+ unet: Union[JointUnet, Unet3D, NullUnet, DistributedDataParallel],
1706
+ x_start,
1707
+ lbl_start,
1708
+ times,
1709
+ *,
1710
+ noise_scheduler: GaussianDiffusionContinuousTimes,
1711
+ noise_scheduler_lbl: MultinomialDiffusion,
1712
+ lowres_cond_img=None,
1713
+ lowres_cond_lbl=None,
1714
+ lowres_aug_times=None,
1715
+ text_embeds=None,
1716
+ text_mask=None,
1717
+ cond_images=None,
1718
+ noise=None,
1719
+ noise_lbl=None,
1720
+ times_next=None,
1721
+ pred_objective='noise',
1722
+ p2_loss_weight_gamma=0.,
1723
+ random_crop_size=None
1724
+ ):
1725
+ is_video = x_start.ndim == 5
1726
+
1727
+ noise = default(noise, lambda: torch.randn_like(x_start))
1728
+ # noise_lbl = default(noise_lbl, lambda: torch.randn_like(x_start)) # TODO
1729
+
1730
+ # normalize to [-1, 1]
1731
+
1732
+ x_start = self.normalize_img(x_start)
1733
+ lowres_cond_img = maybe(self.normalize_img)(lowres_cond_img)
1734
+
1735
+ # random cropping during training
1736
+ # for upsamplers
1737
+
1738
+ if exists(random_crop_size):
1739
+ if is_video:
1740
+ frames = x_start.shape[2]
1741
+ x_start, lowres_cond_img, noise = rearrange_many(
1742
+ (x_start, lowres_cond_img, noise), 'b c f h w -> (b f) c h w')
1743
+
1744
+ aug = K.RandomCrop(random_crop_size, p=1.)
1745
+
1746
+ # make sure low res conditioner and image both get augmented the same way
1747
+ # detailed https://kornia.readthedocs.io/en/latest/augmentation.module.html?highlight=randomcrop#kornia.augmentation.RandomCrop
1748
+ x_start = aug(x_start)
1749
+ lbl_start = aug(lbl_start, params=aug._params)
1750
+ lowres_cond_img = aug(lowres_cond_img, params=aug._params)
1751
+ lowres_cond_lbl = aug(lowres_cond_lbl, params=aug._params)
1752
+ noise = aug(noise, params=aug._params)
1753
+
1754
+ if is_video:
1755
+ x_start, lowres_cond_img, noise = rearrange_many(
1756
+ (x_start, lowres_cond_img, noise), '(b f) c h w -> b c f h w', f=frames)
1757
+
1758
+ # get x_t
1759
+
1760
+ x_noisy, log_snr = noise_scheduler.q_sample(x_start=x_start, t=times, noise=noise)
1761
+ log_lbl_start = index_to_log_onehot(lbl_start.long(), self.num_classes)
1762
+ log_lbl_noisy = noise_scheduler_lbl.q_sample(log_lbl_start, t=times)
1763
+ lbl_noisy = log_onehot_to_index(log_lbl_noisy)
1764
+
1765
+ # also noise the lowres conditioning image
1766
+ # at sample time, they then fix the noise level of 0.1 - 0.3
1767
+
1768
+ lowres_cond_img_noisy = None
1769
+ lowres_cond_lbl_noisy = None
1770
+ if exists(lowres_cond_img) and exists(lowres_cond_lbl):
1771
+ lowres_aug_times = default(lowres_aug_times, times)
1772
+ lowres_cond_img_noisy, _ = self.lowres_noise_schedule.q_sample(
1773
+ x_start=lowres_cond_img, t=lowres_aug_times, noise=torch.randn_like(lowres_cond_img))
1774
+ lowres_cond_log_lbl = index_to_log_onehot(lowres_cond_lbl.long(), self.num_classes)
1775
+ lowres_cond_log_lbl_noisy = self.lowres_noise_schedule_lbl.q_sample(
1776
+ lowres_cond_log_lbl, t=lowres_aug_times)
1777
+ lowres_cond_lbl_noisy = log_onehot_to_index(lowres_cond_log_lbl_noisy)
1778
+
1779
+ # time condition
1780
+
1781
+ noise_cond = noise_scheduler.get_condition(times)
1782
+
1783
+ # unet kwargs
1784
+
1785
+ unet_kwargs = dict(
1786
+ text_embeds=text_embeds,
1787
+ text_mask=text_mask,
1788
+ cond_images=cond_images,
1789
+ lowres_noise_times=self.lowres_noise_schedule.get_condition(lowres_aug_times),
1790
+ lowres_cond_img=lowres_cond_img_noisy,
1791
+ lowres_cond_lbl=lowres_cond_lbl_noisy,
1792
+ cond_drop_prob=self.cond_drop_prob,
1793
+ )
1794
+
1795
+ # self condition if needed
1796
+
1797
+ # Because 'unet' can be an instance of DistributedDataParallel coming from the
1798
+ # ImagenTrainer.unet_being_trained when invoking ImagenTrainer.forward(), we need to
1799
+ # access the member 'module' of the wrapped unet instance.
1800
+ self_cond = unet.module.self_cond if isinstance(unet, DistributedDataParallel) else unet.self_cond
1801
+
1802
+ if self_cond and random() < 0.5:
1803
+ with torch.no_grad():
1804
+ pred, pred_lbl = unet.forward(
1805
+ x_noisy,
1806
+ lbl_noisy,
1807
+ noise_cond,
1808
+ **unet_kwargs
1809
+ ).detach()
1810
+ pred_lbl = F.log_softmax(pred_lbl, dim=1)
1811
+ pred_lbl = noise_scheduler_lbl.q_posterior(pred_lbl, log_lbl_noisy, times)
1812
+
1813
+ x_start = noise_scheduler.predict_start_from_noise(
1814
+ x_noisy, t=times, noise=pred) if pred_objective == 'noise' else pred
1815
+ # lbl_start = noise_scheduler_lbl.predict_start_from_noise(
1816
+ # lbl_noisy, t=times, noise=pred_lbl) if pred_objective == 'noise' else pred_lbl # TODO ???
1817
+ lbl_start = None
1818
+
1819
+ unet_kwargs = {**unet_kwargs, 'self_cond': x_start, 'self_cond_lbl': lbl_start}
1820
+
1821
+ # get prediction
1822
+
1823
+ pred, pred_lbl = unet.forward(
1824
+ x_noisy,
1825
+ lbl_noisy,
1826
+ noise_cond,
1827
+ **unet_kwargs
1828
+ )
1829
+ pred_lbl = F.log_softmax(pred_lbl, dim=1)
1830
+ pred_lbl_post = noise_scheduler_lbl.q_posterior(pred_lbl, log_lbl_noisy, times)
1831
+
1832
+ # prediction objective
1833
+
1834
+ if pred_objective == 'noise':
1835
+ target = noise
1836
+ elif pred_objective == 'x_start':
1837
+ target = x_start
1838
+ else:
1839
+ raise ValueError(f'unknown objective {pred_objective}')
1840
+ target_log_lbl = noise_scheduler_lbl.q_posterior(log_lbl_start, log_lbl_noisy, times)
1841
+
1842
+ # losses
1843
+
1844
+ losses = self.loss_fn(pred, target, reduction='none')
1845
+ losses = reduce(losses, 'b ... -> b', 'mean')
1846
+ losses_lbl = noise_scheduler_lbl.loss_fn(target_log_lbl, pred_lbl_post, times, log_lbl_start)
1847
+
1848
+ # p2 loss reweighting
1849
+
1850
+ if p2_loss_weight_gamma > 0:
1851
+ loss_weight = (self.p2_loss_weight_k + log_snr.exp()) ** -p2_loss_weight_gamma
1852
+ losses = losses * loss_weight
1853
+ losses_lbl = losses_lbl * loss_weight
1854
+
1855
+ return losses.mean(), losses_lbl.mean()
1856
+
1857
+ def forward(
1858
+ self,
1859
+ images,
1860
+ labels,
1861
+ unet: Union[JointUnet, Unet3D, NullUnet, DistributedDataParallel] = None,
1862
+ texts: List[str] = None,
1863
+ text_embeds=None,
1864
+ text_masks=None,
1865
+ unet_number=None,
1866
+ cond_images=None
1867
+ ):
1868
+ # assert images.shape[-1] == images.shape[-2], \
1869
+ # f'the images you pass in must be a square, but received dimensions of {images.shape[2]}, {images.shape[-1]}'
1870
+ assert not (len(self.unets) > 1 and not exists(unet_number)), \
1871
+ f'you must specify which unet you want trained, from a range of 1 to {len(self.unets)}, if you are training cascading DDPM (multiple unets)'
1872
+ unet_number = default(unet_number, 1)
1873
+ assert not exists(self.only_train_unet_number) or self.only_train_unet_number == unet_number, \
1874
+ 'you can only train on unet #{self.only_train_unet_number}'
1875
+
1876
+ images = cast_uint8_images_to_float(images)
1877
+ cond_images = maybe(cast_uint8_images_to_float)(cond_images)
1878
+
1879
+ assert is_float_dtype(images.dtype), f'images tensor needs to be floats but {images.dtype} dtype found instead'
1880
+
1881
+ unet_index = unet_number - 1
1882
+
1883
+ unet = default(unet, lambda: self.get_unet(unet_number))
1884
+
1885
+ assert not isinstance(unet, NullUnet), 'null unet cannot and should not be trained'
1886
+
1887
+ noise_scheduler = self.noise_schedulers[unet_index]
1888
+ noise_scheduler_lbl = self.noise_schedulers_lbl[unet_index]
1889
+ p2_loss_weight_gamma = self.p2_loss_weight_gamma[unet_index]
1890
+ pred_objective = self.pred_objectives[unet_index]
1891
+ target_image_size = self.image_sizes[unet_index]
1892
+ random_crop_size = self.random_crop_sizes[unet_index]
1893
+ prev_image_size = self.image_sizes[unet_index - 1] if unet_index > 0 else None
1894
+
1895
+ b, c, *_, h, w, device, is_video = *images.shape, images.device, images.ndim == 5
1896
+
1897
+ check_shape(images, 'b c ...', c=self.channels)
1898
+ assert h >= target_image_size[0] and w >= target_image_size[1]
1899
+
1900
+ frames = images.shape[2] if is_video else None
1901
+
1902
+ times = noise_scheduler.sample_random_times(b, device=device)
1903
+
1904
+ if exists(texts) and not exists(text_embeds) and not self.unconditional:
1905
+ assert all([*map(len, texts)]), 'text cannot be empty'
1906
+ assert len(texts) == len(images), 'number of text captions does not match up with the number of images given'
1907
+
1908
+ with autocast(enabled=False):
1909
+ text_embeds, text_masks = self.encode_text(texts, return_attn_mask=True)
1910
+
1911
+ text_embeds, text_masks = map(lambda t: t.to(images.device), (text_embeds, text_masks))
1912
+
1913
+ if not self.unconditional:
1914
+ text_masks = default(text_masks, lambda: torch.any(text_embeds != 0., dim=-1))
1915
+
1916
+ assert not (self.condition_on_text and not exists(text_embeds)
1917
+ ), 'text or text encodings must be passed into decoder if specified'
1918
+ assert not (not self.condition_on_text and exists(text_embeds)
1919
+ ), 'decoder specified not to be conditioned on text, yet it is presented'
1920
+
1921
+ assert not (exists(
1922
+ text_embeds) and text_embeds.shape[-1] != self.text_embed_dim), f'invalid text embedding dimension being passed in (should be {self.text_embed_dim})'
1923
+
1924
+ lowres_cond_img = lowres_cond_lbl = lowres_aug_times = None
1925
+ if exists(prev_image_size):
1926
+ lowres_cond_img = self.resize_to(images, prev_image_size, clamp_range=self.input_image_range)
1927
+ lowres_cond_img = self.resize_to(lowres_cond_img, target_image_size, clamp_range=self.input_image_range)
1928
+ lowres_cond_lbl = self.resize_to(labels, prev_image_size, clamp_range=None)
1929
+ lowres_cond_lbl = self.resize_to(lowres_cond_lbl, target_image_size, clamp_range=None)
1930
+
1931
+ if self.per_sample_random_aug_noise_level:
1932
+ lowres_aug_times = self.lowres_noise_schedule.sample_random_times(
1933
+ b, self.lowres_max_thres, device=device)
1934
+ else:
1935
+ lowres_aug_time = self.lowres_noise_schedule.sample_random_times(
1936
+ 1, self.lowres_max_thres, device=device)
1937
+ lowres_aug_times = repeat(lowres_aug_time, '1 -> b', b=b)
1938
+
1939
+ images = self.resize_to(images, target_image_size)
1940
+ labels = self.resize_to(labels, target_image_size)
1941
+
1942
+ return self.p_losses(unet, images, labels, times, text_embeds=text_embeds, text_mask=text_masks, cond_images=cond_images, noise_scheduler=noise_scheduler, noise_scheduler_lbl=noise_scheduler_lbl, lowres_cond_img=lowres_cond_img, lowres_cond_lbl=lowres_cond_lbl, lowres_aug_times=lowres_aug_times, pred_objective=pred_objective, p2_loss_weight_gamma=p2_loss_weight_gamma, random_crop_size=random_crop_size)
imagen_pytorch/t5.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import transformers
3
+ from typing import List
4
+ from transformers import T5Tokenizer, T5EncoderModel, T5Config
5
+ from einops import rearrange
6
+
7
+ transformers.logging.set_verbosity_error()
8
+
9
def exists(val):
    """Predicate: has `val` been supplied (i.e. is it non-None)?"""
    return not (val is None)
11
+
12
def default(val, d):
    """Return `val` when present, otherwise the fallback `d` (called if callable)."""
    if val is not None:
        return val
    return d() if callable(d) else d
16
+
17
+ # config
18
+
19
+ MAX_LENGTH = 256
20
+
21
+ DEFAULT_T5_NAME = 'google/t5-v1_1-base'
22
+
23
+ T5_CONFIGS = {}
24
+
25
+ # singleton globals
26
+
27
def get_tokenizer(name):
    """Load the T5 tokenizer for `name`, caching weights under ./checkpoints."""
    return T5Tokenizer.from_pretrained(name, model_max_length=MAX_LENGTH, cache_dir="checkpoints")
30
+
31
def get_model(name):
    """Load the T5 encoder-only model for `name`, caching weights under ./checkpoints."""
    return T5EncoderModel.from_pretrained(name, cache_dir="checkpoints")
34
+
35
def get_model_and_tokenizer(name):
    """Return (model, tokenizer) for `name`, memoized in the module-level T5_CONFIGS."""
    global T5_CONFIGS

    entry = T5_CONFIGS.setdefault(name, dict())
    if "model" not in entry:
        entry["model"] = get_model(name)
    if "tokenizer" not in entry:
        entry["tokenizer"] = get_tokenizer(name)

    return entry['model'], entry['tokenizer']
46
+
47
def get_encoded_dim(name):
    """Return the encoder hidden size (d_model) for `name` without loading weights if possible."""
    entry = T5_CONFIGS.get(name)
    if entry is None:
        # only the config is needed for the dimension; skip loading the model
        config = T5Config.from_pretrained(name, cache_dir="checkpoints")
        T5_CONFIGS[name] = dict(config=config)
    elif "config" in entry:
        config = entry["config"]
    elif "model" in entry:
        config = entry["model"].config
    else:
        assert False
    return config.d_model
59
+
60
+ # encoding text
61
+
62
def t5_tokenize(
    texts: List[str],
    name = DEFAULT_T5_NAME
):
    """Tokenize `texts`; returns (input_ids, attention_mask) on the model's device.

    Moves the encoder to CUDA first when available, so subsequent encoding runs there.
    """
    t5, tokenizer = get_model_and_tokenizer(name)

    if torch.cuda.is_available():
        t5 = t5.cuda()

    device = next(t5.parameters()).device

    encoded = tokenizer.batch_encode_plus(
        texts,
        return_tensors = "pt",
        padding = 'longest',
        max_length = MAX_LENGTH,
        truncation = True
    )

    return encoded.input_ids.to(device), encoded.attention_mask.to(device)
84
+
85
def t5_encode_tokenized_text(
    token_ids,
    attn_mask = None,
    pad_id = None,
    name = DEFAULT_T5_NAME
):
    """Run the frozen T5 encoder over `token_ids`; padded positions are zeroed.

    Either `attn_mask` must be given, or `pad_id` so a mask can be derived.
    """
    assert exists(attn_mask) or exists(pad_id)
    t5, _ = get_model_and_tokenizer(name)

    if attn_mask is None:
        attn_mask = (token_ids != pad_id).long()

    t5.eval()

    with torch.no_grad():
        output = t5(input_ids = token_ids, attention_mask = attn_mask)
        encoded_text = output.last_hidden_state.detach()

    attn_mask = attn_mask.bool()

    # force all embeddings at padding positions to exactly 0.
    return encoded_text.masked_fill(~rearrange(attn_mask, '... -> ... 1'), 0.)
106
+
107
def t5_encode_text(
    texts: List[str],
    name = DEFAULT_T5_NAME,
    return_attn_mask = False
):
    """Tokenize and T5-encode `texts`; optionally also return the boolean attention mask."""
    token_ids, attn_mask = t5_tokenize(texts, name = name)
    encoded_text = t5_encode_tokenized_text(token_ids, attn_mask = attn_mask, name = name)

    if not return_attn_mask:
        return encoded_text

    return encoded_text, attn_mask.bool()
imagen_pytorch/trainer.py ADDED
@@ -0,0 +1,1782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+ import os
3
+ import time
4
+ import copy
5
+ from pathlib import Path
6
+ from math import ceil
7
+ from contextlib import contextmanager, nullcontext
8
+ from functools import partial, wraps
9
+ from collections.abc import Iterable
10
+
11
+ import torch
12
+ from torch import nn
13
+ import torch.nn.functional as F
14
+ from torch.utils.data import random_split, DataLoader
15
+ from torch.optim import Adam
16
+ from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
17
+ from torch.cuda.amp import autocast, GradScaler
18
+
19
+ import pytorch_warmup as warmup
20
+
21
+ from imagen_pytorch.imagen_pytorch import Imagen, NullUnet
22
+ from imagen_pytorch.elucidated_imagen import ElucidatedImagen
23
+ from imagen_pytorch.joint_imagen import JointImagen
24
+ from imagen_pytorch.data import cycle
25
+
26
+ from imagen_pytorch.version import __version__
27
+ from packaging import version
28
+
29
+ import numpy as np
30
+
31
+ from ema_pytorch import EMA
32
+
33
+ from accelerate import Accelerator, DistributedType, DistributedDataParallelKwargs
34
+
35
+ from fsspec.core import url_to_fs
36
+ from fsspec.implementations.local import LocalFileSystem
37
+
38
+ # helper functions
39
+
40
+
41
def exists(val):
    """Predicate: has `val` been supplied (i.e. is it non-None)?"""
    return not (val is None)
43
+
44
+
45
def default(val, d):
    """Return `val` when present, otherwise the fallback `d` (called if callable)."""
    if val is not None:
        return val
    return d() if callable(d) else d
49
+
50
+
51
def cast_tuple(val, length=1):
    """Coerce `val` to a tuple: lists convert, tuples pass through, scalars repeat `length` times."""
    if isinstance(val, list):
        return tuple(val)
    if isinstance(val, tuple):
        return val
    return (val,) * length
56
+
57
+
58
def find_first(fn, arr):
    """Return the index of the first element of `arr` satisfying `fn`, or -1.

    NOTE(review): this helper is redefined later in this module with different
    semantics (returning the matching element instead of its index); the later
    definition shadows this one at import time.
    """
    for ind, el in enumerate(arr):
        if fn(el):
            return ind
    return -1
63
+
64
+
65
def pick_and_pop(keys, d):
    """Remove `keys` from dict `d` (mutating it) and return them as a new dict."""
    return {key: d.pop(key) for key in keys}
68
+
69
+
70
def group_dict_by_key(cond, d):
    """Split `d` into a (matching, non-matching) pair of dicts according to `cond(key)`."""
    matched, unmatched = dict(), dict()
    for key, value in d.items():
        target = matched if cond(key) else unmatched
        target[key] = value
    return matched, unmatched
77
+
78
+
79
def string_begins_with(prefix, str):
    """Return True when `str` starts with `prefix`.

    NOTE(review): the parameter name shadows the builtin `str`; kept as-is
    since callers may pass it by keyword.
    """
    return str.startswith(prefix)
81
+
82
+
83
def group_by_key_prefix(prefix, d):
    """Partition `d` into (keys starting with `prefix`, the rest)."""
    return group_dict_by_key(partial(string_begins_with, prefix), d)
85
+
86
+
87
def groupby_prefix_and_trim(prefix, d):
    """Split off `prefix`-keyed entries of `d`, stripping the prefix from their keys.

    Returns (trimmed_prefixed_entries, remaining_entries).
    """
    with_prefix, without_prefix = group_dict_by_key(partial(string_begins_with, prefix), d)
    trimmed = {key[len(prefix):]: value for key, value in with_prefix.items()}
    return trimmed, without_prefix
91
+
92
+
93
def num_to_groups(num, divisor):
    """Split `num` into chunks of size `divisor` plus one smaller remainder chunk."""
    full, remainder = divmod(num, divisor)
    groups = [divisor] * full
    if remainder:
        groups.append(remainder)
    return groups
100
+
101
+ # url to fs, bucket, path - for checkpointing to cloud
102
+
103
+
104
def url_to_bucket(url):
    """Extract the bucket name from a gs:// or s3:// url; plain paths pass through unchanged."""
    if '://' not in url:
        return url

    scheme, rest = url.split('://')

    if scheme not in {'gs', 's3'}:
        raise ValueError(f'storage type prefix "{scheme}" is not supported yet')
    return rest.split('/')[0]
114
+
115
+ # decorators
116
+
117
+
118
def eval_decorator(fn):
    """Decorator: run `fn` with the model in eval mode, then restore the prior training flag."""
    def wrapper(model, *args, **kwargs):
        prev_mode = model.training
        model.eval()
        result = fn(model, *args, **kwargs)
        model.train(prev_mode)
        return result
    return wrapper
126
+
127
+
128
def cast_torch_tensor(fn, cast_fp16=False):
    """Decorator: normalize all array-like args of a model method to torch tensors.

    - numpy arrays are converted to tensors
    - tensors are moved to `_device` (default: model.device) unless `_cast_device=False`
    - with cast_fp16=True and model.cast_half_at_training, non-bool tensors are cast to half
    The magic kwargs `_device` and `_cast_device` are consumed and not forwarded to `fn`.
    """
    @wraps(fn)
    def inner(model, *args, **kwargs):
        device = kwargs.pop('_device', model.device)
        cast_device = kwargs.pop('_cast_device', True)

        should_cast_fp16 = cast_fp16 and model.cast_half_at_training

        # flatten positional args and kwarg values so they can be transformed uniformly
        kwargs_keys = kwargs.keys()
        all_args = (*args, *kwargs.values())
        split_kwargs_index = len(all_args) - len(kwargs_keys)
        all_args = tuple(map(lambda t: torch.from_numpy(t) if exists(t) and isinstance(t, np.ndarray) else t, all_args))

        if cast_device:
            all_args = tuple(map(lambda t: t.to(device) if exists(t) and isinstance(t, torch.Tensor) else t, all_args))

        if should_cast_fp16:
            # bool tensors (e.g. attention masks) must keep their dtype
            all_args = tuple(map(lambda t: t.half() if exists(t) and isinstance(
                t, torch.Tensor) and t.dtype != torch.bool else t, all_args))

        # re-split the flattened tuple back into positional args and kwargs
        args, kwargs_values = all_args[:split_kwargs_index], all_args[split_kwargs_index:]
        kwargs = dict(tuple(zip(kwargs_keys, kwargs_values)))

        out = fn(model, *args, **kwargs)
        return out
    return inner
154
+
155
+ # gradient accumulation functions
156
+
157
+
158
def split_iterable(it, split_size):
    """Chunk a sliceable sequence into pieces of at most `split_size` elements."""
    return [it[start:start + split_size] for start in range(0, len(it), split_size)]
164
+
165
+
166
def split(t, split_size=None):
    """Split `t` into chunks of `split_size`.

    Tensors are split along dim 0 via torch.split; other iterables are chunked
    by slicing. When `split_size` is None, `t` is returned unchanged.

    Raises:
        TypeError: if `t` is neither a tensor nor an iterable.
    """
    if not exists(split_size):
        return t

    if isinstance(t, torch.Tensor):
        return t.split(split_size, dim=0)

    if isinstance(t, Iterable):
        return split_iterable(t, split_size)

    # BUG FIX: previously `return TypeError` handed back the exception class
    # instead of signalling the error — raise it properly
    raise TypeError(f'cannot split object of type {type(t).__name__}')
177
+
178
+
179
def find_first(cond, arr):
    """Return the first element of `arr` for which `cond` holds, else None.

    NOTE(review): shadows an earlier index-returning `find_first` in this module.
    """
    matches = (el for el in arr if cond(el))
    return next(matches, None)
184
+
185
+
186
def split_args_and_kwargs(*args, split_size=None, **kwargs):
    """Yield (chunk_fraction, (args_chunk, kwargs_chunk)) pairs.

    Splits every tensor / iterable argument along the batch dimension into
    chunks of `split_size`; non-splittable arguments (and None) are repeated
    unchanged for every chunk. The batch size is taken from the first tensor
    argument, which must exist. `chunk_fraction` is chunk_size / batch_size,
    useful for re-weighting accumulated losses.
    """
    all_args = (*args, *kwargs.values())
    len_all_args = len(all_args)
    first_tensor = find_first(lambda t: isinstance(t, torch.Tensor), all_args)
    assert exists(first_tensor)

    batch_size = len(first_tensor)
    split_size = default(split_size, batch_size)
    num_chunks = ceil(batch_size / split_size)

    dict_len = len(kwargs)
    dict_keys = kwargs.keys()
    split_kwargs_index = len_all_args - dict_len

    # split each splittable argument; broadcast everything else across chunks
    split_all_args = [split(arg, split_size=split_size) if exists(arg) and isinstance(
        arg, (torch.Tensor, Iterable)) else ((arg,) * num_chunks) for arg in all_args]
    chunk_sizes = tuple(map(len, split_all_args[0]))

    for (chunk_size, *chunked_all_args) in tuple(zip(chunk_sizes, *split_all_args)):
        # re-split the flattened chunk back into positional args and kwargs
        chunked_args, chunked_kwargs_values = chunked_all_args[:
                                                              split_kwargs_index], chunked_all_args[split_kwargs_index:]
        chunked_kwargs = dict(tuple(zip(dict_keys, chunked_kwargs_values)))
        chunk_size_frac = chunk_size / batch_size
        yield chunk_size_frac, (chunked_args, chunked_kwargs)
210
+
211
+ # imagen trainer
212
+
213
+
214
def imagen_sample_in_chunks(fn):
    """Decorator: when `max_batch_size` is passed, run `fn` in sub-batches and
    concatenate the results along dim 0.

    Unconditional models are chunked by adjusting the `batch_size` kwarg;
    conditional models chunk every tensor/iterable argument via
    split_args_and_kwargs.
    """
    @wraps(fn)
    def inner(self, *args, max_batch_size=None, **kwargs):
        if not exists(max_batch_size):
            return fn(self, *args, **kwargs)

        if self.imagen.unconditional:
            batch_size = kwargs.get('batch_size')
            batch_sizes = num_to_groups(batch_size, max_batch_size)
            outputs = [fn(self, *args, **{**kwargs, 'batch_size': sub_batch_size}) for sub_batch_size in batch_sizes]
        else:
            outputs = [fn(self, *chunked_args, **chunked_kwargs) for _, (chunked_args, chunked_kwargs)
                       in split_args_and_kwargs(*args, split_size=max_batch_size, **kwargs)]

        if isinstance(outputs[0], torch.Tensor):
            return torch.cat(outputs, dim=0)

        # multiple outputs per call: concatenate each output position separately
        return list(map(lambda t: torch.cat(t, dim=0), list(zip(*outputs))))

    return inner
234
+
235
+
236
def restore_parts(state_dict_target, state_dict_from):
    """Copy every same-named, same-shaped parameter of `state_dict_from` into
    `state_dict_target` (in place).

    Missing names are skipped silently; shape mismatches are skipped with a
    printed report. Returns the (mutated) target state dict.
    """
    for name, param in state_dict_from.items():

        if name not in state_dict_target:
            continue

        if param.size() == state_dict_target[name].size():
            state_dict_target[name].copy_(param)
        else:
            # BUG FIX: the message previously had an unbalanced parenthesis
            print(f"layer {name} ({param.size()}) different than target: {state_dict_target[name].size()}")

    return state_dict_target
248
+
249
+
250
def load_unet_from_trainer(trainer, checkpoint_path, src_unet_idx, tgt_unet_idx, only_model=True):
    """Load the weights of unet `src_unet_idx` (model + EMA) from a trainer
    checkpoint file into unet `tgt_unet_idx` of `trainer`.

    Only the model/EMA weights are restored; optimizer and scheduler state
    restoration is not implemented (only_model must be True).
    """
    assert only_model == True  # TODO optimizer, scheduler ...
    ckpt = torch.load(checkpoint_path, map_location='cpu')

    # BUG FIX: match on the full dotted prefix and strip by its length —
    # the old startswith(f'unets.{i}') also captured e.g. unets.10 when i == 1,
    # and the hard-coded key[8:] / key[2:] slices broke for indices >= 10
    prefix = f'unets.{src_unet_idx}.'
    state_dict = OrderedDict(
        (key[len(prefix):], val) for key, val in ckpt['model'].items() if key.startswith(prefix)
    )
    trainer.imagen.unets[tgt_unet_idx].load_state_dict(state_dict)

    ema_prefix = f'{src_unet_idx}.'
    state_dict = OrderedDict(
        (key[len(ema_prefix):], val) for key, val in ckpt['ema'].items() if key.startswith(ema_prefix)
    )
    trainer.ema_unets[tgt_unet_idx].load_state_dict(state_dict)
    return
264
+
265
+
266
class ImagenTrainer(nn.Module):
    """Training harness for a (cascaded) Imagen model.

    Wraps an ``Imagen`` / ``ElucidatedImagen`` instance and manages, per unet:
    an Adam optimizer, a GradScaler, an optional warmup + cosine-decay
    scheduler, and an optional EMA copy. Distributed / mixed-precision
    execution is delegated to HuggingFace ``Accelerator``. Only one unet may
    be trained at a time, and only one trainer may exist per process when
    running distributed (enforced via the class-level ``locked`` flag).
    Checkpoints are written through fsspec as ``checkpoint.<total_steps>.pt``.
    """

    # class-level guard: set to True once a distributed trainer is constructed,
    # preventing a second trainer in the same process
    locked = False

    def __init__(
        self,
        imagen=None,
        imagen_checkpoint_path=None,
        use_ema=True,
        lr=1e-4,
        eps=1e-8,
        beta1=0.9,
        beta2=0.99,
        max_grad_norm=None,
        group_wd_params=True,
        warmup_steps=None,
        cosine_decay_max_steps=None,
        only_train_unet_number=None,
        fp16=False,
        precision=None,
        split_batches=True,
        dl_tuple_output_keywords_names=('images', 'texts'),
        verbose=True,
        split_valid_fraction=0.025,
        split_valid_from_train=False,
        split_random_seed=42,
        checkpoint_path=None,
        checkpoint_every=None,
        checkpoint_fs=None,
        fs_kwargs: dict = None,
        max_checkpoints_keep=20,
        **kwargs
    ):
        """Build optimizers / scalers / schedulers (and EMA copies) for every unet.

        ``lr``, ``eps``, ``warmup_steps`` and ``cosine_decay_max_steps`` may each be a
        scalar or a tuple with one entry per unet. Remaining ``kwargs`` are split by
        prefix: ``ema_*`` goes to the EMA wrapper, ``accelerate_*`` to ``Accelerator``,
        and whatever is left is forwarded to the Adam constructor.
        """
        super().__init__()
        assert not ImagenTrainer.locked, 'ImagenTrainer can only be initialized once per process - for the sake of distributed training, you will now have to create a separate script to train each unet (or a script that accepts unet number as an argument)'
        assert exists(imagen) ^ exists(
            imagen_checkpoint_path), 'either imagen instance is passed into the trainer, or a checkpoint path that contains the imagen config'

        # determine filesystem, using fsspec, for saving to local filesystem or cloud

        self.fs = checkpoint_fs

        if not exists(self.fs):
            fs_kwargs = default(fs_kwargs, {})
            self.fs, _ = url_to_fs(default(checkpoint_path, './'), **fs_kwargs)

        # NOTE(review): this isinstance check requires a concrete imagen instance,
        # so the imagen_checkpoint_path-only branch allowed by the XOR assert above
        # can never get past this line — confirm whether that path is still intended.
        assert isinstance(imagen, (Imagen, ElucidatedImagen))
        ema_kwargs, kwargs = groupby_prefix_and_trim('ema_', kwargs)

        # elucidated or not

        self.is_elucidated = isinstance(imagen, ElucidatedImagen)

        # create accelerator instance

        accelerate_kwargs, kwargs = groupby_prefix_and_trim('accelerate_', kwargs)

        assert not (fp16 and exists(precision)
                    ), 'either set fp16 = True or forward the precision ("fp16", "bf16") to Accelerator'
        accelerator_mixed_precision = default(precision, 'fp16' if fp16 else 'no')

        self.accelerator = Accelerator(**{
            'split_batches': split_batches,
            'mixed_precision': accelerator_mixed_precision,
            'kwargs_handlers': [DistributedDataParallelKwargs(find_unused_parameters=True)], **accelerate_kwargs})

        # lock out further trainers in this process only when actually distributed
        ImagenTrainer.locked = self.is_distributed

        # cast data to fp16 at training time if needed

        self.cast_half_at_training = accelerator_mixed_precision == 'fp16'

        # grad scaler must be managed outside of accelerator

        grad_scaler_enabled = fp16

        # imagen, unets and ema unets

        self.imagen = imagen
        self.num_unets = len(self.imagen.unets)

        # EMA copies are only kept on the main process
        self.use_ema = use_ema and self.is_main
        self.ema_unets = nn.ModuleList([])

        # keep track of what unet is being trained on
        # only going to allow 1 unet training at a time

        self.ema_unet_being_trained_index = -1  # keeps track of which ema unet is being trained on

        # data related functions

        self.train_dl_iter = None
        self.train_dl = None

        self.valid_dl_iter = None
        self.valid_dl = None

        self.dl_tuple_output_keywords_names = dl_tuple_output_keywords_names

        # auto splitting validation from training, if dataset is passed in

        self.split_valid_from_train = split_valid_from_train

        assert 0 <= split_valid_fraction <= 1, 'split valid fraction must be between 0 and 1'
        self.split_valid_fraction = split_valid_fraction
        self.split_random_seed = split_random_seed

        # be able to finely customize learning rate, weight decay
        # per unet

        lr, eps, warmup_steps, cosine_decay_max_steps = map(
            partial(cast_tuple, length=self.num_unets), (lr, eps, warmup_steps, cosine_decay_max_steps))

        for ind, (unet, unet_lr, unet_eps, unet_warmup_steps, unet_cosine_decay_max_steps) in enumerate(zip(self.imagen.unets, lr, eps, warmup_steps, cosine_decay_max_steps)):
            optimizer = Adam(
                unet.parameters(),
                lr=unet_lr,
                eps=unet_eps,
                betas=(beta1, beta2),
                **kwargs
            )

            if self.use_ema:
                self.ema_unets.append(EMA(unet, **ema_kwargs))

            scaler = GradScaler(enabled=grad_scaler_enabled)

            scheduler = warmup_scheduler = None

            if exists(unet_cosine_decay_max_steps):
                scheduler = CosineAnnealingLR(optimizer, T_max=unet_cosine_decay_max_steps)

            if exists(unet_warmup_steps):
                warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=unet_warmup_steps)

            # fall back to a constant-lr scheduler so later code can assume one exists
            if not exists(scheduler):
                scheduler = LambdaLR(optimizer, lr_lambda=lambda step: 1.0)

            # set on object

            setattr(self, f'optim{ind}', optimizer)  # cannot use pytorch ModuleList for some reason with optimizers
            setattr(self, f'scaler{ind}', scaler)
            setattr(self, f'scheduler{ind}', scheduler)
            setattr(self, f'warmup{ind}', warmup_scheduler)

        # gradient clipping if needed

        self.max_grad_norm = max_grad_norm

        # step tracker and misc

        # one step counter per unet, persisted with the module state
        self.register_buffer('steps', torch.tensor([0] * self.num_unets))

        self.verbose = verbose

        # automatic set devices based on what accelerator decided

        self.imagen.to(self.device)
        self.to(self.device)

        # checkpointing

        # checkpoint_path and checkpoint_every must be given together (or not at all)
        assert not (exists(checkpoint_path) ^ exists(checkpoint_every))
        self.checkpoint_path = checkpoint_path
        self.checkpoint_every = checkpoint_every
        self.max_checkpoints_keep = max_checkpoints_keep

        self.can_checkpoint = self.is_local_main if isinstance(checkpoint_fs, LocalFileSystem) else self.is_main

        if exists(checkpoint_path) and self.can_checkpoint:
            bucket = url_to_bucket(checkpoint_path)

            if not self.fs.exists(bucket):
                self.fs.mkdir(bucket)

            # auto-resume from the newest checkpoint in the folder, if any
            self.load_from_checkpoint_folder()

        # only allowing training for unet

        self.only_train_unet_number = only_train_unet_number
        self.validate_and_set_unet_being_trained(only_train_unet_number)

    # computed values

    @property
    def device(self):
        """Device chosen by the accelerator."""
        return self.accelerator.device

    @property
    def is_distributed(self):
        """True when running with more than one process or a distributed backend."""
        return not (self.accelerator.distributed_type == DistributedType.NO and self.accelerator.num_processes == 1)

    @property
    def is_main(self):
        """True only on the global main process."""
        return self.accelerator.is_main_process

    @property
    def is_local_main(self):
        """True only on the main process of this node."""
        return self.accelerator.is_local_main_process

    @property
    def unwrapped_unet(self):
        """The currently-trained unet with any DDP/accelerate wrappers removed."""
        return self.accelerator.unwrap_model(self.unet_being_trained)

    # optimizer helper functions

    def get_lr(self, unet_number):
        """Return the current learning rate of the optimizer for 1-indexed ``unet_number``."""
        self.validate_unet_number(unet_number)
        unet_index = unet_number - 1

        optim = getattr(self, f'optim{unet_index}')

        return optim.param_groups[0]['lr']

    # function for allowing only one unet from being trained at a time

    def validate_and_set_unet_being_trained(self, unet_number=None):
        """Pin training to a single unet; once set it cannot be changed in this process."""
        if exists(unet_number):
            self.validate_unet_number(unet_number)

        assert not exists(self.only_train_unet_number) or self.only_train_unet_number == unet_number, 'you cannot only train on one unet at a time. you will need to save the trainer into a checkpoint, and resume training on a new unet'

        self.only_train_unet_number = unet_number
        self.imagen.only_train_unet_number = unet_number

        if not exists(unet_number):
            return

        self.wrap_unet(unet_number)

    def wrap_unet(self, unet_number):
        """Prepare the chosen unet, its optimizer and scheduler with the accelerator (idempotent)."""
        # guard: only one unet may ever be wrapped per trainer
        if hasattr(self, 'one_unet_wrapped'):
            return

        unet = self.imagen.get_unet(unet_number)
        self.unet_being_trained = self.accelerator.prepare(unet)
        unet_index = unet_number - 1

        optimizer = getattr(self, f'optim{unet_index}')
        scheduler = getattr(self, f'scheduler{unet_index}')

        optimizer = self.accelerator.prepare(optimizer)

        if exists(scheduler):
            scheduler = self.accelerator.prepare(scheduler)

        setattr(self, f'optim{unet_index}', optimizer)
        setattr(self, f'scheduler{unet_index}', scheduler)

        self.one_unet_wrapped = True

    # hacking accelerator due to not having separate gradscaler per optimizer

    def set_accelerator_scaler(self, unet_number):
        """Swap this unet's GradScaler onto the accelerator and its prepared optimizers."""
        unet_number = self.validate_unet_number(unet_number)
        scaler = getattr(self, f'scaler{unet_number - 1}')

        self.accelerator.scaler = scaler
        for optimizer in self.accelerator._optimizers:
            optimizer.scaler = scaler

    # helper print

    def print(self, msg):
        """Print via the accelerator, but only on the main process and when verbose."""
        if not self.is_main:
            return

        if not self.verbose:
            return

        return self.accelerator.print(msg)

    # validating the unet number

    def validate_unet_number(self, unet_number=None):
        """Default to unet 1 when there is only one unet; assert the number is in range.

        NOTE(review): with multiple unets, passing ``unet_number=None`` reaches the
        ``0 < unet_number`` comparison and raises TypeError rather than a clear
        assertion — callers must supply an explicit number in that case.
        """
        if self.num_unets == 1:
            unet_number = default(unet_number, 1)

        assert 0 < unet_number <= self.num_unets, f'unet number should be in between 1 and {self.num_unets}'
        return unet_number

    # number of training steps taken

    def num_steps_taken(self, unet_number=None):
        """Return the number of optimizer steps recorded for the given (1-indexed) unet."""
        if self.num_unets == 1:
            unet_number = default(unet_number, 1)

        return self.steps[unet_number - 1].item()

    def print_untrained_unets(self):
        """Warn about unets with zero recorded steps (NullUnet placeholders excluded)."""
        print_final_error = False

        for ind, (steps, unet) in enumerate(zip(self.steps.tolist(), self.imagen.unets)):
            if steps > 0 or isinstance(unet, NullUnet):
                continue

            self.print(f'unet {ind + 1} has not been trained')
            print_final_error = True

        if print_final_error:
            self.print(
                'when sampling, you can pass stop_at_unet_number to stop early in the cascade, so it does not try to generate with untrained unets')

    # data related functions

    def add_train_dataloader(self, dl=None):
        """Register an already-built training dataloader (prepared by the accelerator)."""
        if not exists(dl):
            return

        assert not exists(self.train_dl), 'training dataloader was already added'
        self.train_dl = self.accelerator.prepare(dl)

    def add_valid_dataloader(self, dl):
        """Register an already-built validation dataloader (prepared by the accelerator)."""
        if not exists(dl):
            return

        assert not exists(self.valid_dl), 'validation dataloader was already added'
        self.valid_dl = self.accelerator.prepare(dl)

    def add_train_dataset(self, ds=None, *, batch_size, **dl_kwargs):
        """Build a training dataloader from a dataset, optionally carving out a
        deterministic validation split (controlled by ``split_valid_from_train``)."""
        if not exists(ds):
            return

        assert not exists(self.train_dl), 'training dataloader was already added'

        valid_ds = None
        if self.split_valid_from_train:
            train_size = int((1 - self.split_valid_fraction) * len(ds))
            valid_size = len(ds) - train_size

            # seeded generator keeps the split identical across runs/processes
            ds, valid_ds = random_split(ds, [train_size, valid_size],
                                        generator=torch.Generator().manual_seed(self.split_random_seed))
            self.print(f'training with dataset of {len(ds)} samples '
                       f'and validating with randomly splitted {len(valid_ds)} samples')

        dl = DataLoader(ds, batch_size=batch_size, **dl_kwargs)
        self.train_dl = self.accelerator.prepare(dl)

        if not self.split_valid_from_train:
            return

        self.add_valid_dataset(valid_ds, batch_size=batch_size, **dl_kwargs)

    def add_valid_dataset(self, ds, *, batch_size, **dl_kwargs):
        """Build and register a validation dataloader from a dataset."""
        if not exists(ds):
            return

        assert not exists(self.valid_dl), 'validation dataloader was already added'

        dl = DataLoader(ds, batch_size=batch_size, **dl_kwargs)
        self.valid_dl = self.accelerator.prepare(dl)

    def create_train_iter(self):
        """Lazily create an infinite (cycled) iterator over the training dataloader."""
        assert exists(self.train_dl), 'training dataloader has not been registered with the trainer yet'

        if exists(self.train_dl_iter):
            return

        self.train_dl_iter = cycle(self.train_dl)

    def create_valid_iter(self):
        """Lazily create an infinite (cycled) iterator over the validation dataloader."""
        assert exists(self.valid_dl), 'validation dataloader has not been registered with the trainer yet'

        if exists(self.valid_dl_iter):
            return

        self.valid_dl_iter = cycle(self.valid_dl)

    def train_step(self, unet_number=None, **kwargs):
        """One training step: pull a batch, forward/backward, then optimizer update."""
        self.create_train_iter()
        loss = self.step_with_dl_iter(self.train_dl_iter, unet_number=unet_number, **kwargs)
        self.update(unet_number=unet_number)
        return loss

    @torch.no_grad()
    @eval_decorator
    def valid_step(self, **kwargs):
        """One validation step (no grad, eval mode); pass ``use_ema_unets=True`` to
        evaluate with the EMA weights swapped in."""
        self.create_valid_iter()

        context = self.use_ema_unets if kwargs.pop('use_ema_unets', False) else nullcontext

        with context():
            loss = self.step_with_dl_iter(self.valid_dl_iter, **kwargs)
        return loss

    def step_with_dl_iter(self, dl_iter, **kwargs):
        """Pull the next batch tuple, map it onto ``dl_tuple_output_keywords_names``,
        and forward through the model; batch entries override same-named kwargs."""
        dl_tuple_output = cast_tuple(next(dl_iter))
        model_input = dict(list(zip(self.dl_tuple_output_keywords_names, dl_tuple_output)))
        loss = self.forward(**{**kwargs, **model_input})
        return loss

    # checkpointing functions

    @property
    def all_checkpoints_sorted(self):
        """All ``*.pt`` checkpoints in the folder, newest (highest step count) first.

        Relies on the ``checkpoint.<steps>.pt`` naming: the step count is the
        second-to-last dot-separated field.
        """
        glob_pattern = os.path.join(self.checkpoint_path, '*.pt')
        checkpoints = self.fs.glob(glob_pattern)
        sorted_checkpoints = sorted(checkpoints, key=lambda x: int(str(x).split('.')[-2]), reverse=True)
        return sorted_checkpoints

    def load_from_checkpoint_folder(self, last_total_steps=-1):
        """Load a specific checkpoint by total step count, or the newest one by default."""
        if last_total_steps != -1:
            filepath = os.path.join(self.checkpoint_path, f'checkpoint.{last_total_steps}.pt')
            self.load(filepath)
            return

        sorted_checkpoints = self.all_checkpoints_sorted

        if len(sorted_checkpoints) == 0:
            self.print(f'no checkpoints found to load from at {self.checkpoint_path}')
            return

        last_checkpoint = sorted_checkpoints[0]
        self.load(last_checkpoint)

    def save_to_checkpoint_folder(self):
        """Save ``checkpoint.<total_steps>.pt`` and prune old checkpoints beyond
        ``max_checkpoints_keep`` (pruning disabled when that value is <= 0)."""
        self.accelerator.wait_for_everyone()

        if not self.can_checkpoint:
            return

        total_steps = int(self.steps.sum().item())
        filepath = os.path.join(self.checkpoint_path, f'checkpoint.{total_steps}.pt')

        self.save(filepath)

        if self.max_checkpoints_keep <= 0:
            return

        sorted_checkpoints = self.all_checkpoints_sorted
        checkpoints_to_discard = sorted_checkpoints[self.max_checkpoints_keep:]

        for checkpoint in checkpoints_to_discard:
            self.fs.rm(checkpoint)

    # saving and loading functions

    def save(
        self,
        path,
        overwrite=True,
        without_optim_and_sched=False,
        **kwargs
    ):
        """Serialize trainer state (model, steps, per-unet optim/scaler/sched, EMA)
        to ``path`` through fsspec.

        Extra ``kwargs`` are stored verbatim in the checkpoint dict. When the imagen
        carries a ``_config``, it is embedded so the checkpoint is usable from the CLI.
        """
        # self.accelerator.wait_for_everyone()

        if not self.can_checkpoint:
            return

        fs = self.fs

        assert not (fs.exists(path) and not overwrite)

        self.reset_ema_unets_all_one_device()

        save_obj = dict(
            model=self.imagen.state_dict(),
            version=__version__,
            steps=self.steps.cpu(),
            **kwargs
        )

        save_optim_and_sched_iter = range(0, self.num_unets) if not without_optim_and_sched else tuple()

        for ind in save_optim_and_sched_iter:
            scaler_key = f'scaler{ind}'
            optimizer_key = f'optim{ind}'
            scheduler_key = f'scheduler{ind}'
            warmup_scheduler_key = f'warmup{ind}'

            scaler = getattr(self, scaler_key)
            optimizer = getattr(self, optimizer_key)
            scheduler = getattr(self, scheduler_key)
            warmup_scheduler = getattr(self, warmup_scheduler_key)

            if exists(scheduler):
                save_obj = {**save_obj, scheduler_key: scheduler.state_dict()}

            if exists(warmup_scheduler):
                save_obj = {**save_obj, warmup_scheduler_key: warmup_scheduler.state_dict()}

            save_obj = {**save_obj, scaler_key: scaler.state_dict(), optimizer_key: optimizer.state_dict()}

        if self.use_ema:
            save_obj = {**save_obj, 'ema': self.ema_unets.state_dict()}

        # determine if imagen config is available

        if hasattr(self.imagen, '_config'):
            self.print(f'this checkpoint is commandable from the CLI - "imagen --model {str(path)} \"<prompt>\""')

            save_obj = {
                **save_obj,
                'imagen_type': 'elucidated' if self.is_elucidated else 'original',
                'imagen_params': self.imagen._config
            }

        # save to path

        with fs.open(path, 'wb') as f:
            torch.save(save_obj, f)

        self.print(f'checkpoint saved to {path}')

    def load(self, path, only_model=False, strict=True, noop_if_not_exist=False):
        """Restore trainer state from ``path``.

        Loads on CPU, warns on package-version mismatch, and falls back to a
        shape-matching partial load (``restore_parts``) if a strict load fails.
        With ``only_model=True`` only the model weights are restored; otherwise
        steps, per-unet optimizers/scalers/schedulers and EMA weights follow.
        Returns the raw loaded checkpoint dict.
        """
        fs = self.fs

        if noop_if_not_exist and not fs.exists(path):
            self.print(f'trainer checkpoint not found at {str(path)}')
            return

        assert fs.exists(path), f'{path} does not exist'

        self.reset_ema_unets_all_one_device()

        # to avoid extra GPU memory usage in main process when using Accelerate

        with fs.open(path) as f:
            loaded_obj = torch.load(f, map_location='cpu')

        if version.parse(__version__) != version.parse(loaded_obj['version']):
            self.print(
                f'loading saved imagen at version {loaded_obj["version"]}, but current package version is {__version__}')

        try:
            self.imagen.load_state_dict(loaded_obj['model'], strict=strict)
        except RuntimeError:
            print("Failed loading state dict. Trying partial load")
            self.imagen.load_state_dict(restore_parts(self.imagen.state_dict(),
                                                      loaded_obj['model']))

        if only_model:
            return loaded_obj

        self.steps.copy_(loaded_obj['steps'])

        for ind in range(0, self.num_unets):
            scaler_key = f'scaler{ind}'
            optimizer_key = f'optim{ind}'
            scheduler_key = f'scheduler{ind}'
            warmup_scheduler_key = f'warmup{ind}'

            scaler = getattr(self, scaler_key)
            optimizer = getattr(self, optimizer_key)
            scheduler = getattr(self, scheduler_key)
            warmup_scheduler = getattr(self, warmup_scheduler_key)

            if exists(scheduler) and scheduler_key in loaded_obj:
                scheduler.load_state_dict(loaded_obj[scheduler_key])

            if exists(warmup_scheduler) and warmup_scheduler_key in loaded_obj:
                warmup_scheduler.load_state_dict(loaded_obj[warmup_scheduler_key])

            if exists(optimizer):
                try:
                    optimizer.load_state_dict(loaded_obj[optimizer_key])
                    scaler.load_state_dict(loaded_obj[scaler_key])
                except:
                    # deliberate best-effort: optim/scaler state may be incompatible
                    # (e.g. mixed precision toggled since the last run)
                    self.print(
                        'could not load optimizer and scaler, possibly because you have turned on mixed precision training since the last run. resuming with new optimizer and scalers')

        if self.use_ema:
            assert 'ema' in loaded_obj
            try:
                self.ema_unets.load_state_dict(loaded_obj['ema'], strict=strict)
            except RuntimeError:
                print("Failed loading state dict. Trying partial load")
                self.ema_unets.load_state_dict(restore_parts(self.ema_unets.state_dict(),
                                                             loaded_obj['ema']))

        self.print(f'checkpoint loaded from {path}')
        return loaded_obj

    # managing ema unets and their devices

    @property
    def unets(self):
        """The EMA model copies, exposed as a fresh ModuleList."""
        return nn.ModuleList([ema.ema_model for ema in self.ema_unets])

    def get_ema_unet(self, unet_number=None):
        """Return the EMA wrapper for the given unet, moving it to the training
        device and parking all other EMA unets on CPU."""
        if not self.use_ema:
            return

        unet_number = self.validate_unet_number(unet_number)
        index = unet_number - 1

        # temporarily detach ema_unets from the module tree (plain list) so only
        # the active EMA unet lives on the training device
        if isinstance(self.unets, nn.ModuleList):
            unets_list = [unet for unet in self.ema_unets]
            delattr(self, 'ema_unets')
            self.ema_unets = unets_list

        if index != self.ema_unet_being_trained_index:
            for unet_index, unet in enumerate(self.ema_unets):
                unet.to(self.device if unet_index == index else 'cpu')

        self.ema_unet_being_trained_index = index
        return self.ema_unets[index]

    def reset_ema_unets_all_one_device(self, device=None):
        """Re-wrap the EMA unets in a ModuleList and move them all to one device
        (used before state-dict operations and EMA sampling)."""
        if not self.use_ema:
            return

        device = default(device, self.device)
        self.ema_unets = nn.ModuleList([*self.ema_unets])
        self.ema_unets.to(device)

        self.ema_unet_being_trained_index = -1

    @torch.no_grad()
    @contextmanager
    def use_ema_unets(self):
        """Context manager that swaps the EMA unets into ``self.imagen`` for the
        duration of the block, then restores the trainable unets and the EMA
        models' original devices. No-op when EMA is disabled."""
        if not self.use_ema:
            output = yield
            return output

        self.reset_ema_unets_all_one_device()
        self.imagen.reset_unets_all_one_device()

        self.unets.eval()

        trainable_unets = self.imagen.unets
        self.imagen.unets = self.unets  # swap in exponential moving averaged unets for sampling

        output = yield

        self.imagen.unets = trainable_unets  # restore original training unets

        # cast the ema_model unets back to original device
        for ema in self.ema_unets:
            ema.restore_ema_model_device()

        return output

    def print_unet_devices(self):
        """Debug helper: print the device of every unet (and EMA unet, if enabled)."""
        self.print('unet devices:')
        for i, unet in enumerate(self.imagen.unets):
            device = next(unet.parameters()).device
            self.print(f'\tunet {i}: {device}')

        if not self.use_ema:
            return

        self.print('\nema unet devices:')
        for i, ema_unet in enumerate(self.ema_unets):
            device = next(ema_unet.parameters()).device
            self.print(f'\tema unet {i}: {device}')

    # overriding state dict functions

    def state_dict(self, *args, **kwargs):
        """Gather EMA unets onto one device before serializing module state."""
        self.reset_ema_unets_all_one_device()
        return super().state_dict(*args, **kwargs)

    def load_state_dict(self, *args, **kwargs):
        """Gather EMA unets onto one device before restoring module state."""
        self.reset_ema_unets_all_one_device()
        return super().load_state_dict(*args, **kwargs)

    # encoding text functions

    def encode_text(self, text, **kwargs):
        """Delegate text encoding to the wrapped imagen."""
        return self.imagen.encode_text(text, **kwargs)

    # forwarding functions and gradient step updates

    def update(self, unet_number=None):
        """Apply one optimizer step for the given unet: clip grads (optional),
        step + zero the optimizer, update EMA, advance schedulers, bump the step
        counter, and checkpoint every ``checkpoint_every`` total steps."""
        unet_number = self.validate_unet_number(unet_number)
        self.validate_and_set_unet_being_trained(unet_number)
        self.set_accelerator_scaler(unet_number)

        index = unet_number - 1
        unet = self.unet_being_trained

        optimizer = getattr(self, f'optim{index}')
        scaler = getattr(self, f'scaler{index}')
        scheduler = getattr(self, f'scheduler{index}')
        warmup_scheduler = getattr(self, f'warmup{index}')

        # set the grad scaler on the accelerator, since we are managing one per u-net

        if exists(self.max_grad_norm):
            self.accelerator.clip_grad_norm_(unet.parameters(), self.max_grad_norm)

        optimizer.step()
        optimizer.zero_grad()

        if self.use_ema:
            ema_unet = self.get_ema_unet(unet_number)
            ema_unet.update()

        # scheduler, if needed

        maybe_warmup_context = nullcontext() if not exists(warmup_scheduler) else warmup_scheduler.dampening()

        with maybe_warmup_context:
            if exists(scheduler) and not self.accelerator.optimizer_step_was_skipped:  # recommended in the docs
                scheduler.step()

        # one-hot increments only the counter for this unet
        self.steps += F.one_hot(torch.tensor(unet_number - 1, device=self.steps.device), num_classes=len(self.steps))

        if not exists(self.checkpoint_path):
            return

        total_steps = int(self.steps.sum().item())

        # only checkpoint on exact multiples of checkpoint_every
        if total_steps % self.checkpoint_every:
            return

        self.save_to_checkpoint_folder()

    @torch.no_grad()
    @cast_torch_tensor
    @imagen_sample_in_chunks
    def sample(self, *args, **kwargs):
        """Sample from the imagen, using the EMA unets by default (pass
        ``use_non_ema=True`` to sample with the raw training weights).
        tqdm output is suppressed on non-main processes."""
        context = nullcontext if kwargs.pop('use_non_ema', False) else self.use_ema_unets

        self.print_untrained_unets()

        if not self.is_main:
            kwargs['use_tqdm'] = False

        with context():
            output = self.imagen.sample(*args, device=self.device, **kwargs)

        return output

    @partial(cast_torch_tensor, cast_fp16=True)
    def forward(
        self,
        *args,
        unet_number=None,
        max_batch_size=None,
        **kwargs
    ):
        """Compute the training loss for one batch, optionally split into chunks of
        ``max_batch_size``, backpropagating per chunk when in training mode.

        Returns the accumulated loss as a plain Python float (chunk losses are
        weighted by their fraction of the batch and summed via ``.item()``).
        """
        unet_number = self.validate_unet_number(unet_number)
        self.validate_and_set_unet_being_trained(unet_number)
        self.set_accelerator_scaler(unet_number)

        assert not exists(
            self.only_train_unet_number) or self.only_train_unet_number == unet_number, f'you can only train unet #{self.only_train_unet_number}'

        total_loss = 0.

        for chunk_size_frac, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size=max_batch_size, **kwargs):
            with self.accelerator.autocast():
                loss = self.imagen(*chunked_args, unet=self.unet_being_trained,
                                   unet_number=unet_number, **chunked_kwargs)
                loss = loss * chunk_size_frac

            total_loss += loss.item()

            if self.training:
                self.accelerator.backward(loss)

        return total_loss
1019
+
1020
+
1021
+ class JointImagenTrainer(nn.Module):
1022
+ locked = False
1023
+
1024
+ def __init__(
1025
+ self,
1026
+ imagen=None,
1027
+ imagen_checkpoint_path=None,
1028
+ use_ema=True,
1029
+ lr=1e-4,
1030
+ eps=1e-8,
1031
+ beta1=0.9,
1032
+ beta2=0.99,
1033
+ max_grad_norm=None,
1034
+ group_wd_params=True,
1035
+ warmup_steps=None,
1036
+ cosine_decay_max_steps=None,
1037
+ only_train_unet_number=None,
1038
+ fp16=False,
1039
+ precision=None,
1040
+ split_batches=True,
1041
+ dl_tuple_output_keywords_names=('images', 'labels', 'texts'),
1042
+ verbose=True,
1043
+ split_valid_fraction=0.025,
1044
+ split_valid_from_train=False,
1045
+ split_random_seed=42,
1046
+ checkpoint_path=None,
1047
+ checkpoint_every=None,
1048
+ checkpoint_fs=None,
1049
+ fs_kwargs: dict = None,
1050
+ max_checkpoints_keep=20,
1051
+ lambdas=(1., 1.), # lambdas for image / label losses
1052
+ **kwargs
1053
+ ):
1054
+ super().__init__()
1055
+ assert not JointImagenTrainer.locked, 'JointImagenTrainer can only be initialized once per process - for the sake of distributed training, you will now have to create a separate script to train each unet (or a script that accepts unet number as an argument)'
1056
+ assert exists(imagen) ^ exists(
1057
+ imagen_checkpoint_path), 'either imagen instance is passed into the trainer, or a checkpoint path that contains the imagen config'
1058
+
1059
+ # save lambdas for backward
1060
+
1061
+ self.lambdas = lambdas
1062
+
1063
+ # determine filesystem, using fsspec, for saving to local filesystem or cloud
1064
+
1065
+ self.fs = checkpoint_fs
1066
+
1067
+ if not exists(self.fs):
1068
+ fs_kwargs = default(fs_kwargs, {})
1069
+ self.fs, _ = url_to_fs(default(checkpoint_path, './'), **fs_kwargs)
1070
+
1071
+ assert isinstance(imagen, (JointImagen, )) # ElucidatedImagen is not implemented yet
1072
+ ema_kwargs, kwargs = groupby_prefix_and_trim('ema_', kwargs)
1073
+
1074
+ # elucidated or not
1075
+
1076
+ self.is_elucidated = isinstance(imagen, ElucidatedImagen)
1077
+
1078
+ # create accelerator instance
1079
+
1080
+ accelerate_kwargs, kwargs = groupby_prefix_and_trim('accelerate_', kwargs)
1081
+
1082
+ assert not (fp16 and exists(precision)
1083
+ ), 'either set fp16 = True or forward the precision ("fp16", "bf16") to Accelerator'
1084
+ accelerator_mixed_precision = default(precision, 'fp16' if fp16 else 'no')
1085
+
1086
+ self.accelerator = Accelerator(**{
1087
+ 'split_batches': split_batches,
1088
+ 'mixed_precision': accelerator_mixed_precision,
1089
+ 'kwargs_handlers': [DistributedDataParallelKwargs(find_unused_parameters=True)], **accelerate_kwargs})
1090
+
1091
+ JointImagenTrainer.locked = self.is_distributed
1092
+
1093
+ # cast data to fp16 at training time if needed
1094
+
1095
+ self.cast_half_at_training = accelerator_mixed_precision == 'fp16'
1096
+
1097
+ # grad scaler must be managed outside of accelerator
1098
+
1099
+ grad_scaler_enabled = fp16
1100
+
1101
+ # imagen, unets and ema unets
1102
+
1103
+ self.imagen = imagen
1104
+ self.num_unets = len(self.imagen.unets)
1105
+
1106
+ self.use_ema = use_ema and self.is_main
1107
+ self.ema_unets = nn.ModuleList([])
1108
+
1109
+ # keep track of what unet is being trained on
1110
+ # only going to allow 1 unet training at a time
1111
+
1112
+ self.ema_unet_being_trained_index = -1 # keeps track of which ema unet is being trained on
1113
+
1114
+ # data related functions
1115
+
1116
+ self.train_dl_iter = None
1117
+ self.train_dl = None
1118
+
1119
+ self.valid_dl_iter = None
1120
+ self.valid_dl = None
1121
+
1122
+ self.dl_tuple_output_keywords_names = dl_tuple_output_keywords_names
1123
+
1124
+ # auto splitting validation from training, if dataset is passed in
1125
+
1126
+ self.split_valid_from_train = split_valid_from_train
1127
+
1128
+ assert 0 <= split_valid_fraction <= 1, 'split valid fraction must be between 0 and 1'
1129
+ self.split_valid_fraction = split_valid_fraction
1130
+ self.split_random_seed = split_random_seed
1131
+
1132
+ # be able to finely customize learning rate, weight decay
1133
+ # per unet
1134
+
1135
+ lr, eps, warmup_steps, cosine_decay_max_steps = map(
1136
+ partial(cast_tuple, length=self.num_unets), (lr, eps, warmup_steps, cosine_decay_max_steps))
1137
+
1138
+ for ind, (unet, unet_lr, unet_eps, unet_warmup_steps, unet_cosine_decay_max_steps) in enumerate(zip(self.imagen.unets, lr, eps, warmup_steps, cosine_decay_max_steps)):
1139
+ optimizer = Adam(
1140
+ unet.parameters(),
1141
+ lr=unet_lr,
1142
+ eps=unet_eps,
1143
+ betas=(beta1, beta2),
1144
+ **kwargs
1145
+ )
1146
+
1147
+ if self.use_ema:
1148
+ self.ema_unets.append(EMA(unet, **ema_kwargs))
1149
+
1150
+ scaler = GradScaler(enabled=grad_scaler_enabled)
1151
+
1152
+ scheduler = warmup_scheduler = None
1153
+
1154
+ if exists(unet_cosine_decay_max_steps):
1155
+ scheduler = CosineAnnealingLR(optimizer, T_max=unet_cosine_decay_max_steps)
1156
+
1157
+ if exists(unet_warmup_steps):
1158
+ warmup_scheduler = warmup.LinearWarmup(optimizer, warmup_period=unet_warmup_steps)
1159
+
1160
+ if not exists(scheduler):
1161
+ scheduler = LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
1162
+
1163
+ # set on object
1164
+
1165
+ setattr(self, f'optim{ind}', optimizer) # cannot use pytorch ModuleList for some reason with optimizers
1166
+ setattr(self, f'scaler{ind}', scaler)
1167
+ setattr(self, f'scheduler{ind}', scheduler)
1168
+ setattr(self, f'warmup{ind}', warmup_scheduler)
1169
+
1170
+ # gradient clipping if needed
1171
+
1172
+ self.max_grad_norm = max_grad_norm
1173
+
1174
+ # step tracker and misc
1175
+
1176
+ self.register_buffer('steps', torch.tensor([0] * self.num_unets))
1177
+
1178
+ self.verbose = verbose
1179
+
1180
+ # automatic set devices based on what accelerator decided
1181
+
1182
+ self.imagen.to(self.device)
1183
+ self.to(self.device)
1184
+
1185
+ # checkpointing
1186
+
1187
+ assert not (exists(checkpoint_path) ^ exists(checkpoint_every))
1188
+ self.checkpoint_path = checkpoint_path
1189
+ self.checkpoint_every = checkpoint_every
1190
+ self.max_checkpoints_keep = max_checkpoints_keep
1191
+
1192
+ self.can_checkpoint = self.is_local_main if isinstance(checkpoint_fs, LocalFileSystem) else self.is_main
1193
+
1194
+ if exists(checkpoint_path) and self.can_checkpoint:
1195
+ bucket = url_to_bucket(checkpoint_path)
1196
+
1197
+ if not self.fs.exists(bucket):
1198
+ self.fs.mkdir(bucket)
1199
+
1200
+ self.load_from_checkpoint_folder()
1201
+
1202
+ # only allowing training for unet
1203
+
1204
+ self.only_train_unet_number = only_train_unet_number
1205
+ self.validate_and_set_unet_being_trained(only_train_unet_number)
1206
+
1207
+ # computed values
1208
+
1209
+ @property
1210
+ def device(self):
1211
+ return self.accelerator.device
1212
+
1213
+ @property
1214
+ def is_distributed(self):
1215
+ return not (self.accelerator.distributed_type == DistributedType.NO and self.accelerator.num_processes == 1)
1216
+
1217
+ @property
1218
+ def is_main(self):
1219
+ return self.accelerator.is_main_process
1220
+
1221
+ @property
1222
+ def is_local_main(self):
1223
+ return self.accelerator.is_local_main_process
1224
+
1225
+ @property
1226
+ def unwrapped_unet(self):
1227
+ return self.accelerator.unwrap_model(self.unet_being_trained)
1228
+
1229
+ # optimizer helper functions
1230
+
1231
+ def get_lr(self, unet_number):
1232
+ self.validate_unet_number(unet_number)
1233
+ unet_index = unet_number - 1
1234
+
1235
+ optim = getattr(self, f'optim{unet_index}')
1236
+
1237
+ return optim.param_groups[0]['lr']
1238
+
1239
+ # function for allowing only one unet from being trained at a time
1240
+
1241
def validate_and_set_unet_being_trained(self, unet_number=None):
    """Lock the trainer (and imagen) to a single unet, wrapping it on first use.

    Once set, switching to a different unet number is disallowed — save a
    checkpoint and resume with a fresh trainer instead.
    """
    if exists(unet_number):
        self.validate_unet_number(unet_number)

    assert not exists(self.only_train_unet_number) or self.only_train_unet_number == unet_number, 'you cannot only train on one unet at a time. you will need to save the trainer into a checkpoint, and resume training on a new unet'

    self.only_train_unet_number = unet_number
    self.imagen.only_train_unet_number = unet_number

    if exists(unet_number):
        self.wrap_unet(unet_number)
1254
+
1255
def wrap_unet(self, unet_number):
    """Prepare the chosen unet plus its optimizer/scheduler with the accelerator, once.

    The `one_unet_wrapped` sentinel attribute guarantees this runs a single
    time per trainer, since only one unet is trained at a time.
    """
    if hasattr(self, 'one_unet_wrapped'):
        return

    unet = self.imagen.get_unet(unet_number)
    self.unet_being_trained = self.accelerator.prepare(unet)
    unet_index = unet_number - 1

    optimizer = getattr(self, f'optim{unet_index}')
    scheduler = getattr(self, f'scheduler{unet_index}')

    optimizer = self.accelerator.prepare(optimizer)

    if exists(scheduler):
        scheduler = self.accelerator.prepare(scheduler)

    # store the accelerator-prepared versions back onto the trainer attributes
    setattr(self, f'optim{unet_index}', optimizer)
    setattr(self, f'scheduler{unet_index}', scheduler)

    self.one_unet_wrapped = True
1275
+
1276
+ # hacking accelerator due to not having separate gradscaler per optimizer
1277
+
1278
def set_accelerator_scaler(self, unet_number):
    """Install the per-unet grad scaler on the accelerator and its optimizers.

    Needed because accelerate keeps one scaler total, while this trainer
    maintains one scaler per unet.
    """
    unet_number = self.validate_unet_number(unet_number)
    scaler = getattr(self, f'scaler{unet_number - 1}')

    self.accelerator.scaler = scaler
    for opt in self.accelerator._optimizers:
        opt.scaler = scaler
1285
+
1286
+ # helper print
1287
+
1288
def print(self, msg):
    """Print via the accelerator, but only on the main process and when verbose."""
    if self.is_main and self.verbose:
        return self.accelerator.print(msg)
1296
+
1297
+ # validating the unet number
1298
+
1299
def validate_unet_number(self, unet_number=None):
    """Resolve and sanity-check a 1-indexed unet number.

    Defaults to 1 when the trainer manages a single unet. With multiple
    unets, a missing number now fails with a clear assertion instead of a
    confusing `TypeError` from comparing None against an int.
    """
    if self.num_unets == 1:
        unet_number = default(unet_number, 1)

    # explicit check before the range comparison below
    assert exists(unet_number), 'unet_number must be specified when training more than one unet'

    assert 0 < unet_number <= self.num_unets, f'unet number should be in between 1 and {self.num_unets}'
    return unet_number
1305
+
1306
+ # number of training steps taken
1307
+
1308
def num_steps_taken(self, unet_number=None):
    """Number of gradient steps recorded for the given unet (defaults to 1 when single-unet)."""
    if self.num_unets == 1:
        unet_number = default(unet_number, 1)

    step_counter = self.steps[unet_number - 1]
    return step_counter.item()
1313
+
1314
def print_untrained_unets(self):
    """Warn about unets with zero recorded training steps (NullUnets are exempt)."""
    any_untrained = False

    for ind, (steps, unet) in enumerate(zip(self.steps.tolist(), self.imagen.unets)):
        if steps > 0 or isinstance(unet, NullUnet):
            continue

        self.print(f'unet {ind + 1} has not been trained')
        any_untrained = True

    if any_untrained:
        self.print(
            'when sampling, you can pass stop_at_unet_number to stop early in the cascade, so it does not try to generate with untrained unets')
1327
+
1328
+ # data related functions
1329
+
1330
def add_train_dataloader(self, dl=None):
    """Register (and accelerator-prepare) the training dataloader; no-op when `dl` is None."""
    if dl is None:
        return

    assert not exists(self.train_dl), 'training dataloader was already added'
    self.train_dl = self.accelerator.prepare(dl)
1336
+
1337
def add_valid_dataloader(self, dl=None):
    """Register (and accelerator-prepare) the validation dataloader.

    `dl` now defaults to None (making the call a no-op), mirroring
    `add_train_dataloader` — backward-compatible for existing callers.
    """
    if not exists(dl):
        return

    assert not exists(self.valid_dl), 'validation dataloader was already added'
    self.valid_dl = self.accelerator.prepare(dl)
1343
+
1344
def add_train_dataset(self, ds=None, *, batch_size, **dl_kwargs):
    """Build and register a training dataloader from a dataset.

    When `split_valid_from_train` is set, a validation subset of fraction
    `split_valid_fraction` is carved out deterministically (seeded by
    `split_random_seed`) and registered via `add_valid_dataset`.
    """
    if not exists(ds):
        return

    assert not exists(self.train_dl), 'training dataloader was already added'

    valid_ds = None
    if self.split_valid_from_train:
        train_size = int((1 - self.split_valid_fraction) * len(ds))
        valid_size = len(ds) - train_size

        # deterministic split so train/valid membership is reproducible across runs
        ds, valid_ds = random_split(ds, [train_size, valid_size],
                                    generator=torch.Generator().manual_seed(self.split_random_seed))
        self.print(
            f'training with dataset of {len(ds)} samples and validating with randomly splitted {len(valid_ds)} samples')

    dl = DataLoader(ds, batch_size=batch_size, **dl_kwargs)
    self.train_dl = self.accelerator.prepare(dl)

    if not self.split_valid_from_train:
        return

    self.add_valid_dataset(valid_ds, batch_size=batch_size, **dl_kwargs)
1367
+
1368
def add_valid_dataset(self, ds=None, *, batch_size, **dl_kwargs):
    """Build and register a validation dataloader from a dataset.

    `ds` now defaults to None (making the call a no-op), mirroring
    `add_train_dataset` — backward-compatible for existing callers.
    """
    if not exists(ds):
        return

    assert not exists(self.valid_dl), 'validation dataloader was already added'

    dl = DataLoader(ds, batch_size=batch_size, **dl_kwargs)
    self.valid_dl = self.accelerator.prepare(dl)
1376
+
1377
def create_train_iter(self):
    """Lazily build an infinite iterator over the training dataloader."""
    assert exists(self.train_dl), 'training dataloader has not been registered with the trainer yet'

    if self.train_dl_iter is None:
        self.train_dl_iter = cycle(self.train_dl)
1384
+
1385
def create_valid_iter(self):
    """Lazily build an infinite iterator over the validation dataloader."""
    assert exists(self.valid_dl), 'validation dataloader has not been registered with the trainer yet'

    if self.valid_dl_iter is None:
        self.valid_dl_iter = cycle(self.valid_dl)
1392
+
1393
def train_step(self, unet_number=None, **kwargs):
    """Run one full optimization step: forward on the next training batch, then update."""
    self.create_train_iter()

    loss = self.step_with_dl_iter(self.train_dl_iter, unet_number=unet_number, **kwargs)
    self.update(unet_number=unet_number)

    return loss
1398
+
1399
@torch.no_grad()
@eval_decorator
def valid_step(self, **kwargs):
    """Run a single validation forward pass (no gradient update).

    Pops `use_ema_unets` from kwargs; when truthy, validation runs with
    the EMA copies of the unets swapped in.
    """
    self.create_valid_iter()

    context = self.use_ema_unets if kwargs.pop('use_ema_unets', False) else nullcontext

    with context():
        loss = self.step_with_dl_iter(self.valid_dl_iter, **kwargs)
    return loss
1409
+
1410
def step_with_dl_iter(self, dl_iter, **kwargs):
    """Pull one batch from `dl_iter`, map its fields onto keyword names, and run forward."""
    batch = cast_tuple(next(dl_iter))
    named_batch = dict(zip(self.dl_tuple_output_keywords_names, batch))
    return self.forward(**{**kwargs, **named_batch})
1415
+
1416
+ # checkpointing functions
1417
+
1418
@property
def all_checkpoints_sorted(self):
    """All checkpoint files in the checkpoint folder, newest (highest step count) first."""
    pattern = os.path.join(self.checkpoint_path, '*.pt')
    found = self.fs.glob(pattern)
    # filenames look like checkpoint.<total_steps>.pt — sort by the embedded step number
    return sorted(found, key=lambda p: int(str(p).split('.')[-2]), reverse=True)
1424
+
1425
def load_from_checkpoint_folder(self, last_total_steps=-1):
    """Load from the checkpoint folder: a specific total-step count, or the latest when -1."""
    if last_total_steps != -1:
        self.load(os.path.join(self.checkpoint_path, f'checkpoint.{last_total_steps}.pt'))
        return

    checkpoints = self.all_checkpoints_sorted

    if not checkpoints:
        self.print(f'no checkpoints found to load from at {self.checkpoint_path}')
        return

    self.load(checkpoints[0])
1439
+
1440
def save_to_checkpoint_folder(self):
    """Save a checkpoint named by total steps, then prune the oldest beyond the keep limit."""
    self.accelerator.wait_for_everyone()

    if not self.can_checkpoint:
        return

    total_steps = int(self.steps.sum().item())
    self.save(os.path.join(self.checkpoint_path, f'checkpoint.{total_steps}.pt'))

    # non-positive keep count means "keep everything"
    if self.max_checkpoints_keep <= 0:
        return

    for stale in self.all_checkpoints_sorted[self.max_checkpoints_keep:]:
        self.fs.rm(stale)
1459
+
1460
+ # saving and loading functions
1461
+
1462
def save(
    self,
    path,
    overwrite=True,
    without_optim_and_sched=False,
    **kwargs
):
    """Serialize trainer state (model, steps, per-unet optim/sched/scaler, EMA) to `path`.

    Extra `kwargs` are stored verbatim in the checkpoint dict. Only the
    process allowed to checkpoint actually writes. When `overwrite` is False
    and the file exists, an assertion fails.
    """
    # self.accelerator.wait_for_everyone()

    if not self.can_checkpoint:
        return

    fs = self.fs

    assert not (fs.exists(path) and not overwrite)

    # consolidate EMA unets onto one device before taking the state dict
    self.reset_ema_unets_all_one_device()

    save_obj = dict(
        model=self.imagen.state_dict(),
        version=__version__,
        steps=self.steps.cpu(),
        **kwargs
    )

    # optionally skip optimizer/scheduler state (e.g. inference-only checkpoints)
    save_optim_and_sched_iter = range(0, self.num_unets) if not without_optim_and_sched else tuple()

    for ind in save_optim_and_sched_iter:
        scaler_key = f'scaler{ind}'
        optimizer_key = f'optim{ind}'
        scheduler_key = f'scheduler{ind}'
        warmup_scheduler_key = f'warmup{ind}'

        scaler = getattr(self, scaler_key)
        optimizer = getattr(self, optimizer_key)
        scheduler = getattr(self, scheduler_key)
        warmup_scheduler = getattr(self, warmup_scheduler_key)

        if exists(scheduler):
            save_obj = {**save_obj, scheduler_key: scheduler.state_dict()}

        if exists(warmup_scheduler):
            save_obj = {**save_obj, warmup_scheduler_key: warmup_scheduler.state_dict()}

        save_obj = {**save_obj, scaler_key: scaler.state_dict(), optimizer_key: optimizer.state_dict()}

    if self.use_ema:
        save_obj = {**save_obj, 'ema': self.ema_unets.state_dict()}

    # determine if imagen config is available

    if hasattr(self.imagen, '_config'):
        self.print(f'this checkpoint is commandable from the CLI - "imagen --model {str(path)} \"<prompt>\""')

        save_obj = {
            **save_obj,
            'imagen_type': 'elucidated' if self.is_elucidated else 'original',
            'imagen_params': self.imagen._config
        }

    # save to path

    with fs.open(path, 'wb') as f:
        torch.save(save_obj, f)

    self.print(f'checkpoint saved to {path}')
1528
+
1529
def load(self, path, only_model=False, strict=True, noop_if_not_exist=False):
    """Load trainer state from `path`.

    Restores model weights (falling back to a partial load on shape
    mismatch), and unless `only_model` is set, also restores the step
    counters plus each unet's optimizer/scheduler/scaler state and the EMA
    weights.

    Returns the raw checkpoint dict, or None when `noop_if_not_exist`
    is set and the file is missing.
    """
    fs = self.fs

    if noop_if_not_exist and not fs.exists(path):
        self.print(f'trainer checkpoint not found at {str(path)}')
        return

    assert fs.exists(path), f'{path} does not exist'

    self.reset_ema_unets_all_one_device()

    # to avoid extra GPU memory usage in main process when using Accelerate

    with fs.open(path) as f:
        loaded_obj = torch.load(f, map_location='cpu')

    if version.parse(__version__) != version.parse(loaded_obj['version']):
        self.print(f'loading saved imagen at version {loaded_obj["version"]}, '
                   f'but current package version is {__version__}')

    try:
        self.imagen.load_state_dict(loaded_obj['model'], strict=strict)
    except RuntimeError:
        print("Failed loading state dict. Trying partial load")
        self.imagen.load_state_dict(restore_parts(self.imagen.state_dict(),
                                                  loaded_obj['model']))

    if only_model:
        return loaded_obj

    self.steps.copy_(loaded_obj['steps'])

    for ind in range(0, self.num_unets):
        scaler_key = f'scaler{ind}'
        optimizer_key = f'optim{ind}'
        scheduler_key = f'scheduler{ind}'
        warmup_scheduler_key = f'warmup{ind}'

        scaler = getattr(self, scaler_key)
        optimizer = getattr(self, optimizer_key)
        scheduler = getattr(self, scheduler_key)
        warmup_scheduler = getattr(self, warmup_scheduler_key)

        if exists(scheduler) and scheduler_key in loaded_obj:
            scheduler.load_state_dict(loaded_obj[scheduler_key])

        if exists(warmup_scheduler) and warmup_scheduler_key in loaded_obj:
            warmup_scheduler.load_state_dict(loaded_obj[warmup_scheduler_key])

        if exists(optimizer):
            try:
                optimizer.load_state_dict(loaded_obj[optimizer_key])
                scaler.load_state_dict(loaded_obj[scaler_key])
            # was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed; best-effort behavior is preserved
            except Exception:
                self.print('could not load optimizer and scaler, '
                           'possibly because you have turned on mixed precision training since the last run. '
                           'resuming with new optimizer and scalers')

    if self.use_ema:
        assert 'ema' in loaded_obj
        try:
            self.ema_unets.load_state_dict(loaded_obj['ema'], strict=strict)
        except RuntimeError:
            print("Failed loading state dict. Trying partial load")
            self.ema_unets.load_state_dict(restore_parts(self.ema_unets.state_dict(),
                                                         loaded_obj['ema']))

    self.print(f'checkpoint loaded from {path}')
    return loaded_obj
1598
+
1599
+ # managing ema unets and their devices
1600
+
1601
@property
def unets(self):
    """The EMA copies of each unet, wrapped in a ModuleList."""
    ema_models = [ema.ema_model for ema in self.ema_unets]
    return nn.ModuleList(ema_models)
1604
+
1605
def get_ema_unet(self, unet_number=None):
    """Return the EMA counterpart of the given unet, moving it alone onto the training device.

    All other EMA unets are pushed to CPU to save memory. On first call,
    `ema_unets` is unwrapped from a ModuleList into a plain list so the
    device shuffling does not interact with module registration.
    """
    if not self.use_ema:
        return

    unet_number = self.validate_unet_number(unet_number)
    index = unet_number - 1

    # NOTE(review): this checks self.unets, which is a property that always
    # returns a ModuleList — presumably self.ema_unets was intended; verify
    if isinstance(self.unets, nn.ModuleList):
        unets_list = [unet for unet in self.ema_unets]
        delattr(self, 'ema_unets')
        self.ema_unets = unets_list

    # only shuffle devices when switching to a different unet
    if index != self.ema_unet_being_trained_index:
        for unet_index, unet in enumerate(self.ema_unets):
            unet.to(self.device if unet_index == index else 'cpu')

    self.ema_unet_being_trained_index = index
    return self.ema_unets[index]
1623
+
1624
def reset_ema_unets_all_one_device(self, device=None):
    """Move every EMA unet onto one device (default: trainer device) and reset tracking."""
    if not self.use_ema:
        return

    target = default(device, self.device)

    # re-wrap as a ModuleList so .to() applies to all EMA unets at once
    self.ema_unets = nn.ModuleList([*self.ema_unets])
    self.ema_unets.to(target)

    self.ema_unet_being_trained_index = -1
1633
+
1634
@torch.no_grad()
@contextmanager
def use_ema_unets(self):
    """Context manager that temporarily swaps the EMA unets into the imagen model.

    On exit the trainable unets are restored and each EMA model is moved
    back to its original device. When EMA is disabled this is a plain
    pass-through yield.
    """
    if not self.use_ema:
        output = yield
        return output

    self.reset_ema_unets_all_one_device()
    self.imagen.reset_unets_all_one_device()

    self.unets.eval()

    trainable_unets = self.imagen.unets
    self.imagen.unets = self.unets  # swap in exponential moving averaged unets for sampling

    output = yield

    self.imagen.unets = trainable_unets  # restore original training unets

    # cast the ema_model unets back to original device
    for ema in self.ema_unets:
        ema.restore_ema_model_device()

    return output
1658
+
1659
def print_unet_devices(self):
    """Log which device each unet (and each EMA unet, if enabled) currently lives on."""
    self.print('unet devices:')
    for i, unet in enumerate(self.imagen.unets):
        device = next(unet.parameters()).device
        self.print(f'\tunet {i}: {device}')

    if not self.use_ema:
        return

    self.print('\nema unet devices:')
    for i, ema_unet in enumerate(self.ema_unets):
        device = next(ema_unet.parameters()).device
        self.print(f'\tema unet {i}: {device}')
1672
+
1673
+ # overriding state dict functions
1674
+
1675
def state_dict(self, *args, **kwargs):
    """Consolidate EMA unets onto one device before delegating to the parent state_dict."""
    self.reset_ema_unets_all_one_device()
    return super().state_dict(*args, **kwargs)
1678
+
1679
def load_state_dict(self, *args, **kwargs):
    """Consolidate EMA unets onto one device before delegating to the parent load_state_dict."""
    self.reset_ema_unets_all_one_device()
    return super().load_state_dict(*args, **kwargs)
1682
+
1683
+ # encoding text functions
1684
+
1685
def encode_text(self, text, **kwargs):
    """Delegate text encoding to the underlying imagen model."""
    return self.imagen.encode_text(text, **kwargs)
1687
+
1688
+ # forwarding functions and gradient step updates
1689
+
1690
def update(self, unet_number=None):
    """Apply one optimizer step for the given unet, plus EMA/scheduler bookkeeping.

    Assumes gradients have already been accumulated by `forward`. Clips
    gradients when `max_grad_norm` is set, steps the optimizer, updates the
    EMA copy, advances the LR scheduler (inside the warmup dampening
    context if any), bumps the per-unet step counter, and periodically
    writes a checkpoint.
    """
    unet_number = self.validate_unet_number(unet_number)
    self.validate_and_set_unet_being_trained(unet_number)
    self.set_accelerator_scaler(unet_number)

    index = unet_number - 1
    unet = self.unet_being_trained

    optimizer = getattr(self, f'optim{index}')
    scaler = getattr(self, f'scaler{index}')
    scheduler = getattr(self, f'scheduler{index}')
    warmup_scheduler = getattr(self, f'warmup{index}')

    # set the grad scaler on the accelerator, since we are managing one per u-net

    if exists(self.max_grad_norm):
        self.accelerator.clip_grad_norm_(unet.parameters(), self.max_grad_norm)

    optimizer.step()
    optimizer.zero_grad()

    if self.use_ema:
        ema_unet = self.get_ema_unet(unet_number)
        ema_unet.update()

    # scheduler, if needed

    maybe_warmup_context = nullcontext() if not exists(warmup_scheduler) else warmup_scheduler.dampening()

    with maybe_warmup_context:
        if exists(scheduler) and not self.accelerator.optimizer_step_was_skipped:  # recommended in the docs
            scheduler.step()

    # one-hot so only this unet's step counter is incremented
    self.steps += F.one_hot(torch.tensor(unet_number - 1, device=self.steps.device), num_classes=len(self.steps))

    if not exists(self.checkpoint_path):
        return

    total_steps = int(self.steps.sum().item())

    # save only when total steps is an exact multiple of checkpoint_every
    if total_steps % self.checkpoint_every:
        return

    self.save_to_checkpoint_folder()
1734
+
1735
@torch.no_grad()
@cast_torch_tensor
@imagen_sample_in_chunks
def sample(self, *args, **kwargs):
    """Sample from the cascade, using EMA unets unless `use_non_ema=True` is passed.

    Warns about untrained unets first; tqdm output is silenced on non-main
    processes.
    """
    context = nullcontext if kwargs.pop('use_non_ema', False) else self.use_ema_unets

    self.print_untrained_unets()

    if not self.is_main:
        kwargs['use_tqdm'] = False

    with context():
        output = self.imagen.sample(*args, device=self.device, **kwargs)

    return output
1750
+
1751
@partial(cast_torch_tensor, cast_fp16=True)
def forward(
    self,
    *args,
    unet_number=None,
    max_batch_size=None,
    **kwargs
):
    """Run the training forward pass, accumulating gradients chunk by chunk.

    The batch is split into chunks of at most `max_batch_size`; each chunk's
    (image loss, segmentation loss) pair is scaled by its fraction of the
    batch, backpropagated (weighted by `self.lambdas`) when in training
    mode, and summed into the returned totals.

    Returns (total_loss, total_loss_seg) as Python floats.
    """
    unet_number = self.validate_unet_number(unet_number)
    self.validate_and_set_unet_being_trained(unet_number)
    self.set_accelerator_scaler(unet_number)

    assert not exists(
        self.only_train_unet_number) or self.only_train_unet_number == unet_number, f'you can only train unet #{self.only_train_unet_number}'

    total_loss = 0.
    total_loss_seg = 0.

    for chunk_size_frac, (chunked_args, chunked_kwargs) in split_args_and_kwargs(*args, split_size=max_batch_size, **kwargs):
        with self.accelerator.autocast():
            loss, loss_seg = self.imagen(*chunked_args, unet=self.unet_being_trained,
                                         unet_number=unet_number, **chunked_kwargs)
            # scale so the accumulated gradient equals one full-batch step
            loss = loss * chunk_size_frac
            loss_seg = loss_seg * chunk_size_frac

        total_loss += loss.item()
        total_loss_seg += loss_seg.item()

        if self.training:
            self.accelerator.backward(loss * self.lambdas[0] + loss_seg * self.lambdas[1])

    return total_loss, total_loss_seg
imagen_pytorch/utils.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from functools import reduce
4
+ from pathlib import Path
5
+
6
+ from imagen_pytorch.configs import ImagenConfig, ElucidatedImagenConfig
7
+ from ema_pytorch import EMA
8
+
9
def exists(val):
    """True when `val` is not None (falsy values like 0 or '' still count as existing)."""
    return val is not None
11
+
12
def safeget(dictionary, keys, default = None):
    """Fetch a nested dict value by dotted path, e.g. safeget(d, 'a.b.c').

    Returns `default` whenever a lookup misses or an intermediate value is
    not a dict.
    """
    current = dictionary
    for key in keys.split('.'):
        if not isinstance(current, dict):
            return default
        current = current.get(key, default)
    return current
14
+
15
def load_imagen_from_checkpoint(
    checkpoint_path,
    load_weights = True,
    load_ema_if_available = False
):
    """Reconstruct an Imagen model from a trainer checkpoint file.

    Uses the `imagen_type` / `imagen_params` saved by the trainer to rebuild
    the model via its config class, then loads either the raw weights or
    (when available and requested) the EMA weights.
    """
    model_path = Path(checkpoint_path)
    full_model_path = str(model_path.resolve())
    assert model_path.exists(), f'checkpoint not found at {full_model_path}'
    loaded = torch.load(str(model_path), map_location='cpu')

    imagen_params = safeget(loaded, 'imagen_params')
    imagen_type = safeget(loaded, 'imagen_type')

    # check presence BEFORE dispatching on the type, so a checkpoint saved
    # without its config fails with the intended message instead of
    # 'unknown imagen type None'
    assert exists(imagen_params) and exists(imagen_type), 'imagen type and configuration not saved in this checkpoint'

    if imagen_type == 'original':
        imagen_klass = ImagenConfig
    elif imagen_type == 'elucidated':
        imagen_klass = ElucidatedImagenConfig
    else:
        raise ValueError(f'unknown imagen type {imagen_type} - you need to instantiate your Imagen with configurations, using classes ImagenConfig or ElucidatedImagenConfig')

    imagen = imagen_klass(**imagen_params).create()

    if not load_weights:
        return imagen

    has_ema = 'ema' in loaded
    should_load_ema = has_ema and load_ema_if_available

    imagen.load_state_dict(loaded['model'])

    if not should_load_ema:
        print('loading non-EMA version of unets')
        return imagen

    ema_unets = nn.ModuleList([])
    for unet in imagen.unets:
        ema_unets.append(EMA(unet))

    ema_unets.load_state_dict(loaded['ema'])

    # copy each EMA shadow back into the corresponding live unet
    for unet, ema_unet in zip(imagen.unets, ema_unets):
        unet.load_state_dict(ema_unet.ema_model.state_dict())

    print('loaded EMA version of unets')
    return imagen
imagen_pytorch/version.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __version__ = '1.11.14'
pyproject.toml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [tool.autopep8]
2
+ max_line_length = 120
3
+ ignore = ["E402"]
repaint/LICENSES/LICENSE ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ you may not use this file except in compliance with the License.
4
+ You may obtain a copy of the License at
5
+
6
+ https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+
8
+ The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
repaint/LICENSES/LICENSE_guided_diffusion ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2021 OpenAI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
repaint/LICENSES/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # License and Acknowledgement
2
+
3
+ A big thanks to the following contributors who open-sourced their code and thereby helped us a lot in developing RePaint!
4
+
5
+ This repository was forked from:
6
+ https://github.com/openai/guided-diffusion
7
+
8
+ It contains code from:
9
+ https://github.com/hojonathanho/diffusion
10
+
11
+ If we missed a contribution, please contact us.
repaint/README.md ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RePaint
2
+ **Inpainting using Denoising Diffusion Probabilistic Models**
3
+
4
+
5
+ CVPR 2022 [[Paper]](https://bit.ly/3b1ABEb)
6
+
7
+ [![Denoising_Diffusion_Inpainting_Animation](https://user-images.githubusercontent.com/11280511/150849757-5cd762cb-07a3-46aa-a906-0fe4606eba3b.gif)](#)
8
+
9
+ ## Setup
10
+
11
+ ### 1. Code
12
+
13
+ ```bash
14
+ git clone https://github.com/andreas128/RePaint.git
15
+ ```
16
+
17
+ ### 2. Environment
18
+ ```bash
19
+ pip install numpy torch blobfile tqdm pyYaml pillow # e.g. torch 1.7.1+cu110.
20
+ ```
21
+
22
+ ### 3. Download models and data
23
+
24
+ ```bash
25
+ pip install --upgrade gdown && bash ./download.sh
26
+ ```
27
+
28
+ That downloads the models for ImageNet, CelebA-HQ, and Places2, as well as the face example and example masks.
29
+
30
+
31
+ ### 4. Run example
32
+ ```bash
33
+ python test.py --conf_path confs/face_example.yml
34
+ ```
35
+ Find the output in `./log/face_example/inpainted`
36
+
37
+ *Note: After refactoring the code, we did not reevaluate all experiments.*
38
+
39
+ <br>
40
+
41
+ # RePaint fills a missing image part using diffusion models
42
+
43
+ <table border="0" cellspacing="0" cellpadding="0">
44
+ <tr>
45
+ <td><img alt="RePaint Inpainting using Denoising Diffusion Probabilistic Models Demo 1" src="https://user-images.githubusercontent.com/11280511/150766080-9f3d7bc9-99f2-472e-9e5d-b6ed456340d1.gif"></td>
46
+ <td><img alt="RePaint Inpainting using Denoising Diffusion Probabilistic Models Demo 2" src="https://user-images.githubusercontent.com/11280511/150766125-adf5a3cb-17f2-432c-a8f6-ce0b97122819.gif"></td>
47
+ </tr>
48
+ </table>
49
+
50
+ **What are the blue parts?** <br>
51
+ Those parts are missing and therefore have to be filled by RePaint. <br> RePaint generates the missing parts inspired by the known parts.
52
+
53
+ **How does it work?** <br>
54
+ RePaint starts from pure noise. Then the image is denoised step-by-step. <br> It uses the known part to fill the unknown part in each step.
55
+
56
+ **Why does the noise level fluctuate during generation?** <br>
57
+ Our noise schedule improves the harmony between the generated and <br> the known part [[4.2 Resampling]](https://bit.ly/3b1ABEb).
58
+
59
+ <br>
60
+
61
+ ## Details on data
62
+
63
+ **Which datasets and masks have a ready-to-use config file?**
64
+
65
+ We provide config files for ImageNet (inet256), CelebA-HQ (c256) and Places2 (p256) for the masks "thin", "thick", "every second line", "super-resolution", "expand" and "half" in [`./confs`](https://github.com/andreas128/RePaint/tree/main/confs). You can use them as shown in the example above.
66
+
67
+ **How to prepare the test data?**
68
+
69
+ We use [LaMa](https://github.com/saic-mdal/lama) for validation and testing. Follow their instructions and add the images as specified in the config files. When you download the data using `download.sh`, you can see examples of masks we used.
70
+
71
+ **How to apply it to other images?**
72
+
73
+ Copy the config file for the dataset that matches your data best (for faces aligned like CelebA-HQ `_c256`, for diverse images `_inet256`). Then set the [`gt_path`](https://github.com/andreas128/RePaint/blob/0fea066b52346c331cdf1bf7aed616c8c8896714/confs/face_example.yml#L70) and [`mask_path`](https://github.com/andreas128/RePaint/blob/0fea066b52346c331cdf1bf7aed616c8c8896714/confs/face_example.yml#L71) to where your input is. The masks have the value 255 for known regions and 0 for unknown areas (the ones that get generated).
74
+
75
+ **How to apply it for other datasets?**
76
+
77
+ If you work with other data than faces, places or general images, train a model using the [guided-diffusion](https://github.com/openai/guided-diffusion) repository. Note that RePaint is an inference scheme. We do not train or finetune the diffusion model but condition pre-trained models.
78
+
79
+ ## Adapt the code
80
+
81
+ **How to design a new schedule?**
82
+
83
+ Fill in your own parameters in this [line](https://github.com/andreas128/RePaint/blob/0fea066b52346c331cdf1bf7aed616c8c8896714/guided_diffusion/scheduler.py#L180) to visualize the schedule using `python guided_diffusion/scheduler.py`. Then copy a config file, set your parameters in these [lines](https://github.com/andreas128/RePaint/blob/0fea066b52346c331cdf1bf7aed616c8c8896714/confs/face_example.yml#L61-L65) and run the inference using `python test.py --conf_path confs/my_schedule.yml`.
84
+
85
+ **How to speed up the inference?**
86
+
87
+ The following settings are in the [schedule_jump_params](https://github.com/andreas128/RePaint/blob/0fea066b52346c331cdf1bf7aed616c8c8896714/confs/face_example.yml#L61) key in the config files. You can visualize them as described above.
88
+
89
+ - Reduce `t_T`, the total number of steps (without resampling). The lower it is, the more noise gets removed per step.
90
+ - Reduce `jump_n_sample` to resample fewer times.
91
+ - Apply resampling not from the beginning but only after a specific time by setting `start_resampling`.
92
+
93
+ ## Code overview
94
+
95
+ - **Schedule:** The list of diffusion times t which will be traversed are obtained in this [line](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L503). e.g. times = [249, 248, 249, 248, 247, 248, 247, 248, 247, 246, ...]
96
+ - **Denoise:** Reverse diffusion steps from x<sub>t</sub> (more noise) to a x<sub>t-1</sub> (less noisy) are done below this [line](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L515).
97
+ - **Predict:** The model is called [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L237) and obtains x<sub>t</sub> and the time t to predict a tensor with 6 channels containing information about the mean and variance of x<sub>t-1</sub>. Then the value range of the variance is adjusted [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L252). The mean of x<sub>t-1</sub> is obtained by the weighted sum of the estimated [x<sub>0</sub>](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L270) and x<sub>t</sub> [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L189). The obtained mean and variance is used [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L402) to sample x<sub>t-1</sub>. (This is the original reverse step from [guided-diffusion](https://github.com/openai/guided-diffusion.git). )
98
+ - **Condition:** The known part of the input image needs to have the same amount of noise as the part that the diffusion model generates to join them. The required amount of noise is calculated [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L368) and added to the known part [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L371). The generated and sampled parts get joined using a mask [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L373).
99
+ - **Undo:** The forward diffusion steps from x<sub>t-1</sub> to x<sub>t</sub> is done after this [line](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L536). The noise gets added to x<sub>t-1</sub> [here](https://github.com/andreas128/RePaint/blob/76cb5b49d3f28715980f6e809c6859b148be9867/guided_diffusion/gaussian_diffusion.py#L176).
100
+
101
+ ## Issues
102
+
103
+ **Do you have further questions?**
104
+
105
+ Please open an [issue](https://github.com/andreas128/RePaint/issues), and we will try to help you.
106
+
107
+ **Did you find a mistake?**
108
+
109
+ Please create a pull request. For example, by clicking the pencil button at the top right of the GitHub page.
110
+
111
+ <br>
112
+
113
+ # RePaint on diverse content and shapes of missing regions
114
+
115
+ The blue region is unknown and filled by RePaint:
116
+
117
+ ![Denoising Diffusion Probabilistic Models Inpainting](https://user-images.githubusercontent.com/11280511/150803812-a4729ef8-6ad4-46aa-ae99-8c27fbb2ea2e.png)
118
+
119
+
120
+ **Note: RePaint creates many meaningful fillings.** <br>
121
+ 1) **Face:** Expressions and features like an earring or a mole. <br>
122
+ 2) **Computer:** The computer screen shows different images, text, and even a logo. <br>
123
+ 3) **Greens:** RePaint makes sense of the tiny known part and incorporates it in a beetle, spaghetti, and plants. <br>
124
+ 4) **Garden:** From simple filling like a curtain to complex filling like a human. <br>
125
+
126
+
127
+ <br>
128
+
129
+ # Extreme Case 1: Generate every second line
130
+ ![Denoising_Diffusion_Probabilistic_Models_Inpainting_Every_Second_Line](https://user-images.githubusercontent.com/11280511/150818064-29789cbe-73c7-45de-a955-9fad5fb24c0e.png)
131
+
132
+ - Every second line of the input image is unknown.
133
+ - Most inpainting methods fail on such masks.
134
+
135
+
136
+ <br>
137
+
138
+ # Extreme Case 2: Upscale an image
139
+ ![Denoising_Diffusion_Probabilistic_Models_Inpainting_Super_Resolution](https://user-images.githubusercontent.com/11280511/150818741-5ed19a0b-1cf8-4f28-9e57-2e4c12303c3e.png)
140
+
141
+ - The inpainting only knows pixels with a strided access of 2.
142
+ - A ratio of 3/4 of the image has to be filled.
143
+ - This is equivalent to Super-Resolution with the Nearest Neighbor kernel.
144
+
145
+ <br>
146
+
147
+ # RePaint conditions the diffusion model on the known part
148
+
149
+ - RePaint uses unconditionally trained Denoising Diffusion Probabilistic Models.
150
+ - We condition during inference on the given image content.
151
+
152
+ ![Denoising Diffusion Probabilistic Models Inpainting Method](https://user-images.githubusercontent.com/11280511/180631151-59b6674b-bf2c-4501-8307-03c9f5f593ae.gif)
153
+
154
+ **Intuition of one conditioned denoising step:**
155
+ 1) **Sample the known part:** Add Gaussian noise to the known regions of the image. <br> We obtain a noisy image that follows the denoising process exactly.
156
+ 2) **Denoise one step:** Denoise the previous image for one step. This generates <br> content for the unknown region conditioned on the known region.
157
+ 3) **Join:** Merge the images from both steps.
158
+
159
+ Details are in Algorithm 1 on Page 5. [[Paper]](https://bit.ly/3b1ABEb)
160
+
161
+
162
+ <br>
163
+
164
+ # How to harmonize the generated with the known part?
165
+
166
+ - **Fail:** When using only the algorithm above, the filling is not well harmonized with the known part (n=1).
167
+ - **Fix:** When applying the [[4.2 Resampling]](https://bit.ly/3b1ABEb) technique, the images are better harmonized (n>1).
168
+
169
+ <img width="1577" alt="Diffusion Model Resampling" src="https://user-images.githubusercontent.com/11280511/150822917-737c00b0-b6bb-439d-a5bf-e73238d30990.png">
170
+
171
+ <br>
172
+
173
+ # RePaint Fails
174
+ - The ImageNet model is biased towards inpainting dogs.
175
+ - This is due to the high ratio of dog images in ImageNet.
176
+
177
+ <img width="1653" alt="RePaint Fails" src="https://user-images.githubusercontent.com/11280511/150853163-b965f59c-5ad4-485b-816e-4391e77b5199.png">
178
+
179
+ <br>
180
+
181
+ # User Study State-of-the-Art Comparison
182
+
183
+ - Outperforms autoregression-based and GAN-based SOTA methods, <br> with 95% significance for all masks except for two inconclusive cases.
184
+ - The user study was done for six different masks on three datasets.
185
+ - RePaint outperformed SOTA methods in 42 of 44 cases. [[Paper]](https://bit.ly/3b1ABEb)
186
+
187
+ <br>
188
+
189
+ # Explore the Visual Examples
190
+ - Datasets: CelebA-HQ, ImageNet, Places2
191
+ - Masks: Random strokes, half image, huge, sparse
192
+ - Explore more examples like this in the [[Appendix]](https://bit.ly/3b1ABEb).
193
+
194
+
195
+ <img width="1556" alt="Denosing Diffusion Inpainting Examples" src="https://user-images.githubusercontent.com/11280511/150864677-0eb482ae-c114-4b0b-b1e0-9be9574da307.png">
196
+
197
+
198
+ <br>
199
+
200
+
201
+ # Acknowledgement
202
+
203
+ This work was supported by the ETH Zürich Fund (OK), a Huawei Technologies Oy (Finland) project, and an Nvidia GPU grant.
204
+
205
+ This repository is based on [guided-diffusion](https://github.com/openai/guided-diffusion.git) from OpenAI.
repaint/conf_mgt/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+
18
+ from conf_mgt.conf_base import Default_Conf
repaint/conf_mgt/conf_base.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ from functools import lru_cache
18
+ import os
19
+ import torch
20
+ from utils import imwrite
21
+
22
+ from collections import defaultdict
23
+ from os.path import isfile, expanduser
24
+
25
+ def to_file_ext(img_names, ext):
26
+ img_names_out = []
27
+ for img_name in img_names:
28
+ splits = img_name.split('.')
29
+ if not len(splits) == 2:
30
+ raise RuntimeError("File name needs exactly one '.':", img_name)
31
+ img_names_out.append(splits[0] + '.' + ext)
32
+
33
+ return img_names_out
34
+
35
+ def write_images(imgs, img_names, dir_path):
36
+ os.makedirs(dir_path, exist_ok=True)
37
+
38
+ for image_name, image in zip(img_names, imgs):
39
+ out_path = os.path.join(dir_path, image_name)
40
+ imwrite(img=image, path=out_path)
41
+
42
+
43
+
44
+ class NoneDict(defaultdict):
45
+ def __init__(self):
46
+ super().__init__(self.return_None)
47
+
48
+ @staticmethod
49
+ def return_None():
50
+ return None
51
+
52
+ def __getattr__(self, attr):
53
+ return self.get(attr)
54
+
55
+
56
+ class Default_Conf(NoneDict):
57
+ def __init__(self):
58
+ pass
59
+
60
+ def get_dataloader(self, dset='train', dsName=None, batch_size=None, return_dataset=False):
61
+
62
+ if batch_size is None:
63
+ batch_size = self.batch_size
64
+
65
+ candidates = self['data'][dset]
66
+ ds_conf = candidates[dsName].copy()
67
+
68
+ if ds_conf.get('mask_loader', False):
69
+ from guided_diffusion.image_datasets import load_data_inpa
70
+ return load_data_inpa(**ds_conf, conf=self)
71
+ else:
72
+ raise NotImplementedError()
73
+
74
+ def get_debug_variance_path(self):
75
+ return os.path.expanduser(os.path.join(self.get_default_eval_conf()['paths']['root'], 'debug/debug_variance'))
76
+
77
+ @ staticmethod
78
+ def device():
79
+ return 'cuda' if torch.cuda.is_available() else 'cpu'
80
+
81
+ def eval_imswrite(self, srs=None, img_names=None, dset=None, name=None, ext='png', lrs=None, gts=None, gt_keep_masks=None, verify_same=True):
82
+ img_names = to_file_ext(img_names, ext)
83
+
84
+ if dset is None:
85
+ dset = self.get_default_eval_name()
86
+
87
+ max_len = self['data'][dset][name].get('max_len')
88
+
89
+ if srs is not None:
90
+ sr_dir_path = expanduser(self['data'][dset][name]['paths']['srs'])
91
+ write_images(srs, img_names, sr_dir_path)
92
+
93
+ if gt_keep_masks is not None:
94
+ mask_dir_path = expanduser(
95
+ self['data'][dset][name]['paths']['gt_keep_masks'])
96
+ write_images(gt_keep_masks, img_names, mask_dir_path)
97
+
98
+ gts_path = self['data'][dset][name]['paths'].get('gts')
99
+ if gts is not None and gts_path:
100
+ gt_dir_path = expanduser(gts_path)
101
+ write_images(gts, img_names, gt_dir_path)
102
+
103
+ if lrs is not None:
104
+ lrs_dir_path = expanduser(
105
+ self['data'][dset][name]['paths']['lrs'])
106
+ write_images(lrs, img_names, lrs_dir_path)
107
+
108
+ def get_default_eval_name(self):
109
+ candidates = self['data']['eval'].keys()
110
+ if len(candidates) != 1:
111
+ raise RuntimeError(
112
+ f"Need exactly one candidate for {self.name}: {candidates}")
113
+ return list(candidates)[0]
114
+
115
+ def pget(self, name, default=None):
116
+ if '.' in name:
117
+ names = name.split('.')
118
+ else:
119
+ names = [name]
120
+
121
+ sub_dict = self
122
+ for name in names:
123
+ sub_dict = sub_dict.get(name, default)
124
+
125
+ if sub_dict == None:
126
+ return default
127
+
128
+ return sub_dict
repaint/confs/face_example.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ latex_name: RePaint
53
+ method_name: Repaint
54
+ image_size: 256
55
+ model_path: ./data/pretrained/celeba256_250000.pt
56
+ name: face_example
57
+ inpa_inj_sched_prev: true
58
+ n_jobs: 1
59
+ print_estimated_vars: true
60
+ inpa_inj_sched_prev_cumnoise: false
61
+ schedule_jump_params:
62
+ t_T: 250
63
+ n_sample: 5 # for GCDP, 1 for image2layout and 5 for layout2image
64
+ jump_length: 10
65
+ jump_n_sample: 10
66
+ data:
67
+ eval:
68
+ paper_face_mask:
69
+ mask_loader: true
70
+ gt_path: ./data/datasets/gts/gcdp
71
+ mask_path: ./data/datasets/gt_keep_masks/gcdp
72
+ image_size: 256
73
+ class_cond: false
74
+ deterministic: true
75
+ random_crop: false
76
+ random_flip: false
77
+ return_dict: true
78
+ drop_last: false
79
+ batch_size: 1
80
+ return_dataloader: true
81
+ offset: 0
82
+ max_len: 8
83
+ paths:
84
+ srs: ./log/face_example/inpainted
85
+ lrs: ./log/face_example/gt_masked
86
+ gts: ./log/face_example/gt
87
+ gt_keep_masks: ./log/face_example/gt_keep_mask
repaint/confs/test_c256_ev2li.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/celeba256_250000.pt
54
+ name: test_c256_ev2li
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_ev2li_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/ev2li
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: fix_ev2li_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_ev2li/inpainted
84
+ lrs: ./log/test_c256_ev2li/gt_masked
85
+ gts: ./log/test_c256_ev2li/gt
86
+ gt_keep_masks: ./log/test_c256_ev2li/gt_keep_mask
repaint/confs/test_c256_ex64.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: /cluster/work/cvl/gudiff/guided-diffusion/models/celeba256_diffsteps1000_4gpus/ema_0.9999_250000.pt
54
+ name: test_c256_ex64
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_ex64_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/ex64
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: fix_ex64_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_ex64/inpainted
84
+ lrs: ./log/test_c256_ex64/gt_masked
85
+ gts: ./log/test_c256_ex64/gt
86
+ gt_keep_masks: ./log/test_c256_ex64/gt_keep_mask
repaint/confs/test_c256_genhalf.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: /cluster/work/cvl/gudiff/guided-diffusion/models/celeba256_diffsteps1000_4gpus/ema_0.9999_250000.pt
54
+ name: test_c256_genhalf
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_genhalf_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/genhalf
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: fix_genhalf_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_genhalf/inpainted
84
+ lrs: ./log/test_c256_genhalf/gt_masked
85
+ gts: ./log/test_c256_genhalf/gt
86
+ gt_keep_masks: ./log/test_c256_genhalf/gt_keep_mask
repaint/confs/test_c256_nn2.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: /cluster/work/cvl/gudiff/guided-diffusion/models/celeba256_diffsteps1000_4gpus/ema_0.9999_250000.pt
54
+ name: test_c256_nn2
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_nn2_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/nn2
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: fix_nn2_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_nn2/inpainted
84
+ lrs: ./log/test_c256_nn2/gt_masked
85
+ gts: ./log/test_c256_nn2/gt
86
+ gt_keep_masks: ./log/test_c256_nn2/gt_keep_mask
repaint/confs/test_c256_thick.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: /cluster/work/cvl/gudiff/guided-diffusion/models/celeba256_diffsteps1000_4gpus/ema_0.9999_250000.pt
54
+ name: test_c256_thick
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_thick_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/thick
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_thick_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_thick/inpainted
84
+ lrs: ./log/test_c256_thick/gt_masked
85
+ gts: ./log/test_c256_thick/gt
86
+ gt_keep_masks: ./log/test_c256_thick/gt_keep_mask
repaint/confs/test_c256_thin.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: /cluster/work/cvl/gudiff/guided-diffusion/models/celeba256_diffsteps1000_4gpus/ema_0.9999_250000.pt
54
+ name: test_c256_thin
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_c256_thin_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/c256
69
+ mask_path: ./data/datasets/gt_keep_masks/thin
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_thin_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_c256_thin/inpainted
84
+ lrs: ./log/test_c256_thin/gt_masked
85
+ gts: ./log/test_c256_thin/gt
86
+ gt_keep_masks: ./log/test_c256_thin/gt_keep_mask
repaint/confs/test_inet256_ev2li.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_ev2li
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_ev2li_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/ev2li
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_ev2li_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_ev2li/inpainted
85
+ lrs: ./log/test_inet256_ev2li/gt_masked
86
+ gts: ./log/test_inet256_ev2li/gt
87
+ gt_keep_masks: ./log/test_inet256_ev2li/gt_keep_mask
repaint/confs/test_inet256_ex64.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_ex64
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_ex64_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/ex64
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_ex64_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_ex64/inpainted
85
+ lrs: ./log/test_inet256_ex64/gt_masked
86
+ gts: ./log/test_inet256_ex64/gt
87
+ gt_keep_masks: ./log/test_inet256_ex64/gt_keep_mask
repaint/confs/test_inet256_genhalf.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_genhalf
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_genhalf_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/genhalf
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_genhalf_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_genhalf/inpainted
85
+ lrs: ./log/test_inet256_genhalf/gt_masked
86
+ gts: ./log/test_inet256_genhalf/gt
87
+ gt_keep_masks: ./log/test_inet256_genhalf/gt_keep_mask
repaint/confs/test_inet256_nn2.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_nn2
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_nn2_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/nn2
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_nn2_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_nn2/inpainted
85
+ lrs: ./log/test_inet256_nn2/gt_masked
86
+ gts: ./log/test_inet256_nn2/gt
87
+ gt_keep_masks: ./log/test_inet256_nn2/gt_keep_mask
repaint/confs/test_inet256_thick.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_thick
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_thick_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/thick
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_thick_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_thick/inpainted
85
+ lrs: ./log/test_inet256_thick/gt_masked
86
+ gts: ./log/test_inet256_thick/gt
87
+ gt_keep_masks: ./log/test_inet256_thick/gt_keep_mask
repaint/confs/test_inet256_thin.yml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: true
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: true
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 1.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ classifier_path: ./data/pretrained/256x256_classifier.pt
54
+ model_path: ./data/pretrained/256x256_diffusion.pt
55
+ name: test_inet256_thin
56
+ inpa_inj_sched_prev: true
57
+ n_jobs: 25
58
+ print_estimated_vars: true
59
+ inpa_inj_sched_prev_cumnoise: false
60
+ schedule_jump_params:
61
+ t_T: 250
62
+ n_sample: 1
63
+ jump_length: 10
64
+ jump_n_sample: 10
65
+ data:
66
+ eval:
67
+ lama_inet256_thin_n100_test:
68
+ mask_loader: true
69
+ gt_path: ./data/datasets/gts/inet256
70
+ mask_path: ./data/datasets/gt_keep_masks/thin
71
+ image_size: 256
72
+ class_cond: false
73
+ deterministic: true
74
+ random_crop: false
75
+ random_flip: false
76
+ return_dict: true
77
+ drop_last: false
78
+ batch_size: 4
79
+ return_dataloader: true
80
+ ds_conf:
81
+ name: random_thin_256
82
+ max_len: 100
83
+ paths:
84
+ srs: ./log/test_inet256_thin/inpainted
85
+ lrs: ./log/test_inet256_thin/gt_masked
86
+ gts: ./log/test_inet256_thin/gt
87
+ gt_keep_masks: ./log/test_inet256_thin/gt_keep_mask
repaint/confs/test_p256_ev2li.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_ev2li
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_ev2li_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/ev2li
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_ev2li_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_ev2li/inpainted
84
+ lrs: ./log/test_p256_ev2li/gt_masked
85
+ gts: ./log/test_p256_ev2li/gt
86
+ gt_keep_masks: ./log/test_p256_ev2li/gt_keep_mask
repaint/confs/test_p256_ex64.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_ex64
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_ex64_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/ex64
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_ex64_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_ex64/inpainted
84
+ lrs: ./log/test_p256_ex64/gt_masked
85
+ gts: ./log/test_p256_ex64/gt
86
+ gt_keep_masks: ./log/test_p256_ex64/gt_keep_mask
repaint/confs/test_p256_genhalf.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_genhalf
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_genhalf_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/genhalf
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_genhalf_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_genhalf/inpainted
84
+ lrs: ./log/test_p256_genhalf/gt_masked
85
+ gts: ./log/test_p256_genhalf/gt
86
+ gt_keep_masks: ./log/test_p256_genhalf/gt_keep_mask
repaint/confs/test_p256_nn2.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_nn2
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_nn2_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/nn2
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_nn2_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_nn2/inpainted
84
+ lrs: ./log/test_p256_nn2/gt_masked
85
+ gts: ./log/test_p256_nn2/gt
86
+ gt_keep_masks: ./log/test_p256_nn2/gt_keep_mask
repaint/confs/test_p256_thick.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_thick
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_thick_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/thick
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_thick_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_thick/inpainted
84
+ lrs: ./log/test_p256_thick/gt_masked
85
+ gts: ./log/test_p256_thick/gt
86
+ gt_keep_masks: ./log/test_p256_thick/gt_keep_mask
repaint/confs/test_p256_thin.yml ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ attention_resolutions: 32,16,8
18
+ class_cond: false
19
+ diffusion_steps: 1000
20
+ learn_sigma: true
21
+ noise_schedule: linear
22
+ num_channels: 256
23
+ num_head_channels: 64
24
+ num_heads: 4
25
+ num_res_blocks: 2
26
+ resblock_updown: true
27
+ use_fp16: false
28
+ use_scale_shift_norm: true
29
+ classifier_scale: 4.0
30
+ lr_kernel_n_std: 2
31
+ num_samples: 100
32
+ show_progress: true
33
+ timestep_respacing: '250'
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_timesteps: false
37
+ rescale_learned_sigmas: false
38
+ classifier_use_fp16: false
39
+ classifier_width: 128
40
+ classifier_depth: 2
41
+ classifier_attention_resolutions: 32,16,8
42
+ classifier_use_scale_shift_norm: true
43
+ classifier_resblock_updown: true
44
+ classifier_pool: attention
45
+ num_heads_upsample: -1
46
+ channel_mult: ''
47
+ dropout: 0.0
48
+ use_checkpoint: false
49
+ use_new_attention_order: false
50
+ clip_denoised: true
51
+ use_ddim: false
52
+ image_size: 256
53
+ model_path: ./data/pretrained/places256_300000.pt
54
+ name: test_p256_thin
55
+ inpa_inj_sched_prev: true
56
+ n_jobs: 25
57
+ print_estimated_vars: true
58
+ inpa_inj_sched_prev_cumnoise: false
59
+ schedule_jump_params:
60
+ t_T: 250
61
+ n_sample: 1
62
+ jump_length: 10
63
+ jump_n_sample: 10
64
+ data:
65
+ eval:
66
+ lama_p256_thin_n100_test:
67
+ mask_loader: true
68
+ gt_path: ./data/datasets/gts/p256
69
+ mask_path: ./data/datasets/gt_keep_masks/thin
70
+ image_size: 256
71
+ class_cond: false
72
+ deterministic: true
73
+ random_crop: false
74
+ random_flip: false
75
+ return_dict: true
76
+ drop_last: false
77
+ batch_size: 4
78
+ return_dataloader: true
79
+ ds_conf:
80
+ name: random_thin_256
81
+ max_len: 100
82
+ paths:
83
+ srs: ./log/test_p256_thin/inpainted
84
+ lrs: ./log/test_p256_thin/gt_masked
85
+ gts: ./log/test_p256_thin/gt
86
+ gt_keep_masks: ./log/test_p256_thin/gt_keep_mask
repaint/download.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ (
4
+ mkdir -p data/pretrained
5
+ cd data/pretrained
6
+
7
+ wget https://openaipublic.blob.core.windows.net/diffusion/jul-2021/256x256_classifier.pt # Trained by OpenAI
8
+ wget https://openaipublic.blob.core.windows.net/diffusion/jul-2021/256x256_diffusion.pt # Trained by OpenAI
9
+
10
+ gdown https://drive.google.com/uc?id=1norNWWGYP3EZ_o05DmoW1ryKuKMmhlCX
11
+ gdown https://drive.google.com/uc?id=1QEl-btGbzQz6IwkXiFGd49uQNTUtTHsk
12
+ )
13
+
14
+ # data
15
+ (
16
+ gdown https://drive.google.com/uc?id=1Q_dxuyI41AAmSv9ti3780BwaJQqwvwMv
17
+ unzip data.zip
18
+ rm data.zip
19
+ )
repaint/guided_diffusion/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ """
18
+ Based on "Improved Denoising Diffusion Probabilistic Models".
19
+ """
repaint/guided_diffusion/dist_util.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022 Huawei Technologies Co., Ltd.
2
+ # Licensed under CC BY-NC-SA 4.0 (Attribution-NonCommercial-ShareAlike 4.0 International) (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode
7
+ #
8
+ # The code is released for academic research use only. For commercial use, please contact Huawei Technologies Co., Ltd.
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # This repository was forked from https://github.com/openai/guided-diffusion, which is under the MIT license
16
+
17
+ """
18
+ Helpers for distributed training.
19
+ """
20
+
21
+ import io
22
+
23
+ import blobfile as bf
24
+ import torch as th
25
+
26
+
27
def dev(device):
    """
    Resolve the torch device to use.

    :param device: an explicit device (string or ``th.device``), or ``None``
        to auto-select.
    :return: ``th.device("cuda")`` when no device was requested and CUDA is
        available; ``th.device("cpu")`` when no device was requested and CUDA
        is unavailable; otherwise the requested device.
    """
    if device is None:
        # Prefer the GPU when one is present; fall back to CPU.
        if th.cuda.is_available():
            return th.device("cuda")  # was f"cuda": f-prefix had no placeholders
        return th.device("cpu")
    return th.device(device)
36
+
37
+
38
def load_state_dict(path, backend=None, **kwargs):
    """
    Load a PyTorch checkpoint from ``path``.

    The file is read fully into memory through ``blobfile`` (so remote/blob
    paths work transparently) before being handed to ``th.load``. Extra
    ``kwargs`` are forwarded to ``th.load``. ``backend`` is accepted for API
    compatibility but is not used here.
    """
    with bf.BlobFile(path, "rb") as handle:
        raw = handle.read()
    buffer = io.BytesIO(raw)
    return th.load(buffer, **kwargs)
42
+
43
+