weiyuyeh committed on
Commit
a03472d
·
1 Parent(s): e41b2aa
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +57 -0
  2. README.md +2 -2
  3. app.py +152 -0
  4. requirements.txt +51 -0
  5. src/multiview_consist_edit/MVHumanNet_multi.py +403 -0
  6. src/multiview_consist_edit/Thuman2_multi.py +366 -0
  7. src/multiview_consist_edit/config/infer_tryon_multi.yaml +44 -0
  8. src/multiview_consist_edit/config/train_tryon_multi.yaml +137 -0
  9. src/multiview_consist_edit/data/MVHumanNet_multi.py +406 -0
  10. src/multiview_consist_edit/data/Thuman2_multi.py +367 -0
  11. src/multiview_consist_edit/data/camera_utils.py +479 -0
  12. src/multiview_consist_edit/infer_tryon_multi.py +185 -0
  13. src/multiview_consist_edit/models/ReferenceEncoder.py +67 -0
  14. src/multiview_consist_edit/models/ReferenceNet.py +1146 -0
  15. src/multiview_consist_edit/models/ReferenceNet_attention_multi_fp16.py +297 -0
  16. src/multiview_consist_edit/models/attention.py +320 -0
  17. src/multiview_consist_edit/models/condition_encoder.py +395 -0
  18. src/multiview_consist_edit/models/embeddings.py +385 -0
  19. src/multiview_consist_edit/models/hack_poseguider.py +97 -0
  20. src/multiview_consist_edit/models/hack_unet2d.py +329 -0
  21. src/multiview_consist_edit/models/mv_attn_processor.py +132 -0
  22. src/multiview_consist_edit/models/resnet.py +212 -0
  23. src/multiview_consist_edit/models/unet.py +523 -0
  24. src/multiview_consist_edit/parse_tool/postprocess_parse.py +42 -0
  25. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/__init__.py +0 -0
  26. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/datasets.py +201 -0
  27. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/simple_extractor_dataset.py +89 -0
  28. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/target_generation.py +40 -0
  29. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/__init__.py +5 -0
  30. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/bn.py +132 -0
  31. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/deeplab.py +84 -0
  32. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/dense.py +42 -0
  33. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/functions.py +245 -0
  34. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/misc.py +21 -0
  35. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/residual.py +182 -0
  36. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/checks.h +15 -0
  37. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn.cpp +95 -0
  38. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn.h +88 -0
  39. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp +119 -0
  40. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu +333 -0
  41. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu +275 -0
  42. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/checks.h +15 -0
  43. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/common.h +49 -0
  44. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/cuda.cuh +71 -0
  45. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/AugmentCE2P.py +388 -0
  46. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/__init__.py +12 -0
  47. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/mobilenetv2.py +156 -0
  48. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/resnet.py +205 -0
  49. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/resnext.py +149 -0
  50. src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/context_encoding/aspp.py +64 -0
.gitignore ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python 编译文件和缓存
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.so
7
+
8
+ # Python 虚拟环境
9
+ venv/
10
+ .env/
11
+ .venv/
12
+ env/
13
+ virtualenvs/
14
+ .Python/
15
+
16
+ # Python 打包和分发
17
+ build/
18
+ dist/
19
+ *.egg-info/
20
+ *.egg
21
+ *.whl
22
+ *.tar.gz
23
+
24
+ # 测试相关
25
+ .coverage
26
+ htmlcov/
27
+ .pytest_cache/
28
+ .mypy_cache/
29
+
30
+ # IDE 和编辑器
31
+ .idea/
32
+ .vscode/
33
+ *.suo
34
+ *.sublime-workspace
35
+ *.sublime-project
36
+
37
+ # 环境变量文件
38
+ .env
39
+ .env.local
40
+ .env.*
41
+
42
+ # 日志文件
43
+ *.log
44
+ *.log.*
45
+
46
+ # 系统文件
47
+ .DS_Store
48
+ Thumbs.db
49
+
50
+ src/render_from_thuman/ckpt/
51
+
52
+ # data
53
+ demo_data/
54
+
55
+ # models
56
+ src/multiview_consist_edit/checkpoints/
57
+ src/multiview_consist_edit/parse_tool/ckpt/
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  title: VTON360
3
- emoji:
4
- colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.38.2
 
1
  ---
2
  title: VTON360
3
+ emoji: 🐢
4
+ colorFrom: purple
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.38.2
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import subprocess
3
+ import os
4
+ import shutil
5
+ import sys
6
+
7
# Absolute filesystem paths for the Hugging Face Space layout (/home/user/app).
target_paths = {
    "data": "/home/user/app/upload/data.zip",          # uploaded input archive
    "data_dir": "/home/user/app/upload/data",          # where the archive is unpacked
    "config": "/home/user/app/src/multiview_consist_edit/config/infer_tryon_multi.yaml",  # inference config destination
    "output_data": "/home/user/app/image_output_tryon_mvhumannet",  # inference results directory
    "output_zip": "/home/user/app/outputs/result.zip", # downloadable result archive
}
14
+
15
def unzip_data():
    """Unpack the uploaded archive into a fresh data directory.

    Returns:
        The extraction directory path.

    Raises:
        FileNotFoundError: when no archive has been uploaded yet.
    """
    archive = target_paths["data"]
    extract_dir = target_paths["data_dir"]
    # Guard clause: fail fast when the upload is missing.
    if not os.path.exists(archive):
        raise FileNotFoundError("Data file not found at " + archive)
    # Start from a clean directory so stale files never leak into a new run.
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)
    os.makedirs(extract_dir, exist_ok=True)
    shutil.unpack_archive(archive, extract_dir)
    return extract_dir
24
+
25
+
26
def zip_outputs():
    """Zip the inference output directory and return the archive path.

    Rebuilds the archive from scratch on every call so the download always
    reflects the latest run.
    """
    out_zip = target_paths["output_zip"]
    if os.path.exists(out_zip):
        os.remove(out_zip)
    # Ensure the parent directory exists; make_archive does not create it
    # and would otherwise fail on a fresh container.
    os.makedirs(os.path.dirname(out_zip), exist_ok=True)
    # removesuffix only strips a trailing ".zip"; the original
    # str.replace(".zip", "") would also mangle a ".zip" occurring
    # anywhere else in the path.
    base_name = out_zip.removesuffix(".zip")
    shutil.make_archive(base_name, 'zip', root_dir=target_paths["output_data"])
    return out_zip
31
+
32
+
33
def start_inference_stream():
    """Run the inference script and stream its log to the caller.

    Yields the full accumulated stdout/stderr text after every line, so a
    Gradio Textbox bound to this generator updates incrementally.
    """
    process = subprocess.Popen(
        ["python", "src/multiview_consist_edit/infer_tryon_multi.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr into the same stream
        text=True,                 # (universal_newlines was a redundant alias)
        bufsize=1,                 # line-buffered
    )

    output = []
    for line in process.stdout:
        output.append(line)
        yield "".join(output)

    # BUG FIX: the original never wait()ed, leaving a zombie child process.
    # Reap it and surface a non-zero exit status in the log.
    return_code = process.wait()
    if return_code != 0:
        output.append(f"\n[inference process exited with code {return_code}]\n")
        yield "".join(output)
47
+
48
def install_package(package_name):
    """pip-install *package_name* and return the combined stdout/stderr text."""
    try:
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "install", package_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        return f"Error: {str(e)}"
    return completed.stdout + "\n" + completed.stderr
60
+
61
+
62
def show_package(pkg_name):
    """Return `pip show` output for *pkg_name*, or stderr when it fails."""
    try:
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "show", pkg_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        return str(e)
    # pip show prints nothing on stdout for unknown packages; fall back to stderr.
    return completed.stdout if completed.stdout else completed.stderr
73
+
74
+
75
def uninstall_package(package_name):
    """pip-uninstall *package_name* (non-interactive) and return combined output."""
    try:
        completed = subprocess.run(
            [sys.executable, "-m", "pip", "uninstall", package_name, "-y"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        return f"Error: {str(e)}"
    return completed.stdout + "\n" + completed.stderr
87
+
88
+ # print(uninstall_package("datasets"))
89
+ # print(install_package("uvicorn==0.30.6"))
90
+ # print(install_package("huggingface_hub==0.25.1"))
91
+ # print(install_package("diffusers==0.25.1"))
92
+ # print(install_package("gradio==5.0.0"))
93
+ # print("package version set complete")
94
+
95
def save_files(data_file, config_file):
    """Persist the uploaded archive and config to their target paths, then unpack the data."""
    # Create both destination directories up front.
    for key in ("data", "config"):
        os.makedirs(os.path.dirname(target_paths[key]), exist_ok=True)

    shutil.copy(data_file.name, target_paths["data"])
    shutil.copy(config_file.name, target_paths["config"])
    unzip_data()
    return "檔案已成功上傳、儲存並解壓縮了!"
103
+
104
+
105
# --- Gradio UI -------------------------------------------------------------
# NOTE(review): demo.launch() runs at import time; consider a
# `if __name__ == "__main__":` guard — confirm how the Space invokes this file.
with gr.Blocks(theme=gr.themes.Origin()) as demo:
    # Upload section: data archive + inference config.
    gr.Markdown("## 請先上傳檔案")
    with gr.Row():
        data_input = gr.File(label="上傳資料壓縮檔", file_types=[".zip"])
        config_input = gr.File(label="Config 檔", file_types=[".yaml", ".yml"])

    upload_button = gr.Button("上傳並儲存")
    output = gr.Textbox(label="狀態")


    # Inference section: streams the subprocess log into the textbox.
    gr.Markdown("## Inference")
    with gr.Column():
        log_output = gr.Textbox(label="Inference Log", lines=20)
        infer_btn = gr.Button("Start Inference")

    # Runtime package management helpers (debugging aids for the Space).
    gr.Markdown("## Pip Installer")
    with gr.Column():
        with gr.Row():
            pkg_input = gr.Textbox(lines=1, placeholder="輸入想安裝的套件名稱,例如 diffusers 或 numpy==1.2.0")
            install_output = gr.Textbox(label="Install Output", lines=10)
        install_btn = gr.Button("Install Package")

    gr.Markdown("## Pip Uninstaller")
    with gr.Column():
        with gr.Row():
            pkg_input2 = gr.Textbox(lines=1, placeholder="輸入想解除安裝的套件名稱,例如 diffusers 或 numpy")
            uninstall_output = gr.Textbox(label="Uninstall Output", lines=10)
        uninstall_btn = gr.Button("Uninstall Package")

    gr.Markdown("## Pip show")
    with gr.Column():
        with gr.Row():
            show_input = gr.Textbox(label="輸入套件名稱(如 diffusers)")
            show_output = gr.Textbox(label="套件資訊", lines=10)
        show_btn = gr.Button("pip show")

    # Result download: zips the output directory on demand.
    gr.Markdown("## Download results")
    with gr.Column():
        file_output = gr.File(label="點擊下載", interactive=True)
        download_btn = gr.Button("下載結果")

    # Event wiring: each button drives exactly one handler defined above.
    show_btn.click(fn=show_package, inputs=show_input, outputs=show_output)
    download_btn.click(fn=zip_outputs, outputs=file_output)
    install_btn.click(fn=install_package, inputs=pkg_input, outputs=install_output)
    infer_btn.click(fn=start_inference_stream, outputs=log_output)
    uninstall_btn.click(fn=uninstall_package, inputs=pkg_input2, outputs=uninstall_output)
    upload_button.click(fn=save_files,inputs=[data_input, config_input],outputs=output)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu118
2
+ accelerate==0.25.0
3
+ av==12.3.0
4
+ basicsr==1.4.2
5
+ black==25.1.0
6
+ cityscapesscripts==2.2.4
7
+ cloudpickle==3.1.1
8
+ diffusers==0.25.1
9
+ einops==0.8.1
10
+ fairscale==0.4.13
11
+ fvcore==0.1.5.post20221221
12
+ gsplat==0.1.2.1
13
+ hydra-core==1.3.2
14
+ iopath==0.1.10
15
+ kornia==0.7.3
16
+ matplotlib==3.10.3
17
+ mmcv==2.2.0
18
+ mmdet==3.3.0
19
+ nerfstudio==1.0.0
20
+ numpy==1.24.4
21
+ omegaconf==2.3.0
22
+ onnx==1.17.0
23
+ onnxruntime==1.16.2
24
+ open_clip_torch==2.22.0
25
+ opencv_python==4.8.0.76
26
+ packaging==25.0
27
+ Pillow==11.2.1
28
+ pycocotools==2.0.8
29
+ Pygments==2.19.1
30
+ pytorch_msssim==1.0.0
31
+ PyYAML==6.0.1
32
+ Requests==2.32.3
33
+ safetensors==0.5.3
34
+ scikit_learn==1.6.1
35
+ scipy==1.15.3
36
+ setuptools==69.5.1
37
+ Shapely==2.1.0
38
+ scikit-image
39
+ tabulate==0.9.0
40
+ taichi==1.7.3
41
+ taichi_glsl==0.0.12
42
+ termcolor==3.1.0
43
+ timm
44
+ torch==2.1.2+cu118
45
+ torchvision==0.16.2+cu118
46
+ torchmetrics==1.7.1
47
+ tqdm==4.66.4
48
+ transformers==4.42.3
49
+ typing_extensions==4.13.2
50
+
51
+ xformers==0.0.23.post1
src/multiview_consist_edit/MVHumanNet_multi.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, csv, math, random
2
+ import numpy as np
3
+ from PIL import Image,ImageDraw
4
+ import json
5
+ import torch
6
+ import torchvision
7
+ import torchvision.transforms as transforms
8
+ from torch.utils.data.dataset import Dataset
9
+ from transformers import CLIPProcessor
10
+ import random
11
+ from torchvision.transforms import functional as F
12
+ import torch.distributed as dist
13
+ import copy
14
+ import cv2
15
+ import pickle
16
+ from .camera_utils import read_camera_mvhumannet
17
+
18
def crop_and_resize(img, bbox, size, aspect=2 / 3):
    """Crop *img* to a fixed-aspect window centred on *bbox*, then resize.

    The crop keeps the bbox height and derives the width from *aspect*
    (width / height, default 2:3), so all crops share one aspect ratio.
    Generalized: the original hard-coded the 2/3 ratio.

    Args:
        img: a PIL.Image (anything exposing ``crop``/``resize`` works).
        bbox: (left, top, right, bottom) pixel coordinates.
        size: target (width, height) passed straight to ``resize``.
        aspect: width-to-height ratio of the crop window.

    Returns:
        The cropped and resized image.
    """
    # Centre point of the original bounding box.
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    # Keep the bbox height; derive the width from the fixed aspect ratio.
    new_height = bbox[3] - bbox[1]
    new_width = int(new_height * aspect)

    # New bounding box, centred on the original one.
    new_bbox = [
        int(center_x - new_width / 2),
        int(center_y - new_height / 2),
        int(center_x + new_width / 2),
        int(center_y + new_height / 2)
    ]

    cropped_img = img.crop(new_bbox)
    return cropped_img.resize(size)
41
+
42
+
43
class MVHumanNet_Dataset(Dataset):
    """Multi-view try-on dataset for MVHumanNet.

    Each item bundles several views of one subject/frame (person image,
    normal-map "pose" image, cloth-agnostic image, camera extrinsics)
    together with front/back cloth reference images and their
    CLIP-processed counterparts. In test mode every person is paired with
    every cloth in the cloth id list.
    """

    def __init__(
        self, dataroot, sample_size=(512,384), is_train=True, mode='pair', clip_model_path='', multi_length=8,
    ):
        """
        Args:
            dataroot: root containing ``processed_mvhumannet``, ``cloth``
                and the id-list text files.
            sample_size: (height, width) all images are resized to.
            is_train: selects the train frame list vs. test id/cloth lists.
            mode: unused here; kept for interface compatibility.
            clip_model_path: local path of the pretrained CLIP processor.
            multi_length: number of views sampled per item in training.
        """
        self.dataroot = os.path.join(dataroot, 'processed_mvhumannet')
        self.cloth_root = os.path.join(dataroot, 'cloth')
        self.data_ids = []
        self.data_frame_ids = []
        self.cloth_ids = []
        self.cloth_frame_ids = []
        if is_train:
            # Each line: "<data_id> <frame_id>". (Files now closed via `with`;
            # the original leaked handles on exceptions.)
            with open(os.path.join(dataroot, 'train_frame_ids.txt')) as f:
                for line in f:
                    line_info = line.strip().split()
                    self.data_ids.append(line_info[0])
                    self.data_frame_ids.append(line_info[1])
        else:
            with open(os.path.join(dataroot, 'test_ids.txt')) as f:
                for line in f:
                    line_info = line.strip().split()
                    self.data_ids.append(line_info[0])
                    self.data_frame_ids.append(line_info[1])
            # Cloth list drives the person x cloth pairing at test time.
            with open(os.path.join(dataroot, 'test_cloth_ids.txt')) as f2:
                for line in f2:
                    line_info = line.strip().split()
                    self.cloth_ids.append(line_info[0])
                    self.cloth_frame_ids.append(line_info[1])

        self.is_train = is_train
        self.sample_size = sample_size
        self.multi_length = multi_length
        self.clip_image_processor = CLIPProcessor.from_pretrained(clip_model_path, local_files_only=True)

        # Main image transform: resize (nearest, interpolation=0) then map to [-1, 1].
        self.pixel_transforms = transforms.Compose([
            transforms.Resize(self.sample_size, interpolation=0),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])

        # Split variants kept for callers that normalize/resize separately.
        self.pixel_transforms_0 = transforms.Compose([
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.pixel_transforms_1 = transforms.Compose([
            transforms.Resize(self.sample_size, interpolation=0),
        ])

        # Cloth reference transforms: mild random affine jitter in training only.
        self.ref_transforms_train = transforms.Compose([
            transforms.Resize(self.sample_size),
            transforms.RandomAffine(degrees=0, translate=(0.08, 0.08), scale=(0.9, 1.1)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.ref_transforms_test = transforms.Compose([
            transforms.Resize(self.sample_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])

    def __len__(self):
        # Test mode enumerates the full person x cloth cross product.
        if len(self.cloth_ids) >= 1:
            return len(self.data_ids) * len(self.cloth_ids)
        else:
            return len(self.data_ids)

    def __getitem__(self, idx):
        if len(self.cloth_ids) >= 1:
            # Decompose the flat index into a (person, cloth) pair.
            data_idx = idx // len(self.cloth_ids)
            cloth_idx = idx % len(self.cloth_ids)

            data_id = self.data_ids[data_idx]
            frame_id = self.data_frame_ids[data_idx]
            cloth_id = self.cloth_ids[cloth_idx]
            cloth_frame_id = self.cloth_frame_ids[cloth_idx]
            # NOTE: "front"/"back" labels are swapped in the raw data
            # (original comment: 实际是反的 -- "actually reversed").
            cloth_name_front = os.path.join(self.cloth_root, '%s_%s_front.jpg' % (cloth_id, cloth_frame_id))
            cloth_name_back = os.path.join(self.cloth_root, '%s_%s_back.jpg' % (cloth_id, cloth_frame_id))
        else:
            data_id = self.data_ids[idx]
            frame_id = self.data_frame_ids[idx]
            cloth_name_front = os.path.join(self.cloth_root, '%s_%s_front.jpg' % (data_id, frame_id))
            cloth_name_back = os.path.join(self.cloth_root, '%s_%s_back.jpg' % (data_id, frame_id))

        images_root = os.path.join(self.dataroot, data_id, 'agnostic', frame_id)
        images = sorted(os.listdir(images_root))

        if self.is_train:
            # Camera CC32871A015 is excluded from training views.
            check_images = [image for image in images if 'CC32871A015' not in image]
            select_images = random.sample(check_images, self.multi_length)
        else:
            # Fixed front-facing camera set used for evaluation.
            front_cameras = [
                'CC32871A005','CC32871A016','CC32871A017','CC32871A023','CC32871A027',
                'CC32871A030','CC32871A032','CC32871A033','CC32871A034','CC32871A035',
                'CC32871A038','CC32871A050','CC32871A051','CC32871A052','CC32871A059', 'CC32871A060'
            ]
            # Back-facing cameras; currently unused, kept for reference.
            back_cameras = [
                'CC32871A004','CC32871A010', 'CC32871A013', 'CC32871A022', 'CC32871A029',
                'CC32871A031','CC32871A037', 'CC32871A039', 'CC32871A040', 'CC32871A044',
                'CC32871A046','CC32871A048', 'CC32871A055', 'CC32871A057', 'CC32871A058', 'CC32871A041'
            ]
            select_images = []
            for image in images:
                camera_id = image.split('_')[0]
                if camera_id in front_cameras:
                    select_images.append(image)
            select_images = sorted(select_images)
        # Turn bare file names into dataroot-relative image paths.
        for i in range(len(select_images)):
            select_images[i] = os.path.join(data_id, 'resized_img', frame_id, select_images[i])
        sample = self.load_images(select_images, data_id, cloth_name_front, cloth_name_back)
        return sample

    def load_images(self, select_images, data_id, cloth_name_front, cloth_name_back):
        """Load the selected views plus cloth references into one sample dict."""
        pixel_values_list = []
        pixel_values_pose_list = []
        camera_parm_list = []
        pixel_values_agnostic_list = []
        image_name_list = []

        # Load camera calibration for this subject.
        intri_name = os.path.join(self.dataroot, data_id, 'camera_intrinsics.json')
        extri_name = os.path.join(self.dataroot, data_id, 'camera_extrinsics.json')
        camera_scale_fn = os.path.join(self.dataroot, data_id, 'camera_scale.pkl')
        with open(camera_scale_fn, "rb") as fh:
            camera_scale = pickle.load(fh)
        cameras_gt = read_camera_mvhumannet(intri_name, extri_name, camera_scale)

        # Load every selected view of the person.
        for img_name in select_images:
            camera_id = img_name.split('/')[-1].split('_')[0]

            image_name_list.append(img_name)
            pixel_values = Image.open(os.path.join(self.dataroot, img_name))
            pixel_values_pose = Image.open(os.path.join(self.dataroot, img_name).replace('resized_img', 'normals').replace('.jpg', '_normal.jpg'))
            pixel_values_agnostic = Image.open(os.path.join(self.dataroot, img_name).replace('resized_img', 'agnostic'))
            parm_matrix = cameras_gt[camera_id]['RT']  # extrinsic

            # Crop the normal map around the person using the annotated bbox.
            annot_path = os.path.join(self.dataroot, img_name.replace('resized_img', 'annots').replace('.jpg', '.json'))
            with open(annot_path) as fh:
                annot_info = json.load(fh)
            bbox = annot_info['annots'][0]['bbox']
            width = annot_info['width']
            if width == 4096 or width == 2448:
                # Annotations were made at full resolution; images are half-size.
                for i in range(4):
                    bbox[i] = bbox[i] // 2
            elif width == 2048:
                pass
            else:
                # BUG FIX: original referenced undefined ``img_path`` here,
                # which raised NameError instead of printing a warning.
                print('wrong annot size', img_name)
            pixel_values_pose = crop_and_resize(pixel_values_pose, bbox, size=self.sample_size)

            # Camera parameter: flattened 3x3 rotation of the extrinsic.
            parm_matrix = torch.tensor(parm_matrix)
            camera_parm = parm_matrix[:3, :3].reshape(-1)  # todo

            # Normalize all three images to [-1, 1].
            pixel_values = self.pixel_transforms(pixel_values)
            pixel_values_pose = self.pixel_transforms(pixel_values_pose)
            pixel_values_agnostic = self.pixel_transforms(pixel_values_agnostic)

            pixel_values_list.append(pixel_values)
            pixel_values_pose_list.append(pixel_values_pose)
            camera_parm_list.append(camera_parm)
            pixel_values_agnostic_list.append(pixel_values_agnostic)

        pixel_values = torch.stack(pixel_values_list)
        pixel_values_pose = torch.stack(pixel_values_pose_list)
        camera_parm = torch.stack(camera_parm_list)
        pixel_values_agnostic = torch.stack(pixel_values_agnostic_list)

        # BUG FIX: cloth_name_front/back already carry the cloth_root prefix
        # (built in __getitem__); the original re-joined them with
        # self.cloth_root, which double-prefixes when dataroot is relative.
        pixel_values_cloth_front = Image.open(cloth_name_front)
        pixel_values_cloth_back = Image.open(cloth_name_back)

        # CLIP-preprocessed copies of the cloth references.
        clip_ref_front = self.clip_image_processor(images=pixel_values_cloth_front, return_tensors="pt").pixel_values
        clip_ref_back = self.clip_image_processor(images=pixel_values_cloth_back, return_tensors="pt").pixel_values

        if self.is_train:
            pixel_values_cloth_front = self.ref_transforms_train(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_train(pixel_values_cloth_back)
        else:
            pixel_values_cloth_front = self.ref_transforms_test(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_test(pixel_values_cloth_back)

        # Per-view 10% drop flag (presumably for classifier-free guidance
        # training -- TODO confirm against the training loop).
        drop_image_embeds = torch.stack([
            torch.tensor(1) if random.random() < 0.1 else torch.tensor(0)
            for _ in range(len(select_images))
        ])
        sample = dict(
            pixel_values=pixel_values,
            pixel_values_pose=pixel_values_pose,
            pixel_values_agnostic=pixel_values_agnostic,
            clip_ref_front=clip_ref_front,
            clip_ref_back=clip_ref_back,
            pixel_values_cloth_front=pixel_values_cloth_front,
            pixel_values_cloth_back=pixel_values_cloth_back,
            camera_parm=camera_parm,
            drop_image_embeds=drop_image_embeds,
            img_name=image_name_list,
            cloth_name=cloth_name_front,
        )

        return sample
269
+
270
def collate_fn(data):
    """Collate per-item sample dicts into one batch dict.

    Image-like tensors gain a leading batch axis via ``torch.stack``; the
    CLIP tensors (already shaped (1, ...)) are concatenated instead; the
    per-view name lists are flattened and cloth names collected as-is.
    Note: cloth keys are renamed ``*_cloth_*`` -> ``*_ref_*`` in the output.
    """
    def _stack(key):
        # Stack one field across all examples along a new batch dimension.
        return torch.stack([example[key] for example in data])

    img_name = []
    cloth_name = []
    for example in data:
        img_name.extend(example['img_name'])
        cloth_name.append(example['cloth_name'])

    return {
        "pixel_values": _stack("pixel_values"),
        "pixel_values_pose": _stack("pixel_values_pose"),
        "pixel_values_agnostic": _stack("pixel_values_agnostic"),
        "clip_ref_front": torch.cat([example["clip_ref_front"] for example in data]),
        "clip_ref_back": torch.cat([example["clip_ref_back"] for example in data]),
        "pixel_values_ref_front": _stack("pixel_values_cloth_front"),
        "pixel_values_ref_back": _stack("pixel_values_cloth_back"),
        "camera_parm": _stack("camera_parm"),
        "drop_image_embeds": _stack("drop_image_embeds"),
        "img_name": img_name,
        "cloth_name": cloth_name,
    }
301
+
302
+
303
if __name__ == '__main__':
    # Manual smoke test: build the dataset, load one batch through the
    # DataLoader, and dump several tensors back to JPEG files for visual
    # inspection. Exits after the first batch.
    seed = 20
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # NOTE(review): hard-coded cluster paths; only runnable on that machine.
    dataset = MVHumanNet_Dataset(dataroot="/GPUFS/sysu_gbli2_1/hzj/mvhumannet/",
        sample_size=(768,576),is_train=True,mode='pair',
        clip_model_path = "/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32")

    # print(len(dataset))

    # for _ in range(500):

    # p = random.randint(0,len(dataset)-1)
    # p = dataset[p]

    test_dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=1,
        num_workers=2,
    )

    for _, batch in enumerate(test_dataloader):
        # print(batch['cloth_name'], batch['img_name'])
        p = {}
        print('111', batch['camera_parm'].shape)
        print('111', batch['drop_image_embeds'].shape)
        # Strip the batch dimension (batch_size=1) from every field.
        for key in batch.keys():
            p[key] = batch[key][0]
        # p = dataset[12]

        print(p['camera_parm'].shape)

        # De-normalize ([-1, 1] -> [0, 255]) and save view 0 of the person image.
        pixel_values = p['pixel_values'][0].permute(1,2,0).numpy()
        print(p['pixel_values'].shape)
        pixel_values = pixel_values / 2 + 0.5
        pixel_values *=255
        pixel_values = pixel_values.astype(np.uint8)
        pixel_values= Image.fromarray(pixel_values)
        pixel_values.save('pixel_values0.jpg')

        # View 0 of the normal-map "pose" image.
        pixel_values_pose = p['pixel_values_pose'][0].permute(1,2,0).numpy()
        print(p['pixel_values_pose'].shape)
        pixel_values_pose = pixel_values_pose / 2 + 0.5
        pixel_values_pose *=255
        pixel_values_pose = pixel_values_pose.astype(np.uint8)
        pixel_values_pose= Image.fromarray(pixel_values_pose)
        pixel_values_pose.save('pixel_values_pose.jpg')

        # View 0 of the cloth-agnostic image.
        pixel_values_agnostic = p['pixel_values_agnostic'][0].permute(1,2,0).numpy()
        print(p['pixel_values_agnostic'].shape)
        pixel_values_agnostic = pixel_values_agnostic / 2 + 0.5
        pixel_values_agnostic *=255
        pixel_values_agnostic = pixel_values_agnostic.astype(np.uint8)
        pixel_values_agnostic= Image.fromarray(pixel_values_agnostic)
        pixel_values_agnostic.save('pixel_values_agnostic.jpg')

        # Same three dumps for view 2.
        pixel_values = p['pixel_values'][2].permute(1,2,0).numpy()
        print(p['pixel_values'].shape)
        pixel_values = pixel_values / 2 + 0.5
        pixel_values *=255
        pixel_values = pixel_values.astype(np.uint8)
        pixel_values= Image.fromarray(pixel_values)
        pixel_values.save('pixel_values2.jpg')

        pixel_values_pose = p['pixel_values_pose'][2].permute(1,2,0).numpy()
        print(p['pixel_values_pose'].shape)
        pixel_values_pose = pixel_values_pose / 2 + 0.5
        pixel_values_pose *=255
        pixel_values_pose = pixel_values_pose.astype(np.uint8)
        pixel_values_pose= Image.fromarray(pixel_values_pose)
        pixel_values_pose.save('pixel_values_pose2.jpg')

        pixel_values_agnostic = p['pixel_values_agnostic'][2].permute(1,2,0).numpy()
        print(p['pixel_values_agnostic'].shape)
        pixel_values_agnostic = pixel_values_agnostic / 2 + 0.5
        pixel_values_agnostic *=255
        pixel_values_agnostic = pixel_values_agnostic.astype(np.uint8)
        pixel_values_agnostic= Image.fromarray(pixel_values_agnostic)
        pixel_values_agnostic.save('pixel_values_agnostic2.jpg')

        # Front and back cloth reference images.
        pixel_values_cloth_img = p['pixel_values_ref_front'].permute(1,2,0).numpy()
        print(p['pixel_values_ref_front'].shape)
        pixel_values_cloth_img = pixel_values_cloth_img / 2 + 0.5
        pixel_values_cloth_img *=255
        pixel_values_cloth_img = pixel_values_cloth_img.astype(np.uint8)
        pixel_values_cloth_img= Image.fromarray(pixel_values_cloth_img)
        pixel_values_cloth_img.save('pixel_values_cloth_front.jpg')

        pixel_values_cloth_img = p['pixel_values_ref_back'].permute(1,2,0).numpy()
        print(p['pixel_values_ref_back'].shape)
        pixel_values_cloth_img = pixel_values_cloth_img / 2 + 0.5
        pixel_values_cloth_img *=255
        pixel_values_cloth_img = pixel_values_cloth_img.astype(np.uint8)
        pixel_values_cloth_img= Image.fromarray(pixel_values_cloth_img)
        pixel_values_cloth_img.save('pixel_values_cloth_back.jpg')
        # Stop after the first batch.
        exit()
402
+
403
+
src/multiview_consist_edit/Thuman2_multi.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, csv, math, random
2
+ import numpy as np
3
+ from PIL import Image,ImageDraw
4
+ import json
5
+ import torch
6
+ import torchvision
7
+ import torchvision.transforms as transforms
8
+ from torch.utils.data.dataset import Dataset
9
+ from transformers import CLIPProcessor
10
+ import random
11
+ from torchvision.transforms import functional as F
12
+ import torch.distributed as dist
13
+ import copy
14
+ import cv2
15
+
16
def crop_image(human_img_orig):
    """Resize to 1024x1024, then center-crop the width down to 768 px.

    The full height is kept; only equal left/right margins are trimmed,
    yielding a 768x1024 portrait crop.
    """
    resized = human_img_orig.resize((1024, 1024))
    width, height = resized.size
    margin = (width - 768) // 2
    return resized.crop((margin, 0, width - margin, height))
27
+
28
class Thuman2_Dataset(Dataset):
    """Multi-view garment try-on dataset over THuman2 renders.

    Each item bundles several rendered views of one subject -- RGB image,
    normal map, cloth-agnostic image and flattened camera rotation -- with
    front/back reference photos of a garment, pre-processed both as
    normalized tensors and as CLIP-processor inputs.

    Args:
        dataroot: root folder holding ``all/`` (per-subject renders),
            ``cloth/`` (garment references) and the ``*_ids.txt`` split files.
        sample_size: (height, width) of the returned image tensors.
        is_train: if True, use the train split with random view sampling and
            reference augmentation; otherwise use the test splits with a
            deterministic view selection.
        mode: pairing-mode tag (stored; not otherwise used by this class).
        clip_model_path: local path of the CLIP processor checkpoint.
        multi_length: number of views sampled per item during training.
    """

    def __init__(
        self, dataroot, sample_size=(512,384), is_train=True, mode='pair', clip_model_path='', multi_length=8,
    ):
        c_names_front = []
        c_names_back = []

        self.data_ids = []
        self.dataroot = os.path.join(dataroot, 'all')
        self.cloth_root = os.path.join(dataroot, 'cloth')
        # self.cloth_root = os.path.join(dataroot, 'MVG_clothes')

        self.cloth_ids = []
        if is_train:
            f = open(os.path.join(dataroot,'train_ids.txt'))
            for line in f.readlines():
                self.data_ids.append(line.strip())
            f.close()
        else:
            # At test time every subject is paired with every test garment.
            # f = open(os.path.join(dataroot, 'val_ids.txt'))
            f = open(os.path.join(dataroot, 'test_ids.txt'))
            # f = open(os.path.join(dataroot, 'test_mvg_ids.txt'))
            for line in f.readlines():
                self.data_ids.append(line.strip())
            f.close()
            f2 = open(os.path.join(dataroot, 'test_cloth_ids.txt'))
            # f2 = open(os.path.join(dataroot, 'test_mvg_cloth_ids.txt'))
            for line in f2.readlines():
                self.cloth_ids.append(line.strip())
            f2.close()

        self.mode = mode
        self.is_train = is_train
        self.sample_size = sample_size
        self.multi_length = multi_length
        self.clip_image_processor = CLIPProcessor.from_pretrained(clip_model_path,local_files_only=True)

        # Main branch: upscale to 1024x768, keep the central 6/8 region,
        # resize to sample_size and normalize to [-1, 1].
        self.pixel_transforms = transforms.Compose([
            transforms.Resize((1024,768), interpolation=0),
            transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
            transforms.Resize(self.sample_size, interpolation=0),
            # transforms.CenterCrop(self.sample_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])

        self.pixel_transforms_0 = transforms.Compose([
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.pixel_transforms_1 = transforms.Compose([
            transforms.Resize((1024,768), interpolation=0),
            transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
            transforms.Resize(self.sample_size, interpolation=0),
        ])

        # Garment-reference branch: light geometric jitter only in training.
        self.ref_transforms_train = transforms.Compose([
            transforms.Resize(self.sample_size),
            # RandomScaleResize([1.0,1.1]),
            transforms.CenterCrop(self.sample_size),
            transforms.RandomAffine(degrees=0, translate=(0.08,0.08),scale=(0.9,1.1)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.ref_transforms_test = transforms.Compose([
            transforms.Resize(self.sample_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.color_transform = transforms.ColorJitter(brightness=0.3, contrast=0.2, saturation=0.2, hue=0.0)

    def __len__(self):
        # With a cloth list present (test time), every subject x garment
        # combination is one dataset item.
        if len(self.cloth_ids) >= 1:
            return len(self.data_ids)*len(self.cloth_ids)
        else:
            return len(self.data_ids)

    def __getitem__(self, idx):
        """Resolve subject/garment ids for *idx*, pick views, and load them."""

        if len(self.cloth_ids) >=1:
            data_idx = idx // len(self.cloth_ids)
            cloth_idx = idx % len(self.cloth_ids)

            data_id = self.data_ids[data_idx]
            cloth_id = self.cloth_ids[cloth_idx]
            # NOTE(review): the '*_front.jpg' file is assigned to the "back"
            # reference and vice versa; the sibling MVHumanNet loader marks
            # the same swap as intentional ("actually reversed") -- confirm.
            cloth_name_back = os.path.join(self.cloth_root, '%s_front.jpg' % cloth_id)
            cloth_name_front = os.path.join(self.cloth_root, '%s_back.jpg' % cloth_id)
        else:
            data_id = self.data_ids[idx]
            cloth_name_back = os.path.join(self.cloth_root, '%s_front.jpg' % data_id)
            cloth_name_front = os.path.join(self.cloth_root, '%s_back.jpg' % data_id)

        images_root = os.path.join(self.dataroot, data_id, 'agnostic') # need only val
        images = sorted(os.listdir(images_root))

        # cloth_name_back = '0001_front.jpg'
        # cloth_name_front = '0001_back.jpg'

        if self.is_train:
            select_images = random.sample(images, self.multi_length)

        else:
            # Deterministically pick ~16 views, alternating indices from the
            # two ends of the sorted view list.
            # select_idxs = [0,3,6,9,12, 15,18,21,24,27, 79,76,73,70,67,64]
            L = len(images)
            select_idxs = []
            begin = 0
            sl = 16.0
            if True:
                while begin < L//2:
                    select_idxs.append(int(begin/2))
                    select_idxs.append(int(L-1-begin/2))
                    begin += L/sl
            else:
                begin = L//4
                while begin < L*3//4:
                    select_idxs.append(int(begin))
                    begin += L/2/sl
            # print(sorted(select_idxs))
            # select_idxs = [0,3,6,9,12, 15,18,21,24,27, L-1,L-4,L-7,L-10,L-13,L-16]
            select_images = []
            for select_idx in select_idxs:
                select_images.append(images[select_idx])
            select_images = sorted(select_images)
            # print(select_images)
        # View file names were listed from 'agnostic'; the actual RGB images
        # live under the sibling 'images' folder.
        for i in range(len(select_images)):
            select_images[i] = os.path.join(data_id,'images',select_images[i])
        sample = self.load_images(select_images, cloth_name_front, cloth_name_back)
        return sample

    def color_progress(self, images):
        """Apply one shared color jitter to every view and return the result.

        Fixes the original version, which was missing ``self``, referenced an
        undefined ``color_jitter`` and discarded the adjusted images. A single
        parameter draw is shared across views so they stay photometrically
        consistent.
        """
        fn_idx, b, c, s, h = self.color_transform.get_params(
            self.color_transform.brightness, self.color_transform.contrast,
            self.color_transform.saturation, self.color_transform.hue)
        jittered = []
        for image in images:
            image = F.adjust_contrast(image, c)
            image = F.adjust_brightness(image, b)
            image = F.adjust_saturation(image, s)
            jittered.append(image)
        return jittered

    def load_images(self, select_images, cloth_name_front, cloth_name_back):
        """Load, crop and normalize the selected views plus garment refs.

        Returns a dict of stacked per-view tensors, CLIP inputs for the two
        garment references, per-view camera rotations (3x3, flattened) and
        random CFG-dropout flags (10% chance per view).
        """

        pixel_values_list = []
        pixel_values_pose_list = []
        camera_parm_list = []
        pixel_values_agnostic_list = []
        image_name_list = []

        # load person data
        for img_name in select_images:
            image_name_list.append(img_name)
            pixel_values = Image.open(os.path.join(self.dataroot, img_name))
            pixel_values_pose = Image.open(os.path.join(self.dataroot, img_name).replace('images', 'normals'))
            # parse_lip = Image.open(os.path.join(parse_lip_dir, img_name))
            pixel_values_agnostic = Image.open(os.path.join(self.dataroot, img_name).replace('images', 'agnostic'))
            parm_matrix = np.load(os.path.join(self.dataroot, img_name[:4],'parm', img_name[-7:-4]+'_extrinsic.npy'))
            pixel_values = crop_image(pixel_values)
            pixel_values_pose = crop_image(pixel_values_pose)
            # camera parameter: keep only the rotation block, flattened to 9
            parm_matrix = torch.tensor(parm_matrix)
            camera_parm = parm_matrix[:3,:3].reshape(-1) # todo
            # transform
            pixel_values = self.pixel_transforms(pixel_values)
            pixel_values_pose = self.pixel_transforms(pixel_values_pose)
            pixel_values_agnostic = self.pixel_transforms(pixel_values_agnostic)

            pixel_values_list.append(pixel_values)
            pixel_values_pose_list.append(pixel_values_pose)
            camera_parm_list.append(camera_parm)
            pixel_values_agnostic_list.append(pixel_values_agnostic)

        pixel_values = torch.stack(pixel_values_list)
        pixel_values_pose = torch.stack(pixel_values_pose_list)
        camera_parm = torch.stack(camera_parm_list)
        pixel_values_agnostic = torch.stack(pixel_values_agnostic_list)

        pixel_values_cloth_front = Image.open(os.path.join(self.cloth_root, cloth_name_front))
        pixel_values_cloth_back = Image.open(os.path.join(self.cloth_root, cloth_name_back))

        # clip
        clip_ref_front = self.clip_image_processor(images=pixel_values_cloth_front, return_tensors="pt").pixel_values
        clip_ref_back = self.clip_image_processor(images=pixel_values_cloth_back, return_tensors="pt").pixel_values

        if self.is_train:
            pixel_values_cloth_front = self.ref_transforms_train(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_train(pixel_values_cloth_back)
        else:
            pixel_values_cloth_front = self.ref_transforms_test(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_test(pixel_values_cloth_back)

        # Per-view classifier-free-guidance dropout flags.
        drop_image_embeds = []
        for k in range(len(select_images)):
            if random.random() < 0.1:
                drop_image_embeds.append(torch.tensor(1))
            else:
                drop_image_embeds.append(torch.tensor(0))
        drop_image_embeds = torch.stack(drop_image_embeds)
        sample = dict(
            pixel_values=pixel_values,
            pixel_values_pose=pixel_values_pose,
            pixel_values_agnostic=pixel_values_agnostic,
            clip_ref_front=clip_ref_front,
            clip_ref_back=clip_ref_back,
            pixel_values_cloth_front=pixel_values_cloth_front,
            pixel_values_cloth_back=pixel_values_cloth_back,
            camera_parm=camera_parm,
            drop_image_embeds=drop_image_embeds,
            img_name=image_name_list,
            cloth_name=cloth_name_front,
        )

        return sample
236
+
237
def collate_fn(data):
    """Collate Thuman2_Dataset samples into one batch dictionary.

    Tensor fields gain a leading batch dimension via ``torch.stack``; the
    CLIP crops already carry a leading dim of 1, so they are concatenated
    instead. Per-view image names are flattened into a single list, and the
    cloth-reference tensors are renamed to the ``pixel_values_ref_*`` keys
    the training loop expects.
    """
    def _stack(key):
        return torch.stack([example[key] for example in data])

    img_names, cloth_names = [], []
    for example in data:
        img_names.extend(example["img_name"])
        cloth_names.append(example["cloth_name"])

    return {
        "pixel_values": _stack("pixel_values"),
        "pixel_values_pose": _stack("pixel_values_pose"),
        "pixel_values_agnostic": _stack("pixel_values_agnostic"),
        "clip_ref_front": torch.cat([example["clip_ref_front"] for example in data]),
        "clip_ref_back": torch.cat([example["clip_ref_back"] for example in data]),
        "pixel_values_ref_front": _stack("pixel_values_cloth_front"),
        "pixel_values_ref_back": _stack("pixel_values_cloth_back"),
        "camera_parm": _stack("camera_parm"),
        "drop_image_embeds": _stack("drop_image_embeds"),
        "img_name": img_names,
        "cloth_name": cloth_names,
    }
268
+
269
+
270
if __name__ == '__main__':
    # Smoke test: build the test split, pull one batch through the
    # DataLoader, and dump selected tensors as JPEGs for visual inspection.
    # The eight identical denormalize-and-save snippets from the original
    # are factored into one helper.

    def save_chw_tensor(tensor, path):
        # Undo the [-1, 1] normalization and write a CHW tensor as a JPEG.
        arr = tensor.permute(1, 2, 0).numpy()
        arr = (arr / 2 + 0.5) * 255
        Image.fromarray(arr.astype(np.uint8)).save(path)

    seed = 20
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = Thuman2_Dataset(
        dataroot="/GPUFS/sysu_gbli2_1/hzj/save_render_data_yw/",
        sample_size=(768, 576), is_train=False, mode='pair',
        clip_model_path="/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32")

    test_dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=2,
        num_workers=1,
    )

    for _, batch in enumerate(test_dataloader):
        print('111', batch['camera_parm'].shape)
        print('111', batch['drop_image_embeds'].shape)
        # Keep only the first sample of the batch.
        p = {key: batch[key][0] for key in batch.keys()}

        print(p['camera_parm'].shape)

        # Per-view tensors: dump view 0 and view 2.
        print(p['pixel_values'].shape)
        save_chw_tensor(p['pixel_values'][0], 'pixel_values0.jpg')
        print(p['pixel_values_pose'].shape)
        save_chw_tensor(p['pixel_values_pose'][0], 'pixel_values_pose.jpg')
        print(p['pixel_values_agnostic'].shape)
        save_chw_tensor(p['pixel_values_agnostic'][0], 'pixel_values_agnostic.jpg')

        print(p['pixel_values'].shape)
        save_chw_tensor(p['pixel_values'][2], 'pixel_values2.jpg')
        print(p['pixel_values_pose'].shape)
        save_chw_tensor(p['pixel_values_pose'][2], 'pixel_values_pose2.jpg')
        print(p['pixel_values_agnostic'].shape)
        save_chw_tensor(p['pixel_values_agnostic'][2], 'pixel_values_agnostic2.jpg')

        # Garment references (one image per sample, not per view).
        print(p['pixel_values_ref_front'].shape)
        save_chw_tensor(p['pixel_values_ref_front'], 'pixel_values_cloth_front.jpg')
        print(p['pixel_values_ref_back'].shape)
        save_chw_tensor(p['pixel_values_ref_back'], 'pixel_values_cloth_back.jpg')

        # Only the first batch is needed for the smoke test (the original
        # called exit() here; break exits the process just as cleanly).
        break
src/multiview_consist_edit/config/infer_tryon_multi.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seed: 42
2
+
3
+ model_path: "stable-diffusion-v1-5/stable-diffusion-v1-5"
4
+ vae_path: "stabilityai/sd-vae-ft-mse"
5
+ clip_model_path: 'openai/clip-vit-base-patch32'
6
+
7
+ # unet_path: "/GPUFS/sysu_gbli2_1/hzj/animate/checkpoints/thuman_tryon_mvattn_multi_1205/checkpoint-30000"
8
+ # pretrained_poseguider_path: "/GPUFS/sysu_gbli2_1/hzj/animate/checkpoints/thuman_tryon_mvattn_multi_1205/checkpoint-30000/pose.ckpt"
9
+ # pretrained_referencenet_path: '/GPUFS/sysu_gbli2_1/hzj/animate/checkpoints/thuman_tryon_mvattn_multi_1205/checkpoint-30000'
10
+
11
+ unet_path: "./checkpoints/mvhumannet_tryon_mvattn_multi/checkpoint-40000"
12
+ pretrained_poseguider_path: "./checkpoints/mvhumannet_tryon_mvattn_multi/checkpoint-40000/pose.ckpt"
13
+ pretrained_referencenet_path: './checkpoints/mvhumannet_tryon_mvattn_multi/checkpoint-40000'
14
+
15
+ out_dir: 'image_output_tryon_mvhumannet'
16
+
17
+ batch_size: 2
18
+ dataloader_num_workers: 4
19
+ guidance_scale: 2 # thuman:3 mvhumannet:2
20
+
21
+
22
+ # infer_data:
23
+ # # dataroot: "/GPUFS/sysu_gbli2_1/hzj/render_data"
24
+ # dataroot: "/GPUFS/sysu_gbli2_1/hzj/save_render_data_yw/"
25
+ # # sample_size: [512,384] # for 40G 256
26
+ # sample_size: [768,576]
27
+ # clip_model_path: '/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32'
28
+ # is_train: false
29
+ # mode: 'pair'
30
+ # output_front: true
31
+
32
+ infer_data:
33
+ # dataroot: "/GPUFS/sysu_gbli2_1/hzj/render_data"
34
+ dataroot: "../../demo_data/mvhumannet_2D_edit/"
35
+ # sample_size: [512,384] # for 40G 256
36
+ sample_size: [768,576]
37
+ clip_model_path: 'openai/clip-vit-base-patch32'
38
+ is_train: false
39
+ mode: 'pair'
40
+ output_front: true
41
+
42
+ fusion_blocks: "full"
43
+ image_finetune: true
44
+ num_inference_steps: 30
src/multiview_consist_edit/config/train_tryon_multi.yaml ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ image_finetune: true
2
+ from_scratch: false
3
+
4
+ output_dir: "mvhumannet_tryon_mvattn_multi_1205"
5
+ # output_dir: "mvhumannet_tryon_exp_multi_1028"
6
+ logging_dir: "log"
7
+ # pretrained_model_path: "/data1/hezijian/pretrained_models/stable-diffusion-v1-5"
8
+ # pretrained_vae_path: "/data1/hezijian/pretrained_models/sd-vae-ft-mse"
9
+ # pretrained_clip_path: '/data1/hezijian/pretrained_models/clip-vit-base-patch32'
10
+ # clip_model_path: '/data1/hezijian/pretrained_models/clip-vit-base-patch32'
11
+ pretrained_model_path: "/GPUFS/sysu_gbli2_1/hzj/pretrained_models/stable-diffusion-v1-5"
12
+ pretrained_vae_path: "/GPUFS/sysu_gbli2_1/hzj/pretrained_models/sd-vae-ft-mse"
13
+ pretrained_clip_path: '/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32'
14
+ clip_model_path: '/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32'
15
+ controlnet_model_name_or_path: null
16
+
17
+ # trained stage1 model
18
+ trained_unet_path: "checkpoints/thuman_tryon_exp_1015_two/checkpoint-120000"
19
+ trained_referencenet_path: "checkpoints/thuman_tryon_exp_1015_two/checkpoint-120000"
20
+ trained_pose_guider_path: 'checkpoints/thuman_tryon_exp_1015_two/checkpoint-120000/pose.ckpt'
21
+ # trained_unet_path: "thuman_tryon_exp_1015_two/checkpoint-60000"
22
+ # trained_referencenet_path: "thuman_tryon_exp_1015_two/checkpoint-60000"
23
+ # trained_pose_guider_path: 'thuman_tryon_exp_1015_two/checkpoint-60000/pose.ckpt'
24
+
25
+ unet_additional_kwargs:
26
+ use_motion_module : false
27
+ motion_module_resolutions : [ 1,2,4,8 ]
28
+ unet_use_cross_frame_attention : false
29
+ unet_use_temporal_attention : false
30
+
31
+ motion_module_type: Vanilla
32
+ motion_module_kwargs:
33
+ num_attention_heads : 8
34
+ num_transformer_block : 1
35
+ attention_block_types : [ "Temporal_Self", "Temporal_Self" ]
36
+ temporal_position_encoding : true
37
+ temporal_position_encoding_max_len : 24
38
+ temporal_attention_dim_div : 1
39
+ zero_initialize : true
40
+ encoder_hid_dim: 1280
41
+ encoder_hid_dim_type: 'text_proj'
42
+
43
+ noise_scheduler_kwargs:
44
+ num_train_timesteps: 1000
45
+ beta_start: 0.00085
46
+ beta_end: 0.012
47
+ beta_schedule: "linear"
48
+ steps_offset: 1
49
+ clip_sample: false
50
+
51
+ train_data:
52
+ # dataroot: "/GPUFS/sysu_gbli2_1/hzj/render_data"
53
+ dataroot: "/GPUFS/sysu_gbli2_1/hzj/mvhumannet/"
54
+ # sample_size: [512,384] # for 40G 256
55
+ sample_size: [768,576]
56
+ clip_model_path: '/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32'
57
+ is_train: true
58
+ mode: 'pair'
59
+
60
+ # train_data:
61
+ # # dataroot: "/GPUFS/sysu_gbli2_1/hzj/render_data"
62
+ # dataroot: "/GPUFS/sysu_gbli2_1/hzj/save_render_data_yw/"
63
+ # # sample_size: [512,384] # for 40G 256
64
+ # sample_size: [768,576] # for 40G 256
65
+ # clip_model_path: '/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32'
66
+ # is_train: true
67
+ # mode: 'pair'
68
+
69
+ # train_data:
70
+ # # csv_path: "./data/UBC_train_info_test.csv"
71
+ # csv_path: "./data/TikTok_info.csv"
72
+ # video_folder: "../TikTok_dataset2/TikTok_dataset"
73
+ # sample_size: 512 # for 40G 256
74
+ # sample_stride: 4
75
+ # sample_n_frames: 8
76
+ # clip_model_path: 'pretrained_models/clip-vit-base-patch32'
77
+
78
+ # train_data:
79
+ # # csv_path: "./data/UBC_train_info_test.csv"
80
+ # csv_path: "./data/UBC_train_info.csv"
81
+ # video_folder: "../UBC_dataset"
82
+ # sample_size: 512 # for 40G 256
83
+ # sample_stride: 4
84
+ # sample_n_frames: 8
85
+ # clip_model_path: 'pretrained_models/clip-vit-base-patch32'
86
+
87
+ validation_data:
88
+ prompts:
89
+ - "Snow rocky mountains peaks canyon. Snow blanketed rocky mountains surround and shadow deep canyons."
90
+ - "A drone view of celebration with Christmas tree and fireworks, starry sky - background."
91
+ - "Robot dancing in times square."
92
+ - "Pacific coast, carmel by the sea ocean and waves."
93
+ num_inference_steps: 25
94
+ guidance_scale: 8.
95
+
96
+ trainable_modules:
97
+ # - "motion_modules."
98
+ - "."
99
+ # - "conv_in"
100
+
101
+ fusion_blocks: "full"
102
+
103
+ unet_checkpoint_path: ""
104
+
105
+ scale_lr: false
106
+ adam_beta1: 0.9
107
+ adam_beta2: 0.999
108
+ adam_weight_decay: 1.e-2
109
+ adam_epsilon: 1.e-08
110
+ learning_rate: 2.e-5
111
+ train_batch_size: 1
112
+ gradient_accumulation_steps: 2
113
+ max_grad_norm: 1.0
114
+
115
+ lr_scheduler: 'constant'
116
+ lr_warmup_steps: 0
117
+
118
+ num_train_epochs: 10000
119
+ max_train_steps: null
120
+ checkpointing_steps: 2000
121
+
122
+ validation_steps: 5000
123
+ validation_steps_tuple: [2, 50]
124
+
125
+ seed: 42
126
+ mixed_precision_training: true
127
+ enable_xformers_memory_efficient_attention: True
128
+
129
+ is_debug: False
130
+
131
+ checkpoints_total_limit: 10
132
+ mixed_precision: "fp16"
133
+ report_to: "tensorboard"
134
+ allow_tf32: true
135
+ resume_from_checkpoint: 'latest'
136
+ # resume_from_checkpoint: null
137
+ dataloader_num_workers: 8
src/multiview_consist_edit/data/MVHumanNet_multi.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, csv, math, random
2
+ import numpy as np
3
+ from PIL import Image,ImageDraw
4
+ import json
5
+ import torch
6
+ import torchvision
7
+ import torchvision.transforms as transforms
8
+ from torch.utils.data.dataset import Dataset
9
+ from transformers import CLIPProcessor
10
+ import random
11
+ from torchvision.transforms import functional as F
12
+ import torch.distributed as dist
13
+ import copy
14
+ import cv2
15
+ import pickle
16
+ from .camera_utils import read_camera_mvhumannet
17
+
18
def crop_and_resize(img, bbox, size):
    """Crop *img* to a 2:3 (width:height) box centred on *bbox*, then resize.

    The crop keeps the bbox height and derives the width from the fixed 2/3
    aspect ratio so the subject stays centred in a portrait frame; the crop
    is finally resized to *size*.
    """
    # Centre of the detection box and the target crop extents.
    cx = (bbox[0] + bbox[2]) / 2
    cy = (bbox[1] + bbox[3]) / 2
    crop_h = bbox[3] - bbox[1]
    crop_w = int(crop_h * (2 / 3))

    # Axis-aligned crop window around the centre (may extend past the image;
    # PIL pads such regions).
    window = [
        int(cx - crop_w / 2),
        int(cy - crop_h / 2),
        int(cx + crop_w / 2),
        int(cy + crop_h / 2)
    ]

    return img.crop(window).resize(size)
41
+
42
+
43
+ class MVHumanNet_Dataset(Dataset):
44
+ def __init__(
45
+ self, dataroot, sample_size=(512,384), is_train=True, mode='pair', clip_model_path='', multi_length=8, output_front=True,
46
+ ):
47
+ im_names = []
48
+ self.dataroot = os.path.join(dataroot, 'processed_mvhumannet')
49
+ self.cloth_root = os.path.join(dataroot, 'cloth')
50
+ self.data_ids = []
51
+ self.data_frame_ids = []
52
+ self.cloth_ids = []
53
+ self.cloth_frame_ids = []
54
+ if is_train:
55
+ f = open(os.path.join(dataroot,'train_frame_ids.txt'))
56
+ for line in f.readlines():
57
+ line_info = line.strip().split()
58
+ self.data_ids.append(line_info[0])
59
+ self.data_frame_ids.append(line_info[1])
60
+ f.close()
61
+ else:
62
+ f = open(os.path.join(dataroot, 'test_ids.txt'))
63
+ for line in f.readlines():
64
+ line_info = line.strip().split()
65
+ self.data_ids.append(line_info[0])
66
+ self.data_frame_ids.append(line_info[1])
67
+ f.close()
68
+ f2 = open(os.path.join(dataroot, 'test_cloth_ids.txt'))
69
+ # f2 = open(os.path.join(dataroot, 'test_mvg_cloth_ids.txt'))
70
+ for line in f2.readlines():
71
+ line_info = line.strip().split()
72
+ self.cloth_ids.append(line_info[0])
73
+ self.cloth_frame_ids.append(line_info[1])
74
+ f2.close()
75
+
76
+ self.is_train = is_train
77
+ self.sample_size = sample_size
78
+ self.multi_length = multi_length
79
+ self.clip_image_processor = CLIPProcessor.from_pretrained(clip_model_path,local_files_only=False)
80
+
81
+ self.pixel_transforms = transforms.Compose([
82
+ #transforms.Resize((1024,768), interpolation=0),
83
+ #transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
84
+ transforms.Resize(self.sample_size, interpolation=0),
85
+ # transforms.CenterCrop(self.sample_size),
86
+ transforms.ToTensor(),
87
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
88
+ ])
89
+
90
+ self.pixel_transforms_0 = transforms.Compose([
91
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
92
+ ])
93
+ self.pixel_transforms_1 = transforms.Compose([
94
+ # transforms.Resize((1024,768), interpolation=0),
95
+ # transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
96
+ transforms.Resize(self.sample_size, interpolation=0),
97
+ ])
98
+
99
+ self.ref_transforms_train = transforms.Compose([
100
+ transforms.Resize(self.sample_size),
101
+ # RandomScaleResize([1.0,1.1]),
102
+ # transforms.CenterCrop(self.sample_size),
103
+ transforms.RandomAffine(degrees=0, translate=(0.08,0.08),scale=(0.9,1.1)),
104
+ transforms.ToTensor(),
105
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
106
+ ])
107
+ self.ref_transforms_test = transforms.Compose([
108
+ transforms.Resize(self.sample_size),
109
+ transforms.ToTensor(),
110
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
111
+ ])
112
+ self.output_front = True
113
+
114
+ def __len__(self):
115
+ if len(self.cloth_ids) >= 1:
116
+ return len(self.data_ids)*len(self.cloth_ids)
117
+ else:
118
+ return len(self.data_ids)
119
+
120
+ def __getitem__(self, idx):
121
+
122
+ if len(self.cloth_ids) >=1:
123
+ data_idx = idx // len(self.cloth_ids)
124
+ cloth_idx = idx % len(self.cloth_ids)
125
+
126
+ data_id = self.data_ids[data_idx]
127
+ frame_id = self.data_frame_ids[data_idx]
128
+ cloth_id = self.cloth_ids[cloth_idx]
129
+ cloth_frame_id = self.cloth_frame_ids[cloth_idx]
130
+ cloth_name_front = os.path.join(self.cloth_root, '%s_%s_front.jpg' % (cloth_id, cloth_frame_id)) # 实际是反的
131
+ cloth_name_back = os.path.join(self.cloth_root, '%s_%s_back.jpg' % (cloth_id, cloth_frame_id))
132
+ else:
133
+ data_id = self.data_ids[idx]
134
+ frame_id = self.data_frame_ids[idx]
135
+ cloth_name_front = os.path.join(self.cloth_root, '%s_%s_front.jpg' % (data_id, frame_id)) # 实际是反的
136
+ cloth_name_back = os.path.join(self.cloth_root, '%s_%s_back.jpg' % (data_id, frame_id))
137
+
138
+ # cloth_name_front = os.path.join(self.cloth_root, '%s_%s_front.jpg' % ('100030', '0540'))
139
+ # cloth_name_back = os.path.join(self.cloth_root, '%s_%s_back.jpg' % ('100030', '0540'))
140
+
141
+ images_root = os.path.join(self.dataroot, data_id, 'agnostic', frame_id)
142
+ images = sorted(os.listdir(images_root))
143
+
144
+ if self.is_train:
145
+ check_images = []
146
+ for image in images:
147
+ if 'CC32871A015' not in image:
148
+ check_images.append(image)
149
+ select_images = random.sample(check_images, self.multi_length)
150
+
151
+ else:
152
+ # front
153
+ front_cameras = [
154
+ 'CC32871A005','CC32871A016','CC32871A017','CC32871A023','CC32871A027',
155
+ 'CC32871A030','CC32871A032','CC32871A033','CC32871A034','CC32871A035',
156
+ 'CC32871A038','CC32871A050','CC32871A051','CC32871A052','CC32871A059', 'CC32871A060'
157
+ ]
158
+ back_cameras = [
159
+ 'CC32871A004','CC32871A010', 'CC32871A013', 'CC32871A022', 'CC32871A029',
160
+ 'CC32871A031','CC32871A037', 'CC32871A039', 'CC32871A040', 'CC32871A044',
161
+ 'CC32871A046','CC32871A048', 'CC32871A055', 'CC32871A057', 'CC32871A058', 'CC32871A041'
162
+ ]
163
+ select_images = []
164
+ for image in images:
165
+ camera_id = image.split('_')[0]
166
+ if camera_id in front_cameras and self.output_front:
167
+ select_images.append(image)
168
+ if camera_id in back_cameras and not self.output_front:
169
+ select_images.append(image)
170
+ select_images = sorted(select_images)
171
+ # print(select_images)
172
+ for i in range(len(select_images)):
173
+ select_images[i] = os.path.join(data_id,'resized_img', frame_id, select_images[i])
174
+ sample = self.load_images(select_images, data_id, cloth_name_front, cloth_name_back)
175
+ return sample
176
+
177
+ def load_images(self, select_images, data_id, cloth_name_front, cloth_name_back):
178
+
179
+ pixel_values_list = []
180
+ pixel_values_pose_list = []
181
+ camera_parm_list = []
182
+ pixel_values_agnostic_list = []
183
+ image_name_list = []
184
+
185
+ # load camera info
186
+ intri_name = os.path.join(self.dataroot, data_id, 'camera_intrinsics.json')
187
+ extri_name = os.path.join(self.dataroot, data_id, 'camera_extrinsics.json')
188
+ camera_scale_fn = os.path.join(self.dataroot, data_id, 'camera_scale.pkl')
189
+ camera_scale = pickle.load(open(camera_scale_fn, "rb"))
190
+ cameras_gt = read_camera_mvhumannet(intri_name, extri_name, camera_scale)
191
+
192
+ # load person data
193
+ for img_name in select_images:
194
+ camera_id = img_name.split('\\')[-1].split('_')[0]
195
+
196
+ # load data
197
+ image_name_list.append(img_name)
198
+ pixel_values = Image.open(os.path.join(self.dataroot, img_name))
199
+ pixel_values_pose = Image.open(os.path.join(self.dataroot, img_name).replace('resized_img', 'normals').replace('.jpg','_normal.jpg'))
200
+ pixel_values_agnostic = Image.open(os.path.join(self.dataroot, img_name).replace('resized_img', 'agnostic'))
201
+ parm_matrix = cameras_gt[camera_id]['RT'] # extrinsic
202
+
203
+ # crop pose
204
+ annot_path = os.path.join(self.dataroot, img_name.replace('resized_img', 'annots').replace('.jpg','.json'))
205
+ annot_info = json.load(open(annot_path))
206
+ bbox = annot_info['annots'][0]['bbox']
207
+ width = annot_info['width']
208
+ if width == 4096 or width == 2448:
209
+ for i in range(4):
210
+ bbox[i] = bbox[i] // 2
211
+ elif width == 2048:
212
+ pass
213
+ else:
214
+ print('wrong annot size',img_path)
215
+ pixel_values_pose = crop_and_resize(pixel_values_pose, bbox, size=self.sample_size)
216
+
217
+ # camera parameter
218
+ parm_matrix = torch.tensor(parm_matrix)
219
+ camera_parm = parm_matrix[:3,:3].reshape(-1) # todo
220
+
221
+ # transform
222
+ pixel_values = self.pixel_transforms(pixel_values)
223
+ pixel_values_pose = self.pixel_transforms(pixel_values_pose)
224
+ pixel_values_agnostic = self.pixel_transforms(pixel_values_agnostic)
225
+
226
+ pixel_values_list.append(pixel_values)
227
+ pixel_values_pose_list.append(pixel_values_pose)
228
+ camera_parm_list.append(camera_parm)
229
+ pixel_values_agnostic_list.append(pixel_values_agnostic)
230
+
231
+ pixel_values = torch.stack(pixel_values_list)
232
+ pixel_values_pose = torch.stack(pixel_values_pose_list)
233
+ camera_parm = torch.stack(camera_parm_list)
234
+ pixel_values_agnostic = torch.stack(pixel_values_agnostic_list)
235
+
236
+ pixel_values_cloth_front = Image.open(cloth_name_front)
237
+ pixel_values_cloth_back = Image.open(cloth_name_back)
238
+
239
+ # clip
240
+ clip_ref_front = self.clip_image_processor(images=pixel_values_cloth_front, return_tensors="pt").pixel_values
241
+ clip_ref_back = self.clip_image_processor(images=pixel_values_cloth_back, return_tensors="pt").pixel_values
242
+
243
+ if self.is_train:
244
+ pixel_values_cloth_front = self.ref_transforms_train(pixel_values_cloth_front)
245
+ pixel_values_cloth_back = self.ref_transforms_train(pixel_values_cloth_back)
246
+ else:
247
+ pixel_values_cloth_front = self.ref_transforms_test(pixel_values_cloth_front)
248
+ pixel_values_cloth_back = self.ref_transforms_test(pixel_values_cloth_back)
249
+
250
+ drop_image_embeds = []
251
+ for k in range(len(select_images)):
252
+ if random.random() < 0.1:
253
+ drop_image_embeds.append(torch.tensor(1))
254
+ else:
255
+ drop_image_embeds.append(torch.tensor(0))
256
+ drop_image_embeds = torch.stack(drop_image_embeds)
257
+ sample = dict(
258
+ pixel_values=pixel_values,
259
+ pixel_values_pose=pixel_values_pose,
260
+ pixel_values_agnostic=pixel_values_agnostic,
261
+ clip_ref_front=clip_ref_front,
262
+ clip_ref_back=clip_ref_back,
263
+ pixel_values_cloth_front=pixel_values_cloth_front,
264
+ pixel_values_cloth_back=pixel_values_cloth_back,
265
+ camera_parm=camera_parm,
266
+ drop_image_embeds=drop_image_embeds,
267
+ img_name=image_name_list,
268
+ cloth_name=cloth_name_front,
269
+ )
270
+
271
+ return sample
272
+
273
def collate_fn(data):
    """Merge a list of dataset samples into one batch dictionary.

    Per-view tensor fields gain a new leading batch dimension via
    ``torch.stack``; the CLIP tensors are concatenated because they already
    carry a leading dim of 1.  ``img_name`` entries are flattened into a
    single list, ``cloth_name`` is collected one-per-sample.
    """
    def _stack(key):
        return torch.stack([sample[key] for sample in data])

    def _cat(key):
        return torch.cat([sample[key] for sample in data])

    img_name = []
    cloth_name = []
    for sample in data:
        img_name.extend(sample['img_name'])
        cloth_name.append(sample['cloth_name'])

    return {
        "pixel_values": _stack("pixel_values"),
        "pixel_values_pose": _stack("pixel_values_pose"),
        "pixel_values_agnostic": _stack("pixel_values_agnostic"),
        "clip_ref_front": _cat("clip_ref_front"),
        "clip_ref_back": _cat("clip_ref_back"),
        "pixel_values_ref_front": _stack("pixel_values_cloth_front"),
        "pixel_values_ref_back": _stack("pixel_values_cloth_back"),
        "camera_parm": _stack("camera_parm"),
        "drop_image_embeds": _stack("drop_image_embeds"),
        "img_name": img_name,
        "cloth_name": cloth_name,
    }
304
+
305
+
306
if __name__ == '__main__':
    # Smoke test: iterate one batch and dump every tensor field as a JPEG.
    seed = 20
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = MVHumanNet_Dataset(dataroot="/GPUFS/sysu_gbli2_1/hzj/mvhumannet/",
                                 sample_size=(768,576), is_train=True, mode='pair',
                                 clip_model_path="/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32")

    test_dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=1,
        num_workers=2,
    )

    def denorm_and_save(tensor, out_path):
        """Map a [-1, 1] CHW tensor back to uint8 HWC and write a JPEG."""
        arr = tensor.permute(1, 2, 0).numpy()
        arr = (arr / 2 + 0.5) * 255
        Image.fromarray(arr.astype(np.uint8)).save(out_path)

    for _, batch in enumerate(test_dataloader):
        print('111', batch['camera_parm'].shape)
        print('111', batch['drop_image_embeds'].shape)
        # Inspect the first sample of the batch.
        p = {key: value[0] for key, value in batch.items()}

        print(p['camera_parm'].shape)

        # (field, view index or None for per-sample fields, output file)
        for key, view_idx, out_name in [
            ('pixel_values', 0, 'pixel_values0.jpg'),
            ('pixel_values_pose', 0, 'pixel_values_pose.jpg'),
            ('pixel_values_agnostic', 0, 'pixel_values_agnostic.jpg'),
            ('pixel_values', 2, 'pixel_values2.jpg'),
            ('pixel_values_pose', 2, 'pixel_values_pose2.jpg'),
            ('pixel_values_agnostic', 2, 'pixel_values_agnostic2.jpg'),
            ('pixel_values_ref_front', None, 'pixel_values_cloth_front.jpg'),
            ('pixel_values_ref_back', None, 'pixel_values_cloth_back.jpg'),
        ]:
            print(p[key].shape)
            tensor = p[key] if view_idx is None else p[key][view_idx]
            denorm_and_save(tensor, out_name)
        exit()
405
+
406
+
src/multiview_consist_edit/data/Thuman2_multi.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, csv, math, random
2
+ import numpy as np
3
+ from PIL import Image,ImageDraw
4
+ import json
5
+ import torch
6
+ import torchvision
7
+ import torchvision.transforms as transforms
8
+ from torch.utils.data.dataset import Dataset
9
+ from transformers import CLIPProcessor
10
+ import random
11
+ from torchvision.transforms import functional as F
12
+ import torch.distributed as dist
13
+ import copy
14
+ import cv2
15
+
16
def crop_image(human_img_orig, target_width=768, resize_to=(1024, 1024)):
    """Resize an image and symmetrically crop its width.

    The image is first resized to ``resize_to`` (default 1024x1024), then
    cropped to ``target_width`` pixels centered horizontally while keeping
    the full height.  With the defaults this reproduces the original
    hard-coded 1024x768 behavior.

    Args:
        human_img_orig: input PIL image.
        target_width: width in pixels of the returned image.
        resize_to: (width, height) the image is resized to before cropping.

    Returns:
        The cropped PIL image.
    """
    human_img_orig = human_img_orig.resize(resize_to)
    original_width, original_height = human_img_orig.size
    crop_amount = (original_width - target_width) // 2
    box = (crop_amount, 0, original_width - crop_amount, original_height)
    return human_img_orig.crop(box)
27
+
28
class Thuman2_Dataset(Dataset):
    """Multi-view try-on dataset rendered from THuman2 scans.

    Each sample bundles several camera views of one subject (``multi_length``
    random views at train time, a deterministic fan of views at test time)
    with front/back garment references, per-view normal maps, cloth-agnostic
    person images and camera extrinsics.
    """

    def __init__(
        self, dataroot, sample_size=(512,384), is_train=True, mode='pair', clip_model_path='', multi_length=8, output_front=True,
    ):
        """
        Args:
            dataroot: root directory containing ``all/``, ``cloth/`` and the
                id list text files.
            sample_size: (H, W) of the produced person tensors.
            is_train: selects id lists, view sampling and reference augmentation.
            mode: kept for API compatibility (stored, not used here).
            clip_model_path: local path of the CLIP processor.
            multi_length: number of random views per training sample.
            output_front: at test time, select front-facing views (True) or
                back-facing views (False).
        """
        self.data_ids = []
        self.dataroot = os.path.join(dataroot, 'all')
        self.cloth_root = os.path.join(dataroot, 'cloth')
        # self.cloth_root = os.path.join(dataroot, 'MVG_clothes')

        self.cloth_ids = []
        if is_train:
            with open(os.path.join(dataroot, 'train_ids.txt')) as f:
                for line in f.readlines():
                    self.data_ids.append(line.strip())
        else:
            # f = open(os.path.join(dataroot, 'val_ids.txt'))
            # f = open(os.path.join(dataroot, 'test_mvg_ids.txt'))
            with open(os.path.join(dataroot, 'test_ids.txt')) as f:
                for line in f.readlines():
                    self.data_ids.append(line.strip())
            # f2 = open(os.path.join(dataroot, 'test_mvg_cloth_ids.txt'))
            with open(os.path.join(dataroot, 'test_cloth_ids.txt')) as f2:
                for line in f2.readlines():
                    self.cloth_ids.append(line.strip())

        self.mode = mode
        self.is_train = is_train
        self.sample_size = sample_size
        self.multi_length = multi_length
        self.clip_image_processor = CLIPProcessor.from_pretrained(clip_model_path, local_files_only=True)

        # Person-side pipeline: resize to 1024x768, center-crop 6/8 of it,
        # resize to sample_size and normalize to [-1, 1].
        self.pixel_transforms = transforms.Compose([
            transforms.Resize((1024,768), interpolation=0),
            transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
            transforms.Resize(self.sample_size, interpolation=0),
            # transforms.CenterCrop(self.sample_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])

        # Split variants of the pipeline (normalize-only / geometry-only).
        self.pixel_transforms_0 = transforms.Compose([
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.pixel_transforms_1 = transforms.Compose([
            transforms.Resize((1024,768), interpolation=0),
            transforms.CenterCrop((int(1024 * 6/8), int(768 * 6/8))),
            transforms.Resize(self.sample_size, interpolation=0),
        ])

        # Garment reference pipelines: mild geometric jitter at train time.
        self.ref_transforms_train = transforms.Compose([
            transforms.Resize(self.sample_size),
            # RandomScaleResize([1.0,1.1]),
            transforms.CenterCrop(self.sample_size),
            transforms.RandomAffine(degrees=0, translate=(0.08,0.08), scale=(0.9,1.1)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.ref_transforms_test = transforms.Compose([
            transforms.Resize(self.sample_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        ])
        self.color_transform = transforms.ColorJitter(brightness=0.3, contrast=0.2, saturation=0.2, hue=0.0)
        # BUGFIX: the constructor argument was silently ignored
        # (hard-coded ``self.output_front = True``).
        self.output_front = output_front

    def __len__(self):
        # In test mode every subject is paired with every test garment.
        if len(self.cloth_ids) >= 1:
            return len(self.data_ids) * len(self.cloth_ids)
        else:
            return len(self.data_ids)

    def __getitem__(self, idx):
        """Return one multi-view sample (see ``load_images`` for the fields)."""
        if len(self.cloth_ids) >= 1:
            # Test mode: decode (subject, garment) from the flat index.
            data_idx = idx // len(self.cloth_ids)
            cloth_idx = idx % len(self.cloth_ids)

            data_id = self.data_ids[data_idx]
            cloth_id = self.cloth_ids[cloth_idx]
            # NOTE(review): the '*_front.jpg' file feeds cloth_name_back and
            # vice versa -- looks intentional (render orientation), verify.
            cloth_name_back = os.path.join(self.cloth_root, '%s_front.jpg' % cloth_id)
            cloth_name_front = os.path.join(self.cloth_root, '%s_back.jpg' % cloth_id)
        else:
            data_id = self.data_ids[idx]
            cloth_name_back = os.path.join(self.cloth_root, '%s_front.jpg' % data_id)
            cloth_name_front = os.path.join(self.cloth_root, '%s_back.jpg' % data_id)

        images_root = os.path.join(self.dataroot, data_id, 'agnostic')  # need only val
        images = sorted(os.listdir(images_root))

        # cloth_name_back = '0001_front.jpg'
        # cloth_name_front = '0001_back.jpg'

        if self.is_train:
            select_images = random.sample(images, self.multi_length)
        else:
            # Deterministically fan out ~16 views over the camera ring,
            # either around the front (interleaving both ends of the sorted
            # list) or around the back (middle of the list).
            L = len(images)
            select_idxs = []
            begin = 0
            sl = 16.0
            if self.output_front:
                while begin < L//2:
                    select_idxs.append(int(begin/2))
                    select_idxs.append(int(L-1-begin/2))
                    begin += L/sl
            else:
                begin = L//4
                while begin < L*3//4:
                    select_idxs.append(int(begin))
                    begin += L/2/sl
            # print(sorted(select_idxs))
            select_images = []
            for select_idx in select_idxs:
                select_images.append(images[select_idx])
            select_images = sorted(select_images)
        # print(select_images)
        for i in range(len(select_images)):
            select_images[i] = os.path.join(data_id, 'images', select_images[i])
        sample = self.load_images(select_images, cloth_name_front, cloth_name_back)
        return sample

    def color_progress(self, images):
        """Apply one shared random color jitter to all images of a sample,
        keeping the views photometrically consistent with each other.

        BUGFIX: the original had no ``self`` parameter, referenced the
        undefined name ``color_jitter`` and discarded the adjusted images,
        returning the inputs unchanged.
        """
        jitter = self.color_transform
        _, b, c, s, _ = jitter.get_params(jitter.brightness, jitter.contrast, jitter.saturation, jitter.hue)
        out = []
        for image in images:
            image = F.adjust_contrast(image, c)
            image = F.adjust_brightness(image, b)
            image = F.adjust_saturation(image, s)
            out.append(image)
        return out

    def load_images(self, select_images, cloth_name_front, cloth_name_back):
        """Load the per-view tensors and garment references for one sample.

        Args:
            select_images: image paths relative to ``self.dataroot`` of the
                selected views ('<id>/images/<view>.jpg').
            cloth_name_front / cloth_name_back: full paths of the garment
                reference images (already joined in ``__getitem__``).
        """
        pixel_values_list = []
        pixel_values_pose_list = []
        camera_parm_list = []
        pixel_values_agnostic_list = []
        image_name_list = []

        # Load per-view person data.
        for img_name in select_images:
            image_name_list.append(img_name)
            pixel_values = Image.open(os.path.join(self.dataroot, img_name))
            pixel_values_pose = Image.open(os.path.join(self.dataroot, img_name).replace('images', 'normals'))
            # parse_lip = Image.open(os.path.join(parse_lip_dir, img_name))
            pixel_values_agnostic = Image.open(os.path.join(self.dataroot, img_name).replace('images', 'agnostic'))
            # img_name[:4] is the subject id, img_name[-7:-4] the view index.
            parm_matrix = np.load(os.path.join(self.dataroot, img_name[:4], 'parm', img_name[-7:-4] + '_extrinsic.npy'))
            pixel_values = crop_image(pixel_values)
            pixel_values_pose = crop_image(pixel_values_pose)

            # Camera parameter: flattened 3x3 rotation of the extrinsic.
            parm_matrix = torch.tensor(parm_matrix)
            camera_parm = parm_matrix[:3,:3].reshape(-1)  # todo

            # Normalize to [-1, 1] tensors at sample_size.
            pixel_values = self.pixel_transforms(pixel_values)
            pixel_values_pose = self.pixel_transforms(pixel_values_pose)
            pixel_values_agnostic = self.pixel_transforms(pixel_values_agnostic)

            pixel_values_list.append(pixel_values)
            pixel_values_pose_list.append(pixel_values_pose)
            camera_parm_list.append(camera_parm)
            pixel_values_agnostic_list.append(pixel_values_agnostic)

        pixel_values = torch.stack(pixel_values_list)
        pixel_values_pose = torch.stack(pixel_values_pose_list)
        camera_parm = torch.stack(camera_parm_list)
        pixel_values_agnostic = torch.stack(pixel_values_agnostic_list)

        # BUGFIX: cloth_name_* already contain cloth_root (joined in
        # __getitem__).  The original joined them onto cloth_root *again*,
        # which only worked because os.path.join discards the first argument
        # when the second is absolute.
        pixel_values_cloth_front = Image.open(cloth_name_front)
        pixel_values_cloth_back = Image.open(cloth_name_back)

        # CLIP preprocessing of the garment references.
        clip_ref_front = self.clip_image_processor(images=pixel_values_cloth_front, return_tensors="pt").pixel_values
        clip_ref_back = self.clip_image_processor(images=pixel_values_cloth_back, return_tensors="pt").pixel_values

        if self.is_train:
            pixel_values_cloth_front = self.ref_transforms_train(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_train(pixel_values_cloth_back)
        else:
            pixel_values_cloth_front = self.ref_transforms_test(pixel_values_cloth_front)
            pixel_values_cloth_back = self.ref_transforms_test(pixel_values_cloth_back)

        # Randomly drop image conditioning per view with 10% probability
        # (presumably for classifier-free guidance -- TODO confirm with trainer).
        drop_image_embeds = []
        for k in range(len(select_images)):
            if random.random() < 0.1:
                drop_image_embeds.append(torch.tensor(1))
            else:
                drop_image_embeds.append(torch.tensor(0))
        drop_image_embeds = torch.stack(drop_image_embeds)

        sample = dict(
            pixel_values=pixel_values,
            pixel_values_pose=pixel_values_pose,
            pixel_values_agnostic=pixel_values_agnostic,
            clip_ref_front=clip_ref_front,
            clip_ref_back=clip_ref_back,
            pixel_values_cloth_front=pixel_values_cloth_front,
            pixel_values_cloth_back=pixel_values_cloth_back,
            camera_parm=camera_parm,
            drop_image_embeds=drop_image_embeds,
            img_name=image_name_list,
            cloth_name=cloth_name_front,
        )

        return sample
237
+
238
def collate_fn(data):
    """Collate a list of Thuman2 samples into one batch dictionary.

    Tensor fields are stacked along a new batch dimension (CLIP tensors are
    concatenated since they already have a leading dim); name fields become
    flat Python lists.
    """
    def _stack(key):
        return torch.stack([sample[key] for sample in data])

    def _cat(key):
        return torch.cat([sample[key] for sample in data])

    img_name = []
    cloth_name = []
    for sample in data:
        img_name.extend(sample['img_name'])
        cloth_name.append(sample['cloth_name'])

    return {
        "pixel_values": _stack("pixel_values"),
        "pixel_values_pose": _stack("pixel_values_pose"),
        "pixel_values_agnostic": _stack("pixel_values_agnostic"),
        "clip_ref_front": _cat("clip_ref_front"),
        "clip_ref_back": _cat("clip_ref_back"),
        "pixel_values_ref_front": _stack("pixel_values_cloth_front"),
        "pixel_values_ref_back": _stack("pixel_values_cloth_back"),
        "camera_parm": _stack("camera_parm"),
        "drop_image_embeds": _stack("drop_image_embeds"),
        "img_name": img_name,
        "cloth_name": cloth_name,
    }
269
+
270
+
271
if __name__ == '__main__':
    # Smoke test: iterate one batch and dump every tensor field as a JPEG.
    seed = 20
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    dataset = Thuman2_Dataset(dataroot="/GPUFS/sysu_gbli2_1/hzj/save_render_data_yw/",
                              sample_size=(768,576), is_train=False, mode='pair',
                              clip_model_path="/GPUFS/sysu_gbli2_1/hzj/pretrained_models/clip-vit-base-patch32")

    test_dataloader = torch.utils.data.DataLoader(
        dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=2,
        num_workers=1,
    )

    def denorm_and_save(tensor, out_path):
        """Map a [-1, 1] CHW tensor back to uint8 HWC and write a JPEG."""
        arr = tensor.permute(1, 2, 0).numpy()
        arr = (arr / 2 + 0.5) * 255
        Image.fromarray(arr.astype(np.uint8)).save(out_path)

    for _, batch in enumerate(test_dataloader):
        print('111', batch['camera_parm'].shape)
        print('111', batch['drop_image_embeds'].shape)
        # Inspect the first sample of the batch.
        p = {key: value[0] for key, value in batch.items()}

        print(p['camera_parm'].shape)

        # (field, view index or None for per-sample fields, output file)
        for key, view_idx, out_name in [
            ('pixel_values', 0, 'pixel_values0.jpg'),
            ('pixel_values_pose', 0, 'pixel_values_pose.jpg'),
            ('pixel_values_agnostic', 0, 'pixel_values_agnostic.jpg'),
            ('pixel_values', 2, 'pixel_values2.jpg'),
            ('pixel_values_pose', 2, 'pixel_values_pose2.jpg'),
            ('pixel_values_agnostic', 2, 'pixel_values_agnostic2.jpg'),
            ('pixel_values_ref_front', None, 'pixel_values_cloth_front.jpg'),
            ('pixel_values_ref_back', None, 'pixel_values_cloth_back.jpg'),
        ]:
            print(p[key].shape)
            tensor = p[key] if view_idx is None else p[key][view_idx]
            denorm_and_save(tensor, out_name)
        exit()
src/multiview_consist_edit/data/camera_utils.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import os
4
+ from os.path import join
5
class FileStorage(object):
    """Reader/writer for OpenCV-style YAML camera calibration files.

    In write mode the YAML is emitted by hand through a plain text file
    (matching OpenCV's ``%YAML:1.0`` header and CRLF line endings); in read
    mode the file is parsed with ``cv2.FileStorage``.
    """

    def __init__(self, filename, isWrite=False):
        """Open *filename* for reading (default) or writing."""
        version = cv2.__version__
        self.major_version = int(version.split('.')[0])
        self.second_version = int(version.split('.')[1])

        if isWrite:
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            self.fs = open(filename, 'w')
            self.fs.write('%YAML:1.0\r\n')
            self.fs.write('---\r\n')
        else:
            assert os.path.exists(filename), filename
            self.fs = cv2.FileStorage(filename, cv2.FILE_STORAGE_READ)
        self.isWrite = isWrite

    def __del__(self):
        # Release whichever backend handle this instance owns.
        if self.isWrite:
            self.fs.close()
        else:
            cv2.FileStorage.release(self.fs)

    def _write(self, out):
        # CRLF endings to match OpenCV's own YAML output.
        self.fs.write(out + '\r\n')

    def write(self, key, value, dt='mat'):
        """Write *value* under *key* as an OpenCV matrix ('mat'),
        a string list ('list') or a plain scalar ('int')."""
        if dt == 'mat':
            self._write('{}: !!opencv-matrix'.format(key))
            self._write('  rows: {}'.format(value.shape[0]))
            self._write('  cols: {}'.format(value.shape[1]))
            self._write('  dt: d')
            self._write('  data: [{}]'.format(', '.join(['{:.6f}'.format(i) for i in value.reshape(-1)])))
        elif dt == 'list':
            self._write('{}:'.format(key))
            for elem in value:
                self._write('  - "{}"'.format(elem))
        elif dt == 'int':
            self._write('{}: {}'.format(key, value))

    def read(self, key, dt='mat'):
        """Read *key* back as a matrix, string list or int.

        For 'list' entries, purely numeric values are converted to their
        string form and 'none' placeholders are dropped.
        """
        if dt == 'mat':
            output = self.fs.getNode(key).mat()
        elif dt == 'list':
            results = []
            n = self.fs.getNode(key)
            for i in range(n.size()):
                val = n.at(i).string()
                if val == '':
                    val = str(int(n.at(i).real()))
                if val != 'none':
                    results.append(val)
            output = results
        elif dt == 'int':
            output = int(self.fs.getNode(key).real())
        else:
            raise NotImplementedError
        return output

    def close(self):
        # BUGFIX: the original called ``self.__del__(self)``, passing `self`
        # twice and raising TypeError whenever close() was used.
        self.__del__()
66
def read_intri(intri_name):
    """Read per-camera intrinsics (K, its inverse and distortion) from a
    YAML calibration file; returns a dict keyed by camera name."""
    assert os.path.exists(intri_name), intri_name
    storage = FileStorage(intri_name)
    cameras = {}
    for name in storage.read('names', dt='list'):
        K = storage.read('K_{}'.format(name))
        cameras[name] = {
            'K': K,
            'invK': np.linalg.inv(K),
            'dist': storage.read('dist_{}'.format(name)),
        }
    return cameras
78
+
79
def write_intri(intri_name, cameras):
    """Write per-camera intrinsics (K and distortion) to a YAML file.

    Camera keys may carry a file extension; only the stem is used.
    """
    out_dir = os.path.dirname(intri_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    intri = FileStorage(intri_name, True)
    intri.write('names', list(cameras.keys()), 'list')
    for key_, val in cameras.items():
        key = key_.split('.')[0]
        K, dist = val['K'], val['dist']
        assert K.shape == (3, 3), K.shape
        assert dist.shape in ((1, 5), (5, 1), (1, 4), (4, 1)), dist.shape
        intri.write('K_{}'.format(key), K)
        # Distortion is stored as a single row vector.
        intri.write('dist_{}'.format(key), dist.flatten()[None])
93
+
94
def write_extri(extri_name, cameras):
    """Write per-camera extrinsics (Rodrigues vector, rotation matrix and
    translation) to a YAML file; returns 0 for API compatibility."""
    out_dir = os.path.dirname(extri_name)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    extri = FileStorage(extri_name, True)
    extri.write('names', list(cameras.keys()), 'list')
    for key_, val in cameras.items():
        key = key_.split('.')[0]
        extri.write('R_{}'.format(key), val['Rvec'])
        extri.write('Rot_{}'.format(key), val['R'])
        extri.write('T_{}'.format(key), val['T'])
    return 0
107
+
108
def read_camera(intri_name, extri_name, cam_names=[]):
    """Read intrinsics and extrinsics from a pair of YAML calibration files.

    Returns a dict keyed by camera name with K/invK/H/W/R/Rvec/T/RT/P/center
    and distortion entries, plus a 'basenames' list of all camera names.
    The ``cam_names`` argument is overwritten from the intrinsics file, so
    its mutable default is harmless.
    """
    assert os.path.exists(intri_name), intri_name
    assert os.path.exists(extri_name), extri_name

    intri = FileStorage(intri_name)
    extri = FileStorage(extri_name)
    cams, P = {}, {}
    cam_names = intri.read('names', dt='list')
    for cam in cam_names:
        # Intrinsics: only the sub-stream entries are read.
        cams[cam] = {}
        cams[cam]['K'] = intri.read('K_{}'.format( cam))
        cams[cam]['invK'] = np.linalg.inv(cams[cam]['K'])
        H = intri.read('H_{}'.format(cam), dt='int')
        W = intri.read('W_{}'.format(cam), dt='int')
        if H is None or W is None:
            print('[camera] no H or W for {}'.format(cam))
            H, W = -1, -1
        cams[cam]['H'] = H
        cams[cam]['W'] = W
        Rvec = extri.read('R_{}'.format(cam))
        Tvec = extri.read('T_{}'.format(cam))
        assert Rvec is not None, cam
        R = cv2.Rodrigues(Rvec)[0]
        RT = np.hstack((R, Tvec))

        cams[cam]['RT'] = RT
        cams[cam]['R'] = R
        cams[cam]['Rvec'] = Rvec
        cams[cam]['T'] = Tvec
        # BUGFIX: the camera center is -R^T @ T using the 3x3 rotation
        # matrix.  The original used the Rodrigues *vector* (Rvec.T @ Tvec),
        # which yields a meaningless 1x1 value.
        cams[cam]['center'] = - R.T @ Tvec
        P[cam] = cams[cam]['K'] @ cams[cam]['RT']
        cams[cam]['P'] = P[cam]

        cams[cam]['dist'] = intri.read('dist_{}'.format(cam))
        if cams[cam]['dist'] is None:
            # Some files store distortion under 'D_<cam>' instead.
            cams[cam]['dist'] = intri.read('D_{}'.format(cam))
            if cams[cam]['dist'] is None:
                print('[camera] no dist for {}'.format(cam))
    cams['basenames'] = cam_names
    return cams
149
+
150
+
151
+
152
+
153
def read_camera_mvhumannet(intri_name, extri_name, camera_scale ,cam_names=[]):
    """Load MVHumanNet camera calibration from JSON files.

    Reads one shared intrinsic matrix and per-camera extrinsics, returning a
    dict keyed by camera id with K/invK/R/T/RT/P/dist entries.

    Args:
        intri_name: path to ``camera_intrinsics.json``.
        extri_name: path to ``camera_extrinsics.json``.
        camera_scale: scale factor applied to translations after the
            millimetre-to-metre conversion (per-capture, loaded from
            ``camera_scale.pkl`` by the caller).
        cam_names: unused placeholder -- it is overwritten below from the
            extrinsics keys, so the mutable default is harmless.
    """
    assert os.path.exists(intri_name), intri_name
    assert os.path.exists(extri_name), extri_name

    import json

    with open(intri_name, 'r') as f:
        camera_intrinsics = json.load(f)

    with open(extri_name, 'r') as f:
        camera_extrinsics = json.load(f)

    # print("intri: ", camera_intrinsics)

    # NOTE(review): `item` is computed but never used.
    item = os.path.dirname(intri_name).split("/")[-1]
    # print("item: ", item)
    # intri = FileStorage(intri_name)
    # extri = FileStorage(extri_name)
    cams, P = {}, {}

    # cam_names = intri.read('names', dt='list')

    cam_names = camera_extrinsics.keys()

    for cam in cam_names:
        # Intrinsics: only the sub-stream entries are read.

        # Extrinsics keys look like '<prefix>_<camera id>.<ext>'; keep the
        # trailing '_'-separated part of the stem as the camera id.
        updated_cam = cam.split('.')[0].split('_')
        # print("updated_cam_before: ", updated_cam)
        # updated_cam[1] = 'cache' # for test
        updated_cam = updated_cam[-1]
        # print("updated_cam_after: ", updated_cam)

        cams[updated_cam] = {}
        # All cameras share the single intrinsic matrix in this JSON layout.
        # cams[updated_cam]['K'] = intri.read('K_{}'.format( cam))
        cams[updated_cam]['K'] = np.array(camera_intrinsics['intrinsics'])
        cams[updated_cam]['invK'] = np.linalg.inv(cams[updated_cam]['K'])

        # import IPython; IPython.embed(); exit()

        # Rvec = extri.read('R_{}'.format(cam))
        # Tvec = extri.read('T_{}'.format(cam))
        # assert Rvec is not None, cam
        # R = cv2.Rodrigues(Rvec)[0]

        R = np.array(camera_extrinsics[cam]['rotation'])
        # Translation: stored in millimetres; convert and apply the
        # per-capture scale (the commented line is the old hard-coded
        # 'longgang' variant, the active one the 'futian' variant).
        # longgang
        # Tvec = np.array(camera_extrinsics[cam]['translation'])[:, None] / 1000 * 100 / 65
        # futian
        Tvec = np.array(camera_extrinsics[cam]['translation'])[:, None] / 1000 * camera_scale

        # World-to-camera [R|t] and full projection matrix P = K [R|t].
        RT = np.hstack((R, Tvec))

        cams[updated_cam]['RT'] = RT
        cams[updated_cam]['R'] = R
        # cams[updated_cam]['Rvec'] = Rvec
        cams[updated_cam]['T'] = Tvec
        # cams[updated_cam]['center'] = - Rvec.T @ Tvec
        P[updated_cam] = cams[updated_cam]['K'] @ cams[updated_cam]['RT']
        cams[updated_cam]['P'] = P[updated_cam]

        # cams[updated_cam]['dist'] = np.array(camera_intrinsics['dist'])
        cams[updated_cam]['dist'] = None # dist for cv2.undistortPoint
    # cams['basenames'] = cam_names
    return cams
218
+
219
+
220
def read_camera_ours(intri_name, extri_name, cam_names=[], camera_scale=120 / 65):
    """Load cameras for our capture setup from JSON intrinsic/extrinsic files.

    Identical format to ``read_camera_mvhumannet`` but with the translation
    scale fixed for this rig by default. The historical hard-coded value
    ``120 / 65`` is now an overridable keyword argument (backward compatible).

    Args:
        intri_name: JSON file with one shared 3x3 ``intrinsics`` matrix.
        extri_name: JSON file mapping camera names to ``rotation`` (3x3) and
            ``translation`` (3-vector in millimetres).
        cam_names: unused; kept for signature compatibility.
        camera_scale: scale applied after the mm -> m conversion.

    Returns:
        dict mapping numeric camera suffix to a dict with ``K``, ``invK``,
        ``R``, ``T``, ``RT``, ``P`` and ``dist`` (always ``None``).
    """
    assert os.path.exists(intri_name), intri_name
    assert os.path.exists(extri_name), extri_name

    import json

    with open(intri_name, 'r') as f:
        camera_intrinsics = json.load(f)
    with open(extri_name, 'r') as f:
        camera_extrinsics = json.load(f)

    cams = {}
    for cam in camera_extrinsics.keys():
        # 'cam_01.xxx' -> '01': keep only the numeric suffix as the key.
        updated_cam = cam.split('.')[0].split('_')[-1]

        cams[updated_cam] = {}
        # A single intrinsic matrix shared by every camera.
        cams[updated_cam]['K'] = np.array(camera_intrinsics['intrinsics'])
        cams[updated_cam]['invK'] = np.linalg.inv(cams[updated_cam]['K'])

        R = np.array(camera_extrinsics[cam]['rotation'])
        # mm -> m, then the rig-specific normalisation scale.
        Tvec = np.array(camera_extrinsics[cam]['translation'])[:, None] / 1000 * camera_scale

        RT = np.hstack((R, Tvec))
        cams[updated_cam]['RT'] = RT
        cams[updated_cam]['R'] = R
        cams[updated_cam]['T'] = Tvec
        cams[updated_cam]['P'] = cams[updated_cam]['K'] @ RT

        # No distortion coefficients (dist for cv2.undistortPoints).
        cams[updated_cam]['dist'] = None
    return cams
285
+
286
+
287
+
288
def read_cameras(path, intri='intri.yml', extri='extri.yml', subs=[]):
    """Read every camera under ``path`` and optionally keep only ``subs``.

    Bug fix: the original comprehension called ``.astype(np.float32)`` on the
    per-camera *dict* (raising AttributeError whenever ``subs`` was
    non-empty); the cast now applies to the numpy arrays inside each
    camera record instead.
    """
    cameras = read_camera(join(path, intri), join(path, extri))
    cameras.pop('basenames')
    if len(subs) > 0:
        cameras = {
            key: {k: (v.astype(np.float32) if isinstance(v, np.ndarray) else v)
                  for k, v in cameras[key].items()}
            for key in subs
        }
    return cameras
294
+
295
def write_camera(camera, path):
    """Write cameras to OpenCV yml files ``intri.yml`` / ``extri.yml`` in ``path``.

    Fixes: drops the unused ``results`` local, and excludes the pseudo-key
    ``'basenames'`` from the written ``names`` list (the write loop already
    skipped it, but it still leaked into ``names``).
    """
    from os.path import join
    intri = FileStorage(join(path, 'intri.yml'), True)
    extri = FileStorage(join(path, 'extri.yml'), True)
    # Only real camera entries belong in the 'names' index.
    camnames = [key_.split('.')[0] for key_ in camera.keys() if key_ != 'basenames']
    intri.write('names', camnames, 'list')
    extri.write('names', camnames, 'list')
    for key_, val in camera.items():
        if key_ == 'basenames':
            continue
        key = key_.split('.')[0]
        intri.write('K_{}'.format(key), val['K'])
        intri.write('dist_{}'.format(key), val['dist'])
        if 'H' in val.keys() and 'W' in val.keys():
            intri.write('H_{}'.format(key), val['H'], dt='int')
            intri.write('W_{}'.format(key), val['W'], dt='int')
        # Extrinsics are stored both as a Rodrigues vector and as a matrix.
        if 'Rvec' not in val.keys():
            val['Rvec'] = cv2.Rodrigues(val['R'])[0]
        extri.write('R_{}'.format(key), val['Rvec'])
        extri.write('Rot_{}'.format(key), val['R'])
        extri.write('T_{}'.format(key), val['T'])
319
+
320
def camera_from_img(img):
    """Build a default pinhole camera for an image.

    Focal length is 1.2 * min(H, W) (colmap-style default), principal point
    at the image centre, identity pose, zero distortion. Returns a dict with
    'K', 'invK', 'R', 'T', 'dist' and the 3x4 projection 'P'.
    """
    h, w = img.shape[0], img.shape[1]
    f = 1.2 * min(h, w)  # colmap-style default focal
    K = np.array([[f, 0., w / 2],
                  [0., f, h / 2],
                  [0., 0., 1.]])
    camera = {
        'K': K,
        'R': np.eye(3),
        'T': np.zeros((3, 1)),
        'dist': np.zeros((1, 5)),
    }
    camera['invK'] = np.linalg.inv(camera['K'])
    camera['P'] = camera['K'] @ np.hstack((camera['R'], camera['T']))
    return camera
329
+
330
class Undistort:
    """Helpers to remove lens distortion from images, keypoints and bboxes."""

    # Cache of (mapx, mapy) remap grids keyed by camera name `sub`, so the
    # rectification map is computed only once per camera.
    distortMap = {}

    @classmethod
    def image(cls, frame, K, dist, sub=None, interp=cv2.INTER_NEAREST):
        """Undistort an image; when `sub` is given, reuse a cached remap grid."""
        if sub is None:
            return cv2.undistort(frame, K, dist, None)
        else:
            if sub not in cls.distortMap.keys():
                h, w = frame.shape[:2]
                mapx, mapy = cv2.initUndistortRectifyMap(K, dist, None, K, (w,h), 5)
                cls.distortMap[sub] = (mapx, mapy)
            mapx, mapy = cls.distortMap[sub]
            img = cv2.remap(frame, mapx, mapy, interp)
            return img

    @staticmethod
    def points(keypoints, K, dist):
        """Undistort 2D keypoints; columns beyond x,y (e.g. confidence) pass through."""
        # keypoints: (N, 3)
        assert len(keypoints.shape) == 2, keypoints.shape
        kpts = keypoints[:, None, :2]
        kpts = np.ascontiguousarray(kpts)
        # P=K keeps the result in pixel coordinates of the same intrinsics.
        kpts = cv2.undistortPoints(kpts, K, dist, P=K)
        keypoints = np.hstack([kpts[:, 0], keypoints[:, 2:]])
        return keypoints

    @staticmethod
    def bbox(bbox, K, dist):
        """Undistort a bbox [x0, y0, x1, y1, conf] by undistorting its two corners."""
        keypoints = np.array([[bbox[0], bbox[1], 1], [bbox[2], bbox[3], 1]])
        kpts = Undistort.points(keypoints, K, dist)
        bbox = np.array([kpts[0, 0], kpts[0, 1], kpts[1, 0], kpts[1, 1], bbox[4]])
        return bbox
361
+
362
class Distort:
    """Inverse of Undistort: re-apply lens distortion to undistorted coordinates."""

    @staticmethod
    def points(keypoints, K, dist):
        # Not implemented: returns None. Only `bbox` below is usable.
        pass

    @staticmethod
    def bbox(bbox, K, dist):
        """Re-apply distortion to an undistorted bbox [x0, y0, x1, y1, conf]."""
        keypoints = np.array([[bbox[0], bbox[1]], [bbox[2], bbox[3]]], dtype=np.float32)
        k3d = cv2.convertPointsToHomogeneous(keypoints)
        # Back-project corners to normalised camera rays, then re-project
        # through the distortion model with an identity pose.
        k3d = (np.linalg.inv(K) @ k3d[:, 0].T).T[:, None]
        k2d, _ = cv2.projectPoints(k3d, np.zeros((3,)), np.zeros((3,)), K, dist)
        k2d = k2d[:, 0]
        bbox = np.array([k2d[0,0], k2d[0,1], k2d[1, 0], k2d[1, 1], bbox[-1]])
        return bbox
376
+
377
def unproj(kpts, invK):
    """Back-project pixel keypoints to normalised camera coordinates.

    Uses invK on the (x, y) columns; any extra columns (e.g. confidence)
    are passed through unchanged.
    """
    pix_homo = np.hstack((kpts[:, :2], np.ones_like(kpts[:, :1])))
    rays = pix_homo @ invK.T
    return np.hstack((rays[:, :2], kpts[:, 2:]))
381
class UndistortFisheye:
    """Fisheye-model counterparts of the Undistort helpers (cv2.fisheye)."""

    @staticmethod
    def image(frame, K, dist):
        """Undistort a fisheye image; returns (undistorted_frame, new_K)."""
        Knew = K.copy()
        frame = cv2.fisheye.undistortImage(frame, K, dist, Knew=Knew)
        return frame, Knew

    @staticmethod
    def points(keypoints, K, dist, Knew):
        """Undistort fisheye 2D keypoints; extra columns pass through unchanged."""
        # keypoints: (N, 3)
        assert len(keypoints.shape) == 2, keypoints.shape
        kpts = keypoints[:, None, :2]
        kpts = np.ascontiguousarray(kpts)
        # P=Knew maps results into the pixel frame of the undistorted image.
        kpts = cv2.fisheye.undistortPoints(kpts, K, dist, P=Knew)
        keypoints = np.hstack([kpts[:, 0], keypoints[:, 2:]])
        return keypoints

    @staticmethod
    def bbox(bbox, K, dist, Knew):
        """Undistort a bbox [x0, y0, x1, y1, conf] via its two corner points."""
        keypoints = np.array([[bbox[0], bbox[1], 1], [bbox[2], bbox[3], 1]])
        kpts = UndistortFisheye.points(keypoints, K, dist, Knew)
        bbox = np.array([kpts[0, 0], kpts[0, 1], kpts[1, 0], kpts[1, 1], bbox[4]])
        return bbox
404
+
405
+
406
def get_Pall(cameras, camnames):
    """Stack the 3x4 projection matrices K @ [R|T] for the given camera names."""
    projections = []
    for name in camnames:
        cam = cameras[name]
        projections.append(cam['K'] @ np.hstack((cam['R'], cam['T'])))
    return np.stack(projections)
409
+
410
def get_fundamental_matrix(cameras, basenames):
    """Compute pairwise fundamental matrices for every ordered camera pair.

    Uses F = K_0^-T (R_0 R_1^T) K_1^T [t]_x with the baseline t expressed in
    the second camera's frame. An all-zero F (e.g. the i == j diagonal) is
    offset by 1e-12 to avoid NaNs downstream.

    Fix: the original pre-allocated an N x N x 3 x 3 ndarray that was
    immediately shadowed by the dict; the dead allocation is removed.

    Args:
        cameras: dict name -> {'K': 3x3, 'RT': 3x4, ...}.
        basenames: iterable of camera names.

    Returns:
        dict (icam, jcam) -> 3x3 fundamental matrix.
    """
    skew_op = lambda x: np.array([[0, -x[2], x[1]], [x[2], 0, -x[0]], [-x[1], x[0], 0]])
    fundamental_op = lambda K_0, R_0, T_0, K_1, R_1, T_1: np.linalg.inv(K_0).T @ (
        R_0 @ R_1.T) @ K_1.T @ skew_op(K_1 @ R_1 @ R_0.T @ (T_0 - R_0 @ R_1.T @ T_1))
    fundamental_RT_op = lambda K_0, RT_0, K_1, RT_1: fundamental_op(
        K_0, RT_0[:, :3], RT_0[:, 3], K_1, RT_1[:, :3], RT_1[:, 3])
    F = {(icam, jcam): np.zeros((3, 3)) for jcam in basenames for icam in basenames}
    for icam in basenames:
        for jcam in basenames:
            F[(icam, jcam)] += fundamental_RT_op(cameras[icam]['K'], cameras[icam]['RT'],
                                                 cameras[jcam]['K'], cameras[jcam]['RT'])
            if F[(icam, jcam)].sum() == 0:
                F[(icam, jcam)] += 1e-12  # to avoid nan
    return F
424
+
425
def interp_cameras(cameras, keys, step=20, loop=True, allstep=-1, **kwargs):
    """Generate interpolated cameras along the path keys[0] -> keys[1] -> ...

    Rotation is interpolated with quaternion slerp; the camera center is
    interpolated spherically (direction slerp plus a linearly interpolated
    radius), and K is interpolated linearly. When `loop` is True the path
    closes back to keys[0]. `allstep` (if != -1) fixes the total number of
    samples; otherwise `step` samples are produced per segment.

    Returns:
        dict '{left}-{right}-{i}' -> {'K', 'dist', 'R', 'T'} for each new camera.
    """
    from scipy.spatial.transform import Rotation as R
    from scipy.spatial.transform import Slerp
    if allstep != -1:
        tall = np.linspace(0., 1., allstep+1)[:-1].reshape(-1, 1, 1)
    elif allstep == -1 and loop:
        tall = np.linspace(0., 1., 1+step*len(keys))[:-1].reshape(-1, 1, 1)
    elif allstep == -1 and not loop:
        tall = np.linspace(0., 1., 1+step*(len(keys)-1))[:-1].reshape(-1, 1, 1)
    cameras_new = {}
    for ik in range(len(keys)):
        if ik == len(keys) -1 and not loop:
            break
        # Slice the global parameter range belonging to this segment.
        if loop:
            start, end = (ik * tall.shape[0])//len(keys), int((ik+1)*tall.shape[0])//len(keys)
            print(ik, start, end, tall.shape)
        else:
            start, end = (ik * tall.shape[0])//(len(keys)-1), int((ik+1)*tall.shape[0])//(len(keys)-1)
        t = tall[start:end].copy()
        # Re-normalise the segment parameter to [0, 1].
        t = (t-t.min())/(t.max()-t.min())
        left, right = keys[ik], keys[0 if ik == len(keys)-1 else ik + 1]
        camera_left = cameras[left]
        camera_right = cameras[right]
        # Interpolate the camera centers: center = - R.T @ T
        center_l = - camera_left['R'].T @ camera_left['T']
        center_r = - camera_right['R'].T @ camera_right['T']
        center_l, center_r = center_l[None], center_r[None]
        if False:
            # (disabled) plain linear interpolation of the centers
            centers = center_l * (1-t) + center_r * t
        else:
            # Spherical interpolation: slerp the unit directions, lerp the radius.
            norm_l, norm_r = np.linalg.norm(center_l), np.linalg.norm(center_r)
            center_l, center_r = center_l/norm_l, center_r/norm_r
            costheta = (center_l*center_r).sum()
            sintheta = np.sqrt(1. - costheta**2)
            theta = np.arctan2(sintheta, costheta)
            centers = (np.sin(theta*(1-t)) * center_l + np.sin(theta * t) * center_r)/sintheta
            norm = norm_l * (1-t) + norm_r * t
            centers = centers * norm
        key_rots = R.from_matrix(np.stack([camera_left['R'], camera_right['R']]))
        key_times = [0, 1]
        slerp = Slerp(key_times, key_rots)
        interp_rots = slerp(t.squeeze()).as_matrix()
        # Recover T from the interpolated pose: R @ X + T = 0  =>  T = - R @ center
        T = - np.einsum('bmn,bno->bmo', interp_rots, centers)
        K = camera_left['K'] * (1-t) + camera_right['K'] * t
        for i in range(T.shape[0]):
            cameras_new['{}-{}-{}'.format(left, right, i)] = \
                {
                    'K': K[i],
                    'dist': np.zeros((1, 5)),
                    'R': interp_rots[i],
                    'T': T[i]
                }
    return cameras_new
src/multiview_consist_edit/infer_tryon_multi.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PIL
2
+ from PIL import Image
3
+ import requests
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ import os
9
+ import random
10
+ import copy
11
+ import time
12
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
13
+ from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionInstructPix2PixPipeline, DDIMScheduler
14
+ from torchvision.utils import make_grid as make_image_grid
15
+ from torchvision.utils import save_image
16
+ from models.condition_encoder import FrozenOpenCLIPImageEmbedderV2
17
+ from omegaconf import OmegaConf
18
+ from pipelines.pipeline_tryon_multi import TryOnPipeline
19
+ from models.hack_poseguider import Hack_PoseGuider as PoseGuider
20
+
21
+ from models.ReferenceNet import ReferenceNet
22
+ from models.ReferenceEncoder import ReferenceEncoder
23
+
24
+ from data.Thuman2_multi import Thuman2_Dataset, collate_fn
25
+ # from data.Thuman2_multi_ps2 import Thuman2_Dataset, collate_fn
26
+ from data.MVHumanNet_multi import MVHumanNet_Dataset
27
+ from models.hack_unet2d import Hack_UNet2DConditionModel as UNet2DConditionModel
28
+
29
# Inference configuration (model paths, seed, batch size, sampler settings).
config = OmegaConf.load('config/infer_tryon_multi.yaml')

def main():
    """Run multi-view try-on inference and save results under config.out_dir.

    Loads the frozen fp16 UNet / VAE / ReferenceNet / PoseGuider / CLIP
    encoder, builds the TryOnPipeline, iterates the test dataloader, and for
    each garment writes the generated views plus a conditioning grid image.
    """
    # Seed every RNG used downstream so sampling is reproducible.
    seed = config.seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Dataset: the loader class is chosen from the dataroot path.
    infer_data_config = config.infer_data
    if 'mvhumannet' in infer_data_config['dataroot']:
        infer_dataset = MVHumanNet_Dataset(**infer_data_config)
        print('using mvhumannet')
    else:
        infer_dataset = Thuman2_Dataset(**infer_data_config)
        print('using Thuman2_Dataset')

    batch_size = config.batch_size  # NOTE(review): unused; config.batch_size is read directly below

    test_dataloader = torch.utils.data.DataLoader(
        infer_dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=config.batch_size,
        num_workers=config.dataloader_num_workers,
    )

    # Frozen fp16 model components, all placed on the GPU.
    unet = UNet2DConditionModel.from_pretrained(
        config.unet_path, subfolder="unet",torch_dtype=torch.float16
    ).to("cuda")

    vae= AutoencoderKL.from_pretrained(
        config.vae_path,torch_dtype=torch.float16
    ).to("cuda")

    referencenet = ReferenceNet.from_pretrained(
        config.pretrained_referencenet_path, subfolder="referencenet",torch_dtype=torch.float16
    ).to("cuda")

    pose_guider = PoseGuider.from_pretrained(pretrained_model_path=config.pretrained_poseguider_path).to("cuda", dtype=torch.float16)
    pose_guider.eval()
    scheduler = DDIMScheduler.from_pretrained(config.model_path, subfolder='scheduler')

    pipe = TryOnPipeline(pose_guider=pose_guider, referencenet=referencenet, vae=vae, unet=unet, scheduler=scheduler)
    pipe.enable_xformers_memory_efficient_attention()

    # CLIP vision encoder producing the garment reference features.
    clip_image_encoder = ReferenceEncoder(model_path=config.clip_model_path).to(device='cuda',dtype=torch.float16)

    # Replace the loaded scheduler with a fixed DDIM configuration.
    pipe.scheduler = DDIMScheduler(
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
    )
    generator = torch.Generator("cuda").manual_seed(seed)

    # Output directory for generated images.
    out_dir = config.out_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    num_inference_steps = config.num_inference_steps
    guidance_scale = config.guidance_scale
    weight_dtype = torch.float16

    image_idx = 0
    for i, batch in enumerate(test_dataloader):

        # Batch layout: pixel_values appears to be (B, V, C, H, W) with V
        # views per sample (multi_length = shape[1] below), while the garment
        # reference images are per-sample (B, C, H, W) — TODO confirm against
        # the dataset's collate_fn.
        pixel_values = batch["pixel_values"]
        pixel_values_pose = batch["pixel_values_pose"].to(device='cuda')
        pixel_values_agnostic = batch["pixel_values_agnostic"].to(device='cuda')
        clip_ref_front = batch["clip_ref_front"].to(device='cuda')
        clip_ref_back = batch["clip_ref_back"].to(device='cuda')
        pixel_values_ref_front = batch["pixel_values_ref_front"].to(device='cuda')
        pixel_values_ref_back = batch["pixel_values_ref_back"].to(device='cuda')
        camera_pose = batch["camera_parm"]
        # CLIP features of the front/back garment references.
        front_dino_fea = clip_image_encoder(clip_ref_front.to(weight_dtype))
        back_dino_fea = clip_image_encoder(clip_ref_back.to(weight_dtype))
        img_name = batch["img_name"]
        cloth_name = batch["cloth_name"]
        multi_length = pixel_values.shape[1]
        print(img_name)
        edited_images = pipe(
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            front_image=pixel_values_ref_front.to(weight_dtype),
            back_image=pixel_values_ref_back.to(weight_dtype),
            pose_image=pixel_values_pose.to(weight_dtype),
            camera_pose=camera_pose,
            agnostic_image=pixel_values_agnostic.to(weight_dtype),
            generator=generator,
            front_dino_fea = front_dino_fea,
            back_dino_fea = back_dino_fea,
        ).images

        for batch_idx in range(config.batch_size):

            for image_idx in range(multi_length):
                # Pipeline output is flattened over (batch, view).
                total_idx = batch_idx*multi_length + image_idx
                edited_image = edited_images[total_idx]
                # PIL -> CHW float tensor in [0, 1].
                edited_image = torch.tensor(np.array(edited_image)).permute(2,0,1) / 255.0
                # Side-by-side grid: GT, result, pose, agnostic, front ref, back ref.
                grid = make_image_grid([(pixel_values[batch_idx][image_idx].cpu() / 2 + 0.5),edited_image.cpu(), (pixel_values_pose[batch_idx][image_idx].cpu() / 2 + 0.5),
                                        (pixel_values_agnostic[batch_idx][image_idx].cpu() / 2 + 0.5), (pixel_values_ref_front[batch_idx].cpu() / 2 + 0.5),(pixel_values_ref_back[batch_idx].cpu() / 2 + 0.5)], nrow=2)
                # Sanitise names: slashes -> underscores; keep only the garment id.
                img_name[total_idx] = img_name[total_idx].replace('/','_')
                cloth_name[batch_idx] = cloth_name[batch_idx].split('/')[-1].split('_')[0]
                print(img_name[total_idx], cloth_name[batch_idx])
                # One sub-directory per garment.
                sub_cloth_root = os.path.join(out_dir, cloth_name[batch_idx])
                if not os.path.exists(sub_cloth_root):
                    os.makedirs(sub_cloth_root)
                save_image(edited_image, os.path.join(out_dir, cloth_name[batch_idx], img_name[total_idx]))
                save_image(grid, os.path.join(out_dir, cloth_name[batch_idx], 'cond_'+img_name[total_idx]))
                print(out_dir, cloth_name[batch_idx], img_name[total_idx])
                image_idx +=1


if __name__ == "__main__":
    main()
src/multiview_consist_edit/models/ReferenceEncoder.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from PIL import Image
4
+ from transformers import CLIPProcessor, CLIPVisionModel, CLIPImageProcessor
5
+ from transformers import logging
6
+ logging.set_verbosity_warning()
7
+ logging.set_verbosity_error()
8
+
9
+ # https://github.com/tencent-ailab/IP-Adapter/blob/main/tutorial_train_plus.py#L49
10
+
11
class ReferenceEncoder(nn.Module):
    """Frozen CLIP vision tower used as a reference-image feature extractor.

    forward() returns the full last-layer token sequence
    (outputs.last_hidden_state), not the pooled CLS embedding.
    Pattern follows IP-Adapter's tutorial_train_plus image encoder.
    """

    def __init__(self, model_path="openai/clip-vit-base-patch32"):
        super(ReferenceEncoder, self).__init__()
        self.model = CLIPVisionModel.from_pretrained(model_path, local_files_only=False)
        self.freeze()

    def freeze(self):
        # Pure feature extractor: eval mode, gradients disabled everywhere.
        self.model = self.model.eval()
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, pixel_values):
        # Per-patch hidden states of the final layer; pooled output unused.
        return self.model(pixel_values).last_hidden_state
30
+
31
+
32
+
33
+
34
class ReferenceEncoder2(nn.Module):
    """Frozen CLIP vision tower with a built-in preprocessor.

    Unlike ReferenceEncoder, forward() accepts PIL image(s), runs the CLIP
    processor, and returns the pooled (CLS) embedding of shape [batch, hidden].

    Fix: removed the leftover debug ``print`` statements from the forward path.
    """

    def __init__(self, model_path="openai/clip-vit-base-patch32"):
        super(ReferenceEncoder2, self).__init__()
        self.model = CLIPVisionModel.from_pretrained(model_path, local_files_only=True)
        self.processor = CLIPProcessor.from_pretrained(model_path, local_files_only=True)
        self.freeze()

    def freeze(self):
        # Feature extractor only: eval mode, no gradients.
        self.model = self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, image):
        # Preprocess PIL image(s) into pixel tensors, then encode.
        inputs = self.processor(images=image, return_tensors="pt")
        outputs = self.model(**inputs)
        return outputs.pooler_output
57
+
58
+ # # example
59
+ # model = ReferenceEncoder2(model_path='/root/autodl-tmp/Open-AnimateAnyone/pretrained_models/clip-vit-base-patch32')
60
+ # image_path = "../test.png"
61
+ # # image_path = "/mnt/f/research/HumanVideo/AnimateAnyone-unofficial/DWPose/0001.png"
62
+ # image = Image.open(image_path).convert('RGB')
63
+ # image = [image,image]
64
+
65
+ # pooled_output = model(image)
66
+
67
+ # print(f"Pooled Output Size: {pooled_output.size()}") # Pooled Output Size: torch.Size([bs, 768])
src/multiview_consist_edit/models/ReferenceNet.py ADDED
@@ -0,0 +1,1146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+ import os
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import UNet2DConditionLoadersMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+ from diffusers.models.activations import get_activation
25
+ from diffusers.models.attention_processor import (
26
+ ADDED_KV_ATTENTION_PROCESSORS,
27
+ CROSS_ATTENTION_PROCESSORS,
28
+ AttentionProcessor,
29
+ AttnAddedKVProcessor,
30
+ AttnProcessor,
31
+ )
32
+ from diffusers.models.lora import LoRALinearLayer
33
+ from diffusers.models.embeddings import (
34
+ GaussianFourierProjection,
35
+ ImageHintTimeEmbedding,
36
+ ImageProjection,
37
+ ImageTimeEmbedding,
38
+ PositionNet,
39
+ TextImageProjection,
40
+ TextImageTimeEmbedding,
41
+ TextTimeEmbedding,
42
+ TimestepEmbedding,
43
+ Timesteps,
44
+ )
45
+ from diffusers.models.modeling_utils import ModelMixin
46
+ from diffusers.models.unet_2d_blocks import (
47
+ UNetMidBlock2DCrossAttn,
48
+ UNetMidBlock2DSimpleCrossAttn,
49
+ get_down_block,
50
+ get_up_block,
51
+ )
52
+
53
+
54
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
55
+
56
+
57
class Identity(torch.nn.Module):
    r"""Argument-insensitive pass-through module.

    Accepts and ignores any constructor arguments (including ``scale``) and
    any extra forward arguments, returning its input unchanged — a drop-in
    no-op replacement for modules with richer signatures.

    Shape:
        - Input: :math:`(*)`, any number of dimensions.
        - Output: :math:`(*)`, identical to the input.

    Example::

        >>> m = Identity(54, unused_argument1=0.1, unused_argument2=False)
        >>> m(torch.randn(128, 20)).shape
        torch.Size([128, 20])
    """

    def __init__(self, scale=None, *args, **kwargs) -> None:
        # All arguments are deliberately discarded.
        super(Identity, self).__init__()

    def forward(self, input, *args, **kwargs):
        # Extra positional/keyword arguments are ignored.
        return input
82
+
83
+
84
+
85
class _LoRACompatibleLinear(nn.Module):
    """
    A Linear layer that can be used with LoRA.

    NOTE(review): this local override is a no-op stub. It preserves the
    LoRA API surface (set_lora_layer / _fuse_lora / _unfuse_lora), but
    `forward` returns its input unchanged — effectively disabling the
    layer wherever this class is substituted in.
    """

    def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs):
        super().__init__(*args, **kwargs)
        self.lora_layer = lora_layer

    def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
        # Stored only for API compatibility; never used by forward().
        self.lora_layer = lora_layer

    def _fuse_lora(self):
        # Intentionally a no-op: there are no LoRA weights to fuse here.
        pass

    def _unfuse_lora(self):
        # Intentionally a no-op.
        pass

    def forward(self, hidden_states, scale=None, lora_scale: int = 1):
        # Pass-through: `scale` and `lora_scale` are ignored.
        return hidden_states
105
+
106
+
107
@dataclass
class UNet2DConditionOutput(BaseOutput):
    """
    The output of [`UNet2DConditionModel`].

    Args:
        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    # Defaults to None so the dataclass can be constructed empty and filled later.
    sample: torch.FloatTensor = None
118
+
119
+
120
+ class ReferenceNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
121
+ r"""
122
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
123
+ shaped output.
124
+
125
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
126
+ for all models (such as downloading or saving).
127
+
128
+ Parameters:
129
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
130
+ Height and width of input/output sample.
131
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
132
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
133
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
134
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
135
+ Whether to flip the sin to cos in the time embedding.
136
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
137
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
138
+ The tuple of downsample blocks to use.
139
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
140
+ Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or
141
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
142
        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
            Whether to include self-attention in the basic transformer blocks, see
            [`~models.attention.BasicTransformerBlock`].
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
        encoder_hid_dim (`int`, *optional*, defaults to None):
            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
            dimension to `cross_attention_dim`.
        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
        num_attention_heads (`int`, *optional*):
            The number of attention heads. If not defined, defaults to `attention_head_dim`.
        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
        class_embed_type (`str`, *optional*, defaults to `None`):
            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
        addition_embed_type (`str`, *optional*, defaults to `None`):
            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
            "text". "text" will use the `TextTimeEmbedding` layer.
        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
            Dimension for the timestep embeddings.
        num_class_embeds (`int`, *optional*, defaults to `None`):
            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
            class conditioning with `class_embed_type` equal to `None`.
        time_embedding_type (`str`, *optional*, defaults to `positional`):
            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
        time_embedding_dim (`int`, *optional*, defaults to `None`):
            An optional override for the dimension of the projected time embedding.
        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
            Optional activation function to use only once on the time embeddings before they are passed to the rest of
            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
        timestep_post_act (`str`, *optional*, defaults to `None`):
            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
            The dimension of `cond_proj` layer in the timestep embedding.
        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
            embeddings with the class embeddings.
        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
            otherwise.
    """
+
208
    # Enables torch gradient checkpointing support via `_set_gradient_checkpointing` below.
    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 4,
        out_channels: int = 4,
        center_input_sample: bool = False,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "DownBlock2D",
        ),
        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        layers_per_block: Union[int, Tuple[int]] = 2,
        downsample_padding: int = 1,
        mid_block_scale_factor: float = 1,
        act_fn: str = "silu",
        norm_num_groups: Optional[int] = 32,
        norm_eps: float = 1e-5,
        cross_attention_dim: Union[int, Tuple[int]] = 1280,
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        encoder_hid_dim: Optional[int] = None,
        encoder_hid_dim_type: Optional[str] = None,
        attention_head_dim: Union[int, Tuple[int]] = 8,
        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        class_embed_type: Optional[str] = None,
        addition_embed_type: Optional[str] = None,
        addition_time_embed_dim: Optional[int] = None,
        num_class_embeds: Optional[int] = None,
        upcast_attention: bool = False,
        resnet_time_scale_shift: str = "default",
        resnet_skip_time_act: bool = False,
        resnet_out_scale_factor: int = 1.0,
        time_embedding_type: str = "positional",
        time_embedding_dim: Optional[int] = None,
        time_embedding_act_fn: Optional[str] = None,
        timestep_post_act: Optional[str] = None,
        time_cond_proj_dim: Optional[int] = None,
        conv_in_kernel: int = 3,
        conv_out_kernel: int = 3,
        projection_class_embeddings_input_dim: Optional[int] = None,
        attention_type: str = "default",
        class_embeddings_concat: bool = False,
        mid_block_only_cross_attention: Optional[bool] = None,
        cross_attention_norm: Optional[str] = None,
        addition_embed_type_num_heads=64,
    ):
        """Construct the UNet: input conv, time/class/addition embeddings, down/mid/up
        blocks, then a project-specific surgery that strips the last up-block's final
        cross-attention layer (see the block of assignments near the end).

        Parameter semantics are documented in the class docstring. Note that unlike the
        upstream diffusers `UNet2DConditionModel`, the final `conv_norm_out`/`conv_out`
        layers are deliberately NOT created here (the code is commented out below).
        """
        super().__init__()

        self.sample_size = sample_size

        if num_attention_heads is not None:
            raise ValueError(
                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
            )

        # If `num_attention_heads` is not defined (which is the case for most models)
        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
        # The reason for this behavior is to correct for incorrectly named variables that were introduced
        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
        # which is why we correct for the naming here.
        num_attention_heads = num_attention_heads or attention_head_dim

        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
            )

        if len(block_out_channels) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
            )

        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
            )

        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
            raise ValueError(
                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
            )

        # input
        conv_in_padding = (conv_in_kernel - 1) // 2
        self.conv_in = nn.Conv2d(
            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
        )

        # time
        if time_embedding_type == "fourier":
            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
            if time_embed_dim % 2 != 0:
                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
            self.time_proj = GaussianFourierProjection(
                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
            )
            timestep_input_dim = time_embed_dim
        elif time_embedding_type == "positional":
            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4

            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
            timestep_input_dim = block_out_channels[0]
        else:
            raise ValueError(
                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
            )

        self.time_embedding = TimestepEmbedding(
            timestep_input_dim,
            time_embed_dim,
            act_fn=act_fn,
            post_act_fn=timestep_post_act,
            cond_proj_dim=time_cond_proj_dim,
        )

        # Infer the hidden-state projection type when only the dimension was given.
        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
            encoder_hid_dim_type = "text_proj"
            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")

        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
            raise ValueError(
                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
            )

        if encoder_hid_dim_type == "text_proj":
            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
                image_embed_dim=cross_attention_dim,
                cross_attention_dim=cross_attention_dim,
            )
        elif encoder_hid_dim_type == "image_proj":
            # Kandinsky 2.2
            self.encoder_hid_proj = ImageProjection(
                image_embed_dim=encoder_hid_dim,
                cross_attention_dim=cross_attention_dim,
            )
        elif encoder_hid_dim_type is not None:
            raise ValueError(
                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
            )
        else:
            self.encoder_hid_proj = None

        # class embedding
        if class_embed_type is None and num_class_embeds is not None:
            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
        elif class_embed_type == "timestep":
            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
        elif class_embed_type == "identity":
            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
        elif class_embed_type == "projection":
            if projection_class_embeddings_input_dim is None:
                raise ValueError(
                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
                )
            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
            # 2. it projects from an arbitrary input dimension.
            #
            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
        elif class_embed_type == "simple_projection":
            if projection_class_embeddings_input_dim is None:
                raise ValueError(
                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
                )
            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
        else:
            self.class_embedding = None

        if addition_embed_type == "text":
            if encoder_hid_dim is not None:
                text_time_embedding_from_dim = encoder_hid_dim
            else:
                text_time_embedding_from_dim = cross_attention_dim

            self.add_embedding = TextTimeEmbedding(
                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
            )
        elif addition_embed_type == "text_image":
            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
            )
        elif addition_embed_type == "text_time":
            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
        elif addition_embed_type == "image":
            # Kandinsky 2.2
            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
        elif addition_embed_type == "image_hint":
            # Kandinsky 2.2 ControlNet
            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
        elif addition_embed_type is not None:
            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

        if time_embedding_act_fn is None:
            self.time_embed_act = None
        else:
            self.time_embed_act = get_activation(time_embedding_act_fn)

        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])

        if isinstance(only_cross_attention, bool):
            if mid_block_only_cross_attention is None:
                mid_block_only_cross_attention = only_cross_attention

            only_cross_attention = [only_cross_attention] * len(down_block_types)

        if mid_block_only_cross_attention is None:
            mid_block_only_cross_attention = False

        # Broadcast scalar per-block settings to one value per down block.
        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

        if isinstance(layers_per_block, int):
            layers_per_block = [layers_per_block] * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

        if class_embeddings_concat:
            # The time embeddings are concatenated with the class embeddings. The dimension of the
            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
            # regular time embeddings
            blocks_time_embed_dim = time_embed_dim * 2
        else:
            blocks_time_embed_dim = time_embed_dim

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block[i],
                transformer_layers_per_block=transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=num_attention_heads[i],
                downsample_padding=downsample_padding,
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
                upcast_attention=upcast_attention,
                resnet_time_scale_shift=resnet_time_scale_shift,
                attention_type=attention_type,
                resnet_skip_time_act=resnet_skip_time_act,
                resnet_out_scale_factor=resnet_out_scale_factor,
                cross_attention_norm=cross_attention_norm,
                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            )
            self.down_blocks.append(down_block)

        # mid
        if mid_block_type == "UNetMidBlock2DCrossAttn":
            self.mid_block = UNetMidBlock2DCrossAttn(
                transformer_layers_per_block=transformer_layers_per_block[-1],
                in_channels=block_out_channels[-1],
                temb_channels=blocks_time_embed_dim,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                output_scale_factor=mid_block_scale_factor,
                resnet_time_scale_shift=resnet_time_scale_shift,
                cross_attention_dim=cross_attention_dim[-1],
                num_attention_heads=num_attention_heads[-1],
                resnet_groups=norm_num_groups,
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                upcast_attention=upcast_attention,
                attention_type=attention_type,
            )
        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
                in_channels=block_out_channels[-1],
                temb_channels=blocks_time_embed_dim,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                output_scale_factor=mid_block_scale_factor,
                cross_attention_dim=cross_attention_dim[-1],
                attention_head_dim=attention_head_dim[-1],
                resnet_groups=norm_num_groups,
                resnet_time_scale_shift=resnet_time_scale_shift,
                skip_time_act=resnet_skip_time_act,
                only_cross_attention=mid_block_only_cross_attention,
                cross_attention_norm=cross_attention_norm,
            )
        elif mid_block_type is None:
            self.mid_block = None
        else:
            raise ValueError(f"unknown mid_block_type : {mid_block_type}")

        # count how many layers upsample the images
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_num_attention_heads = list(reversed(num_attention_heads))
        reversed_layers_per_block = list(reversed(layers_per_block))
        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
        only_cross_attention = list(reversed(only_cross_attention))

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=reversed_layers_per_block[i] + 1,
                transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=blocks_time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=reversed_cross_attention_dim[i],
                num_attention_heads=reversed_num_attention_heads[i],
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
                upcast_attention=upcast_attention,
                resnet_time_scale_shift=resnet_time_scale_shift,
                attention_type=attention_type,
                resnet_skip_time_act=resnet_skip_time_act,
                resnet_out_scale_factor=resnet_out_scale_factor,
                cross_attention_norm=cross_attention_norm,
                # NOTE(review): `attention_head_dim[i]` is indexed in forward order here while the
                # other per-block settings use reversed lists; this mirrors upstream diffusers
                # (harmless for symmetric configs) -- confirm if an asymmetric config is ever used.
                attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel


        # # out
        # if norm_num_groups is not None:
        #     self.conv_norm_out = nn.GroupNorm(
        #         num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
        #     )

        #     self.conv_act = get_activation(act_fn)

        # else:
        #     self.conv_norm_out = None
        #     self.conv_act = None

        # conv_out_padding = (conv_out_kernel - 1) // 2
        # self.conv_out = nn.Conv2d(
        #     block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
        # )

        # Diff vs diffusers-0.21.4/src/diffusers/models/unet_2d_condition.py
        # skip last cross attention for slight acceleration and for DDP training
        # The following parameters (cross-attention for the last layer)
        # and conv_out are not involved in the gradient calculation of the model
        self.up_blocks[3].attentions[2].transformer_blocks[0].attn1.to_q = _LoRACompatibleLinear()
        self.up_blocks[3].attentions[2].transformer_blocks[0].attn1.to_k = _LoRACompatibleLinear()
        self.up_blocks[3].attentions[2].transformer_blocks[0].attn1.to_v = _LoRACompatibleLinear()
        self.up_blocks[3].attentions[2].transformer_blocks[0].attn1.to_out = nn.ModuleList([Identity(), Identity()])
        self.up_blocks[3].attentions[2].transformer_blocks[0].norm2 = Identity()
        self.up_blocks[3].attentions[2].transformer_blocks[0].attn2 = None
        self.up_blocks[3].attentions[2].transformer_blocks[0].norm3 = Identity()
        self.up_blocks[3].attentions[2].transformer_blocks[0].ff = Identity()
        self.up_blocks[3].attentions[2].proj_out = Identity()

        if attention_type in ["gated", "gated-text-image"]:
            positive_len = 768
            if isinstance(cross_attention_dim, int):
                positive_len = cross_attention_dim
            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
                positive_len = cross_attention_dim[0]

            feature_type = "text-only" if attention_type == "gated" else "text-image"
            self.position_net = PositionNet(
                positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
            )
+ @property
653
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
654
+ r"""
655
+ Returns:
656
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
657
+ indexed by its weight name.
658
+ """
659
+ # set recursively
660
+ processors = {}
661
+
662
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
663
+ if hasattr(module, "get_processor"):
664
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
665
+
666
+ for sub_name, child in module.named_children():
667
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
668
+
669
+ return processors
670
+
671
+ for name, module in self.named_children():
672
+ fn_recursive_add_processors(name, module, processors)
673
+
674
+ return processors
675
+
676
+ def set_attn_processor(
677
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]
678
+ ):
679
+ r"""
680
+ Sets the attention processor to use to compute attention.
681
+
682
+ Parameters:
683
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
684
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
685
+ for **all** `Attention` layers.
686
+
687
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
688
+ processor. This is strongly recommended when setting trainable attention processors.
689
+
690
+ """
691
+ count = len(self.attn_processors.keys())
692
+
693
+ if isinstance(processor, dict) and len(processor) != count:
694
+ raise ValueError(
695
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
696
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
697
+ )
698
+
699
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
700
+ if hasattr(module, "set_processor"):
701
+ if not isinstance(processor, dict):
702
+ module.set_processor(processor)
703
+ else:
704
+ module.set_processor(processor.pop(f"{name}.processor"))
705
+
706
+ for sub_name, child in module.named_children():
707
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
708
+
709
+ for name, module in self.named_children():
710
+ fn_recursive_attn_processor(name, module, processor)
711
+
712
+ def set_default_attn_processor(self):
713
+ """
714
+ Disables custom attention processors and sets the default attention implementation.
715
+ """
716
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
717
+ processor = AttnAddedKVProcessor()
718
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
719
+ processor = AttnProcessor()
720
+ else:
721
+ raise ValueError(
722
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
723
+ )
724
+
725
+ self.set_attn_processor(processor)
726
+
727
+ def set_attention_slice(self, slice_size):
728
+ r"""
729
+ Enable sliced attention computation.
730
+
731
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
732
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
733
+
734
+ Args:
735
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
736
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
737
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
738
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
739
+ must be a multiple of `slice_size`.
740
+ """
741
+ sliceable_head_dims = []
742
+
743
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
744
+ if hasattr(module, "set_attention_slice"):
745
+ sliceable_head_dims.append(module.sliceable_head_dim)
746
+
747
+ for child in module.children():
748
+ fn_recursive_retrieve_sliceable_dims(child)
749
+
750
+ # retrieve number of attention layers
751
+ for module in self.children():
752
+ fn_recursive_retrieve_sliceable_dims(module)
753
+
754
+ num_sliceable_layers = len(sliceable_head_dims)
755
+
756
+ if slice_size == "auto":
757
+ # half the attention head size is usually a good trade-off between
758
+ # speed and memory
759
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
760
+ elif slice_size == "max":
761
+ # make smallest slice possible
762
+ slice_size = num_sliceable_layers * [1]
763
+
764
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
765
+
766
+ if len(slice_size) != len(sliceable_head_dims):
767
+ raise ValueError(
768
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
769
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
770
+ )
771
+
772
+ for i in range(len(slice_size)):
773
+ size = slice_size[i]
774
+ dim = sliceable_head_dims[i]
775
+ if size is not None and size > dim:
776
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
777
+
778
+ # Recursively walk through all the children.
779
+ # Any children which exposes the set_attention_slice method
780
+ # gets the message
781
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
782
+ if hasattr(module, "set_attention_slice"):
783
+ module.set_attention_slice(slice_size.pop())
784
+
785
+ for child in module.children():
786
+ fn_recursive_set_attention_slice(child, slice_size)
787
+
788
+ reversed_slice_size = list(reversed(slice_size))
789
+ for module in self.children():
790
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
791
+
792
+ def _set_gradient_checkpointing(self, module, value=False):
793
+ if hasattr(module, "gradient_checkpointing"):
794
+ module.gradient_checkpointing = value
795
+
796
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
        mid_block_additional_residual: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNet2DConditionOutput, Tuple]:
        r"""
        The [`UNet2DConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            encoder_attention_mask (`torch.Tensor`):
                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
                which adds large negative values to the attention scores corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
            added_cond_kwargs: (`dict`, *optional*):
                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
                are passed along to the UNet blocks.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        """
        # By default samples have to be at least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # Convert a boolean/0-1 attention mask into an additive bias
        # (kept positions -> 0, discarded positions -> -10000).
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None:
            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 0. center input if necessary (map [0, 1] -> [-1, 1])
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)
        aug_emb = None

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

                # `Timesteps` does not contain any weights and will always return f32 tensors
                # there might be better ways to encapsulate this.
                class_labels = class_labels.to(dtype=sample.dtype)

            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

            if self.config.class_embeddings_concat:
                emb = torch.cat([emb, class_emb], dim=-1)
            else:
                emb = emb + class_emb

        # Additional conditioning embeddings (model-family specific).
        if self.config.addition_embed_type == "text":
            aug_emb = self.add_embedding(encoder_hidden_states)
        elif self.config.addition_embed_type == "text_image":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )

            image_embs = added_cond_kwargs.get("image_embeds")
            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
            aug_emb = self.add_embedding(text_embs, image_embs)
        elif self.config.addition_embed_type == "text_time":
            # SDXL - style
            if "text_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                )
            text_embeds = added_cond_kwargs.get("text_embeds")
            if "time_ids" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                )
            time_ids = added_cond_kwargs.get("time_ids")
            time_embeds = self.add_time_proj(time_ids.flatten())
            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))

            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
            add_embeds = add_embeds.to(emb.dtype)
            aug_emb = self.add_embedding(add_embeds)
        elif self.config.addition_embed_type == "image":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            aug_emb = self.add_embedding(image_embs)
        elif self.config.addition_embed_type == "image_hint":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            hint = added_cond_kwargs.get("hint")
            aug_emb, hint = self.add_embedding(image_embs, hint)
            # hint is concatenated onto the latent channels
            sample = torch.cat([sample, hint], dim=1)

        emb = emb + aug_emb if aug_emb is not None else emb

        if self.time_embed_act is not None:
            emb = self.time_embed_act(emb)

        # Optionally project the text/image conditioning into the UNet's
        # cross-attention dimension.
        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
            # Kadinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
                )

            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
                )
            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
        # 2. pre-process
        sample = self.conv_in(sample)

        # 2.5 GLIGEN position net
        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
            # copy so the caller's dict is not mutated by the pop below
            cross_attention_kwargs = cross_attention_kwargs.copy()
            gligen_args = cross_attention_kwargs.pop("gligen")
            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}

        # 3. down

        # ControlNet passes both mid and down residuals; T2I-Adapter passes
        # only down residuals.
        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                # For t2i-adapter CrossAttnDownBlock2D
                additional_residuals = {}
                # NOTE: pop(0) mutates the caller-supplied residual list.
                if is_adapter and len(down_block_additional_residuals) > 0:
                    additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)

                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                    **additional_residuals,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

                if is_adapter and len(down_block_additional_residuals) > 0:
                    sample += down_block_additional_residuals.pop(0)

            down_block_res_samples += res_samples

        if is_controlnet:
            # add the ControlNet residuals to each skip connection
            new_down_block_res_samples = ()

            for down_block_res_sample, down_block_additional_residual in zip(
                down_block_res_samples, down_block_additional_residuals
            ):
                down_block_res_sample = down_block_res_sample + down_block_additional_residual
                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )
            # To support T2I-Adapter-XL
            if (
                is_adapter
                and len(down_block_additional_residuals) > 0
                and sample.shape == down_block_additional_residuals[0].shape
            ):
                sample += down_block_additional_residuals.pop(0)

        if is_controlnet:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            # consume the skip connections produced by the matching down block
            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    upsample_size=upsample_size
                )

        if not return_dict:
            return (sample,)

        return UNet2DConditionOutput(sample=sample)
1089
+
1090
+ @classmethod
1091
+ def load_referencenet(cls, pretrained_model_path):
1092
+ print(f"loaded ReferenceNet's pretrained weights from {pretrained_model_path} ...")
1093
+
1094
+ config = {
1095
+ "_class_name": "UNet2DConditionModel",
1096
+ "_diffusers_version": "0.6.0",
1097
+ "act_fn": "silu",
1098
+ "attention_head_dim": 8,
1099
+ "block_out_channels": [320, 640, 1280, 1280],
1100
+ "center_input_sample": False,
1101
+ "cross_attention_dim": 768,
1102
+ "down_block_types": [
1103
+ "CrossAttnDownBlock2D",
1104
+ "CrossAttnDownBlock2D",
1105
+ "CrossAttnDownBlock2D",
1106
+ "DownBlock2D"
1107
+ ],
1108
+ "downsample_padding": 1,
1109
+ "flip_sin_to_cos": True,
1110
+ "freq_shift": 0,
1111
+ "in_channels": 4,
1112
+ "layers_per_block": 2,
1113
+ "mid_block_scale_factor": 1,
1114
+ "norm_eps": 1e-05,
1115
+ "norm_num_groups": 32,
1116
+ "out_channels": 4,
1117
+ "sample_size": 64,
1118
+ "up_block_types": [
1119
+ "UpBlock2D",
1120
+ "CrossAttnUpBlock2D",
1121
+ "CrossAttnUpBlock2D",
1122
+ "CrossAttnUpBlock2D"
1123
+ ]
1124
+ }
1125
+
1126
+ # from diffusers.utils import WEIGHTS_NAME
1127
+ model = cls.from_config(config)
1128
+ # model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
1129
+ model_file = pretrained_model_path
1130
+
1131
+ if not os.path.isfile(model_file):
1132
+ raise RuntimeError(f"{model_file} does not exist")
1133
+ state_dict = torch.load(model_file, map_location="cpu")
1134
+
1135
+ m, u = model.load_state_dict(state_dict, strict=False)
1136
+ if m or u:
1137
+ print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
1138
+ print(f"### missing keys:\n{m}\n### unexpected keys:\n{u}\n")
1139
+
1140
+ # params = [p.numel() for n, p in model.named_parameters() if "2D" in n]
1141
+ # print(f"### 2D Module Parameters: {sum(params) / 1e6} M")
1142
+
1143
+ params = [p.numel() for n, p in model.named_parameters()]
1144
+ print(f"### Module Parameters: {sum(params) / 1e6} M")
1145
+
1146
+ return model
src/multiview_consist_edit/models/ReferenceNet_attention_multi_fp16.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/magic-research/magic-animate/blob/main/magicanimate/models/mutual_self_attention.py
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from einops import rearrange
7
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
+
9
+ from diffusers.models.attention import BasicTransformerBlock
10
+ from .attention import BasicTransformerBlock as _BasicTransformerBlock
11
+
12
def torch_dfs(model: torch.nn.Module):
    """Return *model* followed by all descendant modules in pre-order (DFS)."""
    return [model] + [node for child in model.children() for node in torch_dfs(child)]
17
+
18
+
19
class ReferenceNetAttention():
    """Mutual self-attention bridge between a ReferenceNet ("write") and the
    denoising UNet ("read").

    Each `BasicTransformerBlock` in the wrapped UNet gets its `forward`
    monkey-patched: in "write" mode the block stores its pre-attention hidden
    states in a per-module `bank`; in "read" mode the block concatenates the
    banked reference features into its self-attention keys/values.
    Adapted from magic-animate's mutual_self_attention.py.
    """

    def __init__(self,
                 unet,
                 mode="write",
                 do_classifier_free_guidance=False,
                 attention_auto_machine_weight = float('inf'),
                 gn_auto_machine_weight = 1.0,
                 style_fidelity = 1.0,
                 reference_attn=True,
                 fusion_blocks="full",
                 batch_size=1,
                 is_image=False,
                 ) -> None:
        # 10. Modify self attention and group norm
        self.unet = unet
        assert mode in ["read", "write"]
        assert fusion_blocks in ["midup", "full"]
        self.reference_attn = reference_attn
        self.fusion_blocks = fusion_blocks
        # NOTE(review): `fusion_blocks` is passed as the 7th *positional*
        # argument below, so it actually binds to `register_reference_hooks`'s
        # `dtype` parameter, not to its `fusion_blocks` parameter. Harmless
        # today because both locals are unused inside the hooks (the hooks read
        # `self.fusion_blocks` instead) — confirm before relying on either.
        self.register_reference_hooks(
            mode,
            do_classifier_free_guidance,
            attention_auto_machine_weight,
            gn_auto_machine_weight,
            style_fidelity,
            reference_attn,
            fusion_blocks,
            batch_size=batch_size,
            is_image=is_image,
        )

    def register_reference_hooks(
            self,
            mode,
            do_classifier_free_guidance,
            attention_auto_machine_weight,
            gn_auto_machine_weight,
            style_fidelity,
            reference_attn,
            dtype=torch.float16,
            batch_size=1,
            num_images_per_prompt=1,
            device=torch.device("cpu"),
            fusion_blocks='midup',
            is_image=False,
    ):
        """Patch every fused `BasicTransformerBlock.forward` with the
        read/write logic below and attach an empty feature `bank` to each."""
        # Bind the arguments to locals so the closures below capture them.
        MODE = mode
        do_classifier_free_guidance = do_classifier_free_guidance
        attention_auto_machine_weight = attention_auto_machine_weight
        gn_auto_machine_weight = gn_auto_machine_weight
        style_fidelity = style_fidelity
        reference_attn = reference_attn
        fusion_blocks = fusion_blocks
        num_images_per_prompt = num_images_per_prompt
        dtype=dtype

        def fully_self_attn(self, hidden_states, norm_hidden_states, attention_mask, garment_fea_attn=True):
            """Self-attention over all frames jointly, optionally with the
            banked garment (reference) features appended to the sequence.
            `self` here is the patched transformer block."""
            # The garment features were never rearranged, so their leading dim
            # is the true batch size b (no need to merge b and f into bf).
            b = self.bank[0].shape[0]
            p,l,c = norm_hidden_states.shape
            f = p//b  # number of frames/views folded into the batch dim
            # Fold all frames of a sample into one long token sequence.
            norm_hidden_states = rearrange(norm_hidden_states, "(b f) l c -> b (f l) c",b=b)
            # add front view and back view feature
            if garment_fea_attn:
                # self.bank[0] = self.bank[0][0].unsqueeze(0)
                # self.bank[1] = self.bank[1][0].unsqueeze(0)
                # print('check2', norm_hidden_states.shape, self.bank[0].shape)
                modify_norm_hidden_states = torch.cat([norm_hidden_states] + self.bank, dim=1)
            else:
                modify_norm_hidden_states = norm_hidden_states

            # NOTE(review): `garment_fea_attn` is forwarded into attn1 — this
            # assumes the project-local Attention accepts that kwarg; confirm.
            hidden_states_uc = self.attn1(modify_norm_hidden_states,
                                          encoder_hidden_states=modify_norm_hidden_states,
                                          attention_mask=attention_mask,garment_fea_attn=garment_fea_attn)
            # Drop the appended reference tokens; keep only the f*l query tokens.
            hidden_states_uc = hidden_states_uc[:, :(f*l), :]
            hidden_states_uc = rearrange(hidden_states_uc, "b (f l) c -> (b f) l c", b=b, f=f)
            hidden_states_uc = hidden_states_uc + hidden_states  # residual
            return hidden_states_uc

        def hacked_basic_transformer_inner_forward(
            self,
            hidden_states: torch.FloatTensor,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
            timestep: Optional[torch.LongTensor] = None,
            cross_attention_kwargs: Dict[str, Any] = None,
            class_labels: Optional[torch.LongTensor] = None,
            video_length=None,
        ):
            """Replacement for `BasicTransformerBlock.forward`; behavior
            depends on the captured MODE ("write" banks features, "read"
            attends over them)."""
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm1(hidden_states, timestep)
            elif self.use_ada_layer_norm_zero:
                norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                    hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
                )
            else:
                norm_hidden_states = self.norm1(hidden_states)

            # 1. Self-Attention
            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
            if self.only_cross_attention:
                attn_output = self.attn1(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
                    attention_mask=attention_mask,
                    **cross_attention_kwargs,
                )
            else:
                if MODE == "write":
                    # Bank the normalized hidden states for a later "read" pass.
                    self.bank.append(norm_hidden_states.clone())
                    attn_output = self.attn1(
                        norm_hidden_states,
                        encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
                        attention_mask=attention_mask,
                        **cross_attention_kwargs,
                    )
                if MODE == "read":
                    if not is_image:
                        # Repeat the banked reference features per frame so they
                        # align with the (b*f) batch layout of hidden_states.
                        self.bank = [rearrange(d.unsqueeze(1).repeat(1, video_length, 1, 1), "b t l c -> (b t) l c")[:hidden_states.shape[0]] for d in self.bank]

                    # revise here
                    if True:  # must always be True here; the False branch is the older image-level code path
                        if do_classifier_free_guidance:
                            _uc_mask_top = (
                                torch.Tensor([1] * (hidden_states.shape[0]//2) + [0] * (hidden_states.shape[0]//2))
                                .to(device)
                                .bool()
                            )
                            _uc_mask_bottom = (
                                torch.Tensor([0] * (hidden_states.shape[0]//2) + [1] * (hidden_states.shape[0]//2))
                                .to(device)
                                .bool()
                            )
                            # First half of the batch is unconditional, second half is conditional.
                            # Only the conditional half attends to the garment features.
                            hidden_states_uc = norm_hidden_states.clone()
                            hidden_states_uc[_uc_mask_top] = fully_self_attn(self, hidden_states[_uc_mask_top], norm_hidden_states[_uc_mask_top], attention_mask, garment_fea_attn=False)
                            hidden_states_uc[_uc_mask_bottom] = fully_self_attn(self, hidden_states[_uc_mask_bottom], norm_hidden_states[_uc_mask_bottom], attention_mask, garment_fea_attn=True)
                            hidden_states = hidden_states_uc.clone()
                        else:
                            hidden_states_uc = fully_self_attn(self, hidden_states, norm_hidden_states, attention_mask, garment_fea_attn=True)
                            hidden_states = hidden_states_uc.clone()

                    else:
                        # modify Reference Sec 3.2.2 (dead branch, kept for reference)
                        modify_norm_hidden_states = torch.cat([norm_hidden_states] + self.bank, dim=1)

                        hidden_states_uc = self.attn1(modify_norm_hidden_states,
                                            encoder_hidden_states=modify_norm_hidden_states,
                                            attention_mask=attention_mask)[:,:hidden_states.shape[-2],:] + hidden_states

                        hidden_states_c = hidden_states_uc.clone()
                        # NOTE(review): `uc_mask` is not defined anywhere in this
                        # scope — this branch would raise NameError if it ever ran.
                        _uc_mask = uc_mask.clone()
                        if do_classifier_free_guidance:
                            if hidden_states.shape[0] != _uc_mask.shape[0]:
                                _uc_mask = (
                                    torch.Tensor([1] * (hidden_states.shape[0]//2) + [0] * (hidden_states.shape[0]//2))
                                    .to(device)
                                    .bool()
                                )
                            # print('111111', _uc_mask.shape, norm_hidden_states.shape)
                            hidden_states_c[_uc_mask] = self.attn1(
                                norm_hidden_states[_uc_mask],
                                encoder_hidden_states=norm_hidden_states[_uc_mask],
                                attention_mask=attention_mask,
                            ) + hidden_states[_uc_mask]
                        hidden_states = hidden_states_c.clone()

                    # self.bank.clear()  # bank is cleared externally via `clear()`

                    if self.attn2 is not None:
                        # Cross-Attention
                        norm_hidden_states = (
                            self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                        )
                        hidden_states = (
                            self.attn2(
                                norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
                            )
                            + hidden_states
                        )

                    # Feed-forward
                    hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

                    # Temporal-Attention
                    if not is_image:
                        if self.unet_use_temporal_attention:
                            d = hidden_states.shape[1]
                            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
                            norm_hidden_states = (
                                self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
                            )
                            hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
                            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)

                    # "read" mode handles attn2/ff/temporal itself and exits here.
                    return hidden_states

            # --- original (non-"read") tail of BasicTransformerBlock.forward ---
            if self.use_ada_layer_norm_zero:
                attn_output = gate_msa.unsqueeze(1) * attn_output
            hidden_states = attn_output + hidden_states

            if self.attn2 is not None:
                norm_hidden_states = (
                    self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
                )

                # 2. Cross-Attention
                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states = attn_output + hidden_states

            # 3. Feed-forward
            norm_hidden_states = self.norm3(hidden_states)

            if self.use_ada_layer_norm_zero:
                norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

            ff_output = self.ff(norm_hidden_states)

            if self.use_ada_layer_norm_zero:
                ff_output = gate_mlp.unsqueeze(1) * ff_output

            hidden_states = ff_output + hidden_states

            return hidden_states

        if self.reference_attn:
            # Collect the transformer blocks to patch (diffusers' and the
            # project-local BasicTransformerBlock variants).
            if self.fusion_blocks == "midup":
                attn_modules = [module for module in (torch_dfs(self.unet.mid_block)+torch_dfs(self.unet.up_blocks)) if isinstance(module, BasicTransformerBlock) or isinstance(module, _BasicTransformerBlock)]
            elif self.fusion_blocks == "full":
                attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock) or isinstance(module, _BasicTransformerBlock)]
            # Sort by descending hidden size so reader/writer module lists align.
            attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0])

            for i, module in enumerate(attn_modules):
                module._original_inner_forward = module.forward
                module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
                module.bank = []
                module.attn_weight = float(i) / float(len(attn_modules))

    def update(self, writer, dtype=torch.float16):
        """Copy the writer's banked reference features into this reader's
        matching transformer blocks (cast to `dtype`)."""
        if self.reference_attn:
            if self.fusion_blocks == "midup":
                reader_attn_modules = [module for module in (torch_dfs(self.unet.mid_block)+torch_dfs(self.unet.up_blocks)) if isinstance(module, _BasicTransformerBlock)]
                writer_attn_modules = [module for module in (torch_dfs(writer.unet.mid_block)+torch_dfs(writer.unet.up_blocks)) if isinstance(module, BasicTransformerBlock)]
            elif self.fusion_blocks == "full":
                reader_attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, _BasicTransformerBlock) or isinstance(module, BasicTransformerBlock)]
                writer_attn_modules = [module for module in torch_dfs(writer.unet) if isinstance(module, _BasicTransformerBlock) or isinstance(module, BasicTransformerBlock)]
            # Same descending-hidden-size order as in register_reference_hooks,
            # so zip() pairs corresponding reader/writer blocks.
            reader_attn_modules = sorted(reader_attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
            writer_attn_modules = sorted(writer_attn_modules, key=lambda x: -x.norm1.normalized_shape[0])

            if len(reader_attn_modules) == 0:
                print('reader_attn_modules is null')
                assert False
            if len(writer_attn_modules) == 0:
                print('writer_attn_modules is null')
                assert False

            for r, w in zip(reader_attn_modules, writer_attn_modules):
                r.bank = [v.clone().to(dtype) for v in w.bank]
                # w.bank.clear()  # writer bank intentionally kept

    def clear(self):
        """Empty the feature banks of all patched transformer blocks."""
        if self.reference_attn:
            if self.fusion_blocks == "midup":
                reader_attn_modules = [module for module in (torch_dfs(self.unet.mid_block)+torch_dfs(self.unet.up_blocks)) if isinstance(module, BasicTransformerBlock) or isinstance(module, _BasicTransformerBlock)]
            elif self.fusion_blocks == "full":
                reader_attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock) or isinstance(module, _BasicTransformerBlock)]
            reader_attn_modules = sorted(reader_attn_modules, key=lambda x: -x.norm1.normalized_shape[0])
            for r in reader_attn_modules:
                r.bank.clear()
src/multiview_consist_edit/models/attention.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # *************************************************************************
2
+ # This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
3
+ # difications”). All Bytedance Inc.'s Modifications are Copyright (2023) B-
4
+ # ytedance Inc..
5
+ # *************************************************************************
6
+
7
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from dataclasses import dataclass
21
+ from typing import Optional
22
+
23
+ import torch
24
+ import torch.nn.functional as F
25
+ from torch import nn
26
+
27
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
28
+ from diffusers.models.modeling_utils import ModelMixin
29
+ from diffusers.utils import BaseOutput
30
+ from diffusers.utils.import_utils import is_xformers_available
31
+ from diffusers.models.attention import FeedForward, AdaLayerNorm
32
+ from diffusers.models.attention import Attention as CrossAttention
33
+
34
+ from einops import rearrange, repeat
35
+
36
+ @dataclass
37
+ class Transformer3DModelOutput(BaseOutput):
38
+ sample: torch.FloatTensor
39
+
40
+
41
# xformers is optional: import it when available, otherwise bind the name to
# None so downstream code can check for availability at runtime.
if is_xformers_available():
    import xformers
    import xformers.ops
else:
    xformers = None
46
+
47
+
48
class Transformer3DModel(ModelMixin, ConfigMixin):
    """Spatial transformer applied frame-wise to 5-D video latents.

    The input `(b, c, f, h, w)` is folded to `(b*f, c, h, w)`, normalized and
    projected to `inner_dim`, run through a stack of `BasicTransformerBlock`s
    (which receive `video_length` for cross-frame handling), projected back,
    and unfolded to 5-D with a residual connection.
    """

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
        attention_head_dim: int = 88,
        in_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
        attention_bias: bool = False,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        use_linear_projection: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,

        unet_use_cross_frame_attention=None,
        unet_use_temporal_attention=None,
    ):
        super().__init__()
        self.use_linear_projection = use_linear_projection
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim

        # Define input layers
        self.in_channels = in_channels

        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        if use_linear_projection:
            self.proj_in = nn.Linear(in_channels, inner_dim)
        else:
            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)

        # Define transformers blocks
        self.transformer_blocks = nn.ModuleList(
            [
                BasicTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    num_embeds_ada_norm=num_embeds_ada_norm,
                    attention_bias=attention_bias,
                    only_cross_attention=only_cross_attention,
                    upcast_attention=upcast_attention,

                    unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                    unet_use_temporal_attention=unet_use_temporal_attention,
                )
                for d in range(num_layers)
            ]
        )

        # 4. Define output layers
        # NOTE(review): in the linear branch proj_out maps
        # in_channels -> inner_dim rather than inner_dim -> in_channels; this
        # is only shape-consistent when inner_dim == in_channels — confirm.
        if use_linear_projection:
            self.proj_out = nn.Linear(in_channels, inner_dim)
        else:
            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
        """Run the transformer over every frame of a video latent.

        Args:
            hidden_states: tensor of shape `(batch, channel, frames, height, width)`.
            encoder_hidden_states: conditioning of shape `(batch, seq, dim)`,
                repeated per frame when its batch dim does not already match.
                NOTE(review): despite the `None` default, a `None` value fails
                at the `.shape` access below — confirm callers always pass it.
            timestep: optional timestep forwarded to the blocks (AdaLayerNorm).
            return_dict: if True, wrap the output in `Transformer3DModelOutput`.
        """
        # Input
        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
        video_length = hidden_states.shape[2]
        # Fold frames into the batch dimension.
        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
        # JH: need not repeat when a list of prompts are given
        if encoder_hidden_states.shape[0] != hidden_states.shape[0]:
            encoder_hidden_states = repeat(encoder_hidden_states, 'b n c -> (b f) n c', f=video_length)

        batch, channel, height, weight = hidden_states.shape
        residual = hidden_states

        hidden_states = self.norm(hidden_states)
        # Conv projection runs in (b, c, h, w); the linear projection runs
        # after flattening spatial positions into tokens.
        if not self.use_linear_projection:
            hidden_states = self.proj_in(hidden_states)
            inner_dim = hidden_states.shape[1]
            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
        else:
            inner_dim = hidden_states.shape[1]
            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
            hidden_states = self.proj_in(hidden_states)

        # Blocks
        for block in self.transformer_blocks:
            hidden_states = block(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                timestep=timestep,
                video_length=video_length
            )

        # Output (mirror image of the input projection ordering)
        if not self.use_linear_projection:
            hidden_states = (
                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
            )
            hidden_states = self.proj_out(hidden_states)
        else:
            hidden_states = self.proj_out(hidden_states)
            hidden_states = (
                hidden_states.reshape(batch, height, weight, inner_dim).permute(0, 3, 1, 2).contiguous()
            )

        output = hidden_states + residual

        # Unfold frames back out of the batch dimension.
        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
        if not return_dict:
            return (output,)

        return Transformer3DModelOutput(sample=output)
162
+
163
+
164
class BasicTransformerBlock(nn.Module):
    """Transformer block: self-attention, optional cross-attention,
    feed-forward, and optional temporal attention over video frames.

    ``unet_use_cross_frame_attention`` selects a sparse-causal attention that
    attends across frames; ``unet_use_temporal_attention`` appends an
    attention pass over the frame axis. Both flags must be explicitly set
    (asserted non-None below).
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        upcast_attention: bool = False,

        unet_use_cross_frame_attention = None,
        unet_use_temporal_attention = None,
    ):
        super().__init__()
        self.only_cross_attention = only_cross_attention
        # Timestep-conditioned AdaLayerNorm is used whenever an embedding
        # count is supplied; otherwise plain LayerNorm.
        self.use_ada_layer_norm = num_embeds_ada_norm is not None
        self.unet_use_cross_frame_attention = unet_use_cross_frame_attention
        self.unet_use_temporal_attention = unet_use_temporal_attention

        # SC-Attn: first attention (sparse-causal cross-frame or plain self-attn)
        assert unet_use_cross_frame_attention is not None
        if unet_use_cross_frame_attention:
            self.attn1 = SparseCausalAttention2D(
                query_dim=dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                cross_attention_dim=cross_attention_dim if only_cross_attention else None,
                upcast_attention=upcast_attention,
            )
        else:
            self.attn1 = CrossAttention(
                query_dim=dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )
        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)

        # Cross-Attn: only created when a conditioning dimension is given.
        if cross_attention_dim is not None:
            self.attn2 = CrossAttention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )
        else:
            self.attn2 = None

        if cross_attention_dim is not None:
            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
        else:
            self.norm2 = None

        # Feed-forward
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.norm3 = nn.LayerNorm(dim)
        self.use_ada_layer_norm_zero = False

        # Temp-Attn: attention over the frame axis. The output projection is
        # zero-initialized so the block starts out as an identity mapping.
        assert unet_use_temporal_attention is not None
        if unet_use_temporal_attention:
            self.attn_temp = CrossAttention(
                query_dim=dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
            )
            nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
            self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)

    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool, *args, **kwargs):
        # Validate that xformers is installed and a GPU is present, and do a
        # smoke-test call, before toggling the flag on attn1/attn2.
        if not is_xformers_available():
            print("Here is how to install it")
            raise ModuleNotFoundError(
                "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
                " xformers",
                name="xformers",
            )
        elif not torch.cuda.is_available():
            raise ValueError(
                "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
                " available for GPU "
            )
        else:
            try:
                # Make sure we can run the memory efficient attention
                _ = xformers.ops.memory_efficient_attention(
                    torch.randn((1, 2, 40), device="cuda"),
                    torch.randn((1, 2, 40), device="cuda"),
                    torch.randn((1, 2, 40), device="cuda"),
                )
            except Exception as e:
                raise e
        self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
        if self.attn2 is not None:
            self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
        # self.attn_temp._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers

    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None):
        """Apply self-attn, cross-attn (if configured), feed-forward, and
        optional temporal attention, each with a residual connection.

        hidden_states is (batch*frames, tokens, dim); video_length is the
        frame count used by the cross-frame / temporal paths.
        """
        # SparseCausal-Attention
        norm_hidden_states = (
            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
        )

        # if self.only_cross_attention:
        #     hidden_states = (
        #         self.attn1(norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask) + hidden_states
        #     )
        # else:
        #     hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states

        if self.unet_use_cross_frame_attention:
            # Cross-frame attention additionally needs the frame count.
            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
        else:
            hidden_states = self.attn1(norm_hidden_states, attention_mask=attention_mask) + hidden_states

        if self.attn2 is not None:
            # Cross-Attention against the conditioning tokens.
            norm_hidden_states = (
                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
            )
            hidden_states = (
                self.attn2(
                    norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
                )
                + hidden_states
            )

        # Feed-forward
        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states

        # Temporal-Attention: fold the spatial axis into the batch, attend
        # over frames, then restore the (b f) d c layout.
        if self.unet_use_temporal_attention:
            d = hidden_states.shape[1]
            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
            norm_hidden_states = (
                self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
            )
            hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)

        return hidden_states
src/multiview_consist_edit/models/condition_encoder.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import kornia
4
+ import open_clip
5
+ from torch.utils.checkpoint import checkpoint
6
+ from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
7
+ # from lvdm.common import autocast
8
+ # from utils.utils import count_params
9
+
10
+ # from https://github.com/Doubiiu/DynamiCrafter/blob/main/lvdm/modules/encoders/condition.py
11
+
12
class AbstractEncoder(nn.Module):
    """Base class for conditioning encoders; subclasses implement encode()."""

    def __init__(self):
        super().__init__()

    def encode(self, *args, **kwargs):
        # Concrete encoders must override this.
        raise NotImplementedError
18
+
19
+
20
class IdentityEncoder(AbstractEncoder):
    """Pass-through encoder: returns the condition unchanged."""

    def encode(self, x):
        # Useful when the condition is already in the target representation.
        return x
23
+
24
+
25
class ClassEmbedder(nn.Module):
    """Embed integer class labels for cross-attention conditioning.

    During training, each label is replaced with probability ``ucg_rate`` by
    the last class index (reserved as the unconditional class), implementing
    classifier-free-guidance label dropout.
    """

    def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
        super().__init__()
        self.key = key
        self.embedding = nn.Embedding(n_classes, embed_dim)
        self.n_classes = n_classes
        self.ucg_rate = ucg_rate

    def forward(self, batch, key=None, disable_dropout=False):
        lookup_key = self.key if key is None else key
        # Add a token axis so the result can be used as cross-attn context.
        c = batch[lookup_key][:, None]
        if self.ucg_rate > 0. and not disable_dropout:
            # Randomly swap labels for the dedicated unconditional class.
            keep = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
            c = keep * c + (1 - keep) * torch.ones_like(c) * (self.n_classes - 1)
            c = c.long()
        return self.embedding(c)

    def get_unconditional_conditioning(self, bs, device="cuda"):
        # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
        uc_class = self.n_classes - 1
        uc = torch.ones((bs,), device=device) * uc_class
        return {self.key: uc}
50
+
51
+
52
def disabled_train(self, mode=True):
    """Replacement for ``nn.Module.train`` that ignores mode changes.

    Assign this to a frozen module's ``train`` attribute so later calls to
    ``.train()`` / ``.eval()`` no longer toggle its training state.
    """
    return self
56
+
57
+
58
class FrozenT5Embedder(AbstractEncoder):
    """Frozen T5 text encoder; returns the last hidden states for a prompt."""

    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77,
                 freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
        super().__init__()
        self.tokenizer = T5Tokenizer.from_pretrained(version)
        self.transformer = T5EncoderModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length  # TODO: typical value?
        if freeze:
            self.freeze()

    def freeze(self):
        """Put the transformer in eval mode and stop all gradients."""
        self.transformer = self.transformer.eval()
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, text):
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].to(self.device)
        return self.transformer(input_ids=token_ids).last_hidden_state

    def encode(self, text):
        return self(text)
88
+
89
+
90
class FrozenCLIPEmbedder(AbstractEncoder):
    """Frozen HuggingFace CLIP text encoder.

    ``layer`` selects the returned representation: the final hidden states
    ("last"), the pooled output ("pooled"), or an intermediate hidden state
    ("hidden", chosen via ``layer_idx``).
    """
    LAYERS = [
        "last",
        "pooled",
        "hidden"
    ]

    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
                 freeze=True, layer="last", layer_idx=None):  # clip-vit-base-patch32
        super().__init__()
        assert layer in self.LAYERS
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        self.transformer = CLIPTextModel.from_pretrained(version)
        self.device = device
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        self.layer_idx = layer_idx
        if layer == "hidden":
            assert layer_idx is not None
            assert 0 <= abs(layer_idx) <= 12

    def freeze(self):
        """Put the transformer in eval mode and stop all gradients."""
        self.transformer = self.transformer.eval()
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, text):
        encoded = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_length=True,
            return_overflowing_tokens=False,
            padding="max_length",
            return_tensors="pt",
        )
        token_ids = encoded["input_ids"].to(self.device)
        # Hidden states are only materialized when an intermediate layer is requested.
        outputs = self.transformer(input_ids=token_ids, output_hidden_states=self.layer == "hidden")
        if self.layer == "last":
            return outputs.last_hidden_state
        if self.layer == "pooled":
            return outputs.pooler_output[:, None, :]
        return outputs.hidden_states[self.layer_idx]

    def encode(self, text):
        return self(text)
135
+
136
+
137
class ClipImageEmbedder(nn.Module):
    """Image embedder built on the original OpenAI CLIP implementation.

    Inputs are expected in [-1, 1]; they are resized to 224x224 and
    normalized with CLIP statistics before encoding.
    """

    def __init__(
            self,
            model,
            jit=False,
            device='cuda' if torch.cuda.is_available() else 'cpu',
            antialias=True,
            ucg_rate=0.
    ):
        super().__init__()
        from clip import load as load_clip
        self.model, _ = load_clip(name=model, device=device, jit=jit)

        self.antialias = antialias

        # CLIP preprocessing statistics (kept out of the state dict).
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
        self.ucg_rate = ucg_rate

    def preprocess(self, x):
        # Resize, map [-1, 1] -> [0, 1], then apply CLIP normalization.
        x = kornia.geometry.resize(x, (224, 224),
                                   interpolation='bicubic', align_corners=True,
                                   antialias=self.antialias)
        x = (x + 1.) / 2.
        return kornia.enhance.normalize(x, self.mean, self.std)

    def forward(self, x, no_dropout=False):
        # x is assumed to be in range [-1,1]
        out = self.model.encode_image(self.preprocess(x))
        out = out.to(x.dtype)
        if self.ucg_rate > 0. and not no_dropout:
            # Zero whole embeddings at random (classifier-free guidance dropout).
            out = torch.bernoulli((1. - self.ucg_rate) * torch.ones(out.shape[0], device=out.device))[:, None] * out
        return out
173
+
174
+
175
class FrozenOpenCLIPEmbedder(AbstractEncoder):
    """
    Uses the OpenCLIP transformer encoder for text
    """
    LAYERS = [
        # "pooled",
        "last",
        "penultimate"
    ]

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
                 freeze=True, layer="last"):
        super().__init__()
        assert layer in self.LAYERS
        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version)
        # Only the text tower is needed; drop the vision tower to save memory.
        del model.visual
        self.model = model

        self.device = device
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        # layer_idx counts resblocks to skip from the end of the stack.
        if self.layer == "last":
            self.layer_idx = 0
        elif self.layer == "penultimate":
            self.layer_idx = 1
        else:
            raise NotImplementedError()

    def freeze(self):
        """Put the model in eval mode and stop all gradients."""
        self.model = self.model.eval()
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, text):
        tokens = open_clip.tokenize(text)  ## all clip models use 77 as context length
        return self.encode_with_transformer(tokens.to(self.device))

    def encode_with_transformer(self, text):
        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + self.model.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        return self.model.ln_final(x)

    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
        blocks = self.model.transformer.resblocks
        for i, block in enumerate(blocks):
            # Stop early when a non-final layer was requested.
            if i == len(blocks) - self.layer_idx:
                break
            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
                x = checkpoint(block, x, attn_mask)
            else:
                x = block(x, attn_mask=attn_mask)
        return x

    def encode(self, text):
        return self(text)
236
+
237
+
238
class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
    """
    Uses the OpenCLIP vision transformer encoder for images
    """

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
                 freeze=True, layer="pooled", antialias=True, ucg_rate=0.):
        super().__init__()
        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
                                                            pretrained=version, )
        # Only the vision tower is needed; drop the text transformer.
        del model.transformer
        self.model = model
        self.device = device
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == "penultimate":
            raise NotImplementedError()
            self.layer_idx = 1

        self.antialias = antialias

        # CLIP preprocessing statistics (kept out of the state dict).
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
        self.ucg_rate = ucg_rate

    def preprocess(self, x):
        # Resize, map [-1, 1] -> [0, 1], then apply CLIP normalization.
        x = kornia.geometry.resize(x, (224, 224),
                                   interpolation='bicubic', align_corners=True,
                                   antialias=self.antialias)
        x = (x + 1.) / 2.
        return kornia.enhance.normalize(x, self.mean, self.std)

    def freeze(self):
        """Put the model in eval mode and stop all gradients."""
        self.model = self.model.eval()
        for p in self.model.parameters():
            p.requires_grad = False

    # @autocast
    def forward(self, image, no_dropout=False):
        z = self.encode_with_vision_transformer(image)
        if self.ucg_rate > 0. and not no_dropout:
            # Zero whole embeddings at random (classifier-free guidance dropout).
            z = torch.bernoulli((1. - self.ucg_rate) * torch.ones(z.shape[0], device=z.device))[:, None] * z
        return z

    def encode_with_vision_transformer(self, img):
        return self.model.visual(self.preprocess(img))

    def encode(self, text):
        # NOTE(review): the argument is an image despite the parameter name.
        return self(text)
295
+
296
class FrozenOpenCLIPImageEmbedderV2(AbstractEncoder):
    """
    Uses the OpenCLIP vision transformer encoder for images.

    Unlike FrozenOpenCLIPImageEmbedder, this variant drives the ViT layers
    manually and returns the full token sequence (class token + patch
    tokens) before any pooling or final projection.
    """

    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda",
                 freeze=True, layer="pooled", antialias=True, model_path=None):
        super().__init__()

        # model_path allows loading weights from a local checkpoint instead
        # of the named pretrained tag.
        if model_path is None:
            model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
                                                                pretrained=version, )
        else:
            model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
                                                                pretrained=model_path)
        # Only the vision tower is used; drop the text transformer to save memory.
        del model.transformer
        self.model = model
        self.device = device

        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == "penultimate":
            raise NotImplementedError()
            self.layer_idx = 1  # unreachable (kept from the original)

        self.antialias = antialias

        # CLIP preprocessing statistics (kept out of the state dict).
        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)


    def preprocess(self, x):
        # Resize to CLIP's input size; input is assumed to be in [-1, 1].
        x = kornia.geometry.resize(x, (224, 224),
                                   interpolation='bicubic', align_corners=True,
                                   antialias=self.antialias)
        x = (x + 1.) / 2.
        # renormalize according to clip
        x = kornia.enhance.normalize(x, self.mean, self.std)
        return x

    def freeze(self):
        # Eval mode + no gradients for the whole vision tower.
        self.model = self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, image, no_dropout=False):
        ## image: b c h w   (no_dropout is accepted for API parity but unused here)
        z = self.encode_with_vision_transformer(image)
        return z

    def encode_with_vision_transformer(self, x):
        x = self.preprocess(x)

        # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
        if self.model.visual.input_patchnorm:
            # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
            x = x.reshape(x.shape[0], x.shape[1], self.model.visual.grid_size[0], self.model.visual.patch_size[0], self.model.visual.grid_size[1], self.model.visual.patch_size[1])
            x = x.permute(0, 2, 4, 1, 3, 5)
            x = x.reshape(x.shape[0], self.model.visual.grid_size[0] * self.model.visual.grid_size[1], -1)
            x = self.model.visual.patchnorm_pre_ln(x)
            x = self.model.visual.conv1(x)
        else:
            x = self.model.visual.conv1(x)  # shape = [*, width, grid, grid]
            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]

        # class embeddings and positional embeddings
        x = torch.cat(
            [self.model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.model.visual.positional_embedding.to(x.dtype)

        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
        x = self.model.visual.patch_dropout(x)
        x = self.model.visual.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.model.visual.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        return x
379
+
380
class FrozenCLIPT5Encoder(AbstractEncoder):
    """Joint text encoder returning both CLIP and T5 embeddings for a prompt."""

    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
                 clip_max_length=77, t5_max_length=77):
        super().__init__()
        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)

        # Count parameters inline: the original called `count_params`, whose
        # import (`from utils.utils import count_params`) is commented out at
        # the top of this file, so constructing this class raised NameError.
        def _count_params(model):
            return sum(p.numel() for p in model.parameters())

        print(f"{self.clip_encoder.__class__.__name__} has {_count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, "
              f"{self.t5_encoder.__class__.__name__} comes with {_count_params(self.t5_encoder) * 1.e-6:.2f} M params.")

    def encode(self, text):
        return self(text)

    def forward(self, text):
        # Returns the pair [clip_embeddings, t5_embeddings].
        clip_z = self.clip_encoder.encode(text)
        t5_z = self.t5_encoder.encode(text)
        return [clip_z, t5_z]
src/multiview_consist_edit/models/embeddings.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # *************************************************************************
2
+ # This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
3
+ # difications”). All Bytedance Inc.'s Modifications are Copyright (2023) B-
4
+ # ytedance Inc..
5
+ # *************************************************************************
6
+
7
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ import math
21
+ from typing import Optional
22
+
23
+ import numpy as np
24
+ import torch
25
+ from torch import nn
26
+
27
+
28
def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
):
    """Build sinusoidal timestep embeddings (DDPM-style).

    Args:
        timesteps: 1-D tensor of N (possibly fractional) timestep indices.
        embedding_dim: size of each output embedding vector.
        flip_sin_to_cos: when True, order the halves as [cos, sin] instead
            of the default [sin, cos].
        downscale_freq_shift: offset applied to the frequency denominator.
        scale: multiplier applied to the raw angle values.
        max_period: controls the minimum frequency of the embeddings.

    Returns:
        Tensor of shape [N, embedding_dim].
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    # Geometric frequency ladder from 1 down to ~1/max_period.
    freqs = torch.exp(
        -math.log(max_period)
        * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
        / (half_dim - downscale_freq_shift)
    )
    angles = scale * (timesteps[:, None].float() * freqs[None, :])

    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
    # Odd embedding dims get one zero column of padding on the right.
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
69
+
70
+
71
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """Return 2D sin-cos positional embeddings for a square grid.

    Output shape is [grid_size*grid_size, embed_dim], with ``extra_tokens``
    zero rows prepended when ``cls_token`` is set.
    """
    coords = np.arange(grid_size, dtype=np.float32)
    grid = np.stack(np.meshgrid(coords, coords), axis=0)  # here w goes first
    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Concatenate 1D embeddings of the two grid axes into a 2D embedding."""
    if embed_dim % 2 != 0:
        raise ValueError("embed_dim must be divisible by 2")

    # Half of the dimensions encode each axis.
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Embed positions ``pos`` (any shape, flattened to (M,)) as (M, embed_dim)."""
    if embed_dim % 2 != 0:
        raise ValueError("embed_dim must be divisible by 2")

    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    angles = np.einsum("m,d->md", pos.reshape(-1), omega)  # (M, D/2), outer product
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
119
+
120
+
121
class PatchEmbed(nn.Module):
    """2D Image to Patch Embedding with fixed sin-cos position embeddings."""

    def __init__(
        self,
        height=224,
        width=224,
        patch_size=16,
        in_channels=3,
        embed_dim=768,
        layer_norm=False,
        flatten=True,
        bias=True,
    ):
        super().__init__()

        num_patches = (height // patch_size) * (width // patch_size)
        self.flatten = flatten
        self.layer_norm = layer_norm

        # Non-overlapping conv acts as the per-patch linear projection.
        self.proj = nn.Conv2d(
            in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
        )
        self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6) if layer_norm else None

        # Fixed (non-learned) sin-cos table; assumes a square patch grid.
        pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
        self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False)

    def forward(self, latent):
        latent = self.proj(latent)
        if self.flatten:
            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
        if self.layer_norm:
            latent = self.norm(latent)
        return latent + self.pos_embed
159
+
160
+
161
class TimestepEmbedding(nn.Module):
    """Two-layer MLP mapping sinusoidal timestep features into the UNet's
    time-embedding space, with an optional additive conditioning projection."""

    def __init__(
        self,
        in_channels: int,
        time_embed_dim: int,
        act_fn: str = "silu",
        out_dim: int = None,
        post_act_fn: Optional[str] = None,
        cond_proj_dim=None,
    ):
        super().__init__()

        def make_act(name):
            # Shared activation factory for the mid and post activations.
            if name == "silu":
                return nn.SiLU()
            if name == "mish":
                return nn.Mish()
            if name == "gelu":
                return nn.GELU()
            raise ValueError(f"{name} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'")

        self.linear_1 = nn.Linear(in_channels, time_embed_dim)

        # Optional projection that lets an extra condition be added to the input.
        self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) if cond_proj_dim is not None else None

        self.act = make_act(act_fn)
        self.linear_2 = nn.Linear(time_embed_dim, out_dim if out_dim is not None else time_embed_dim)
        self.post_act = None if post_act_fn is None else make_act(post_act_fn)

    def forward(self, sample, condition=None):
        if condition is not None:
            sample = sample + self.cond_proj(condition)
        sample = self.linear_1(sample)

        if self.act is not None:
            sample = self.act(sample)

        sample = self.linear_2(sample)

        if self.post_act is not None:
            sample = self.post_act(sample)
        return sample
219
+
220
+
221
class Timesteps(nn.Module):
    """Module wrapper around get_timestep_embedding with stored settings."""

    def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float):
        super().__init__()
        self.num_channels = num_channels
        self.flip_sin_to_cos = flip_sin_to_cos
        self.downscale_freq_shift = downscale_freq_shift

    def forward(self, timesteps):
        # Delegate to the functional implementation with the stored config.
        return get_timestep_embedding(
            timesteps,
            self.num_channels,
            flip_sin_to_cos=self.flip_sin_to_cos,
            downscale_freq_shift=self.downscale_freq_shift,
        )
236
+
237
+
238
class GaussianFourierProjection(nn.Module):
    """Gaussian Fourier embeddings for noise levels."""

    def __init__(
        self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
    ):
        super().__init__()
        # Random, frozen projection frequencies.
        self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
        self.log = log
        self.flip_sin_to_cos = flip_sin_to_cos

        if set_W_to_weight:
            # to delete later
            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
            self.weight = self.W

    def forward(self, x):
        if self.log:
            x = torch.log(x)

        angles = x[:, None] * self.weight[None, :] * 2 * np.pi

        halves = [torch.sin(angles), torch.cos(angles)]
        if self.flip_sin_to_cos:
            halves.reverse()
        return torch.cat(halves, dim=-1)
266
+
267
+
268
class ImagePositionalEmbeddings(nn.Module):
    """
    Converts latent image classes into vector embeddings and adds learned 2-D
    positional embeddings over the latent height/width grid.

    For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092

    For VQ-diffusion the summed embeddings are used as transformer input.
    Note that these transformer embeddings are distinct from the VQVAE's own
    vector embeddings.

    Args:
        num_embed (`int`):
            Number of embeddings for the latent pixels embeddings.
        height (`int`):
            Height of the latent image i.e. the number of height embeddings.
        width (`int`):
            Width of the latent image i.e. the number of width embeddings.
        embed_dim (`int`):
            Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings.
    """

    def __init__(
        self,
        num_embed: int,
        height: int,
        width: int,
        embed_dim: int,
    ):
        super().__init__()

        self.height = height
        self.width = width
        self.num_embed = num_embed
        self.embed_dim = embed_dim

        self.emb = nn.Embedding(self.num_embed, embed_dim)
        self.height_emb = nn.Embedding(self.height, embed_dim)
        self.width_emb = nn.Embedding(self.width, embed_dim)

    def forward(self, index):
        """Embed latent-pixel classes ``index`` and add positional embeddings."""
        tokens = self.emb(index)

        rows = torch.arange(self.height, device=index.device).view(1, self.height)
        cols = torch.arange(self.width, device=index.device).view(1, self.width)

        # (1, H, 1, D) + (1, 1, W, D) broadcasts to (1, H, W, D),
        # then flatten the grid to a token sequence (1, H*W, D).
        grid = self.height_emb(rows).unsqueeze(2) + self.width_emb(cols).unsqueeze(1)
        grid = grid.view(1, self.height * self.width, -1)

        # Truncate positions to the actual sequence length of `index`.
        return tokens + grid[:, : tokens.shape[1], :]
class LabelEmbedding(nn.Module):
    """
    Embeds class labels into vector representations, with label dropout for
    classifier-free guidance: dropped labels map to an extra "null" row.

    Args:
        num_classes (`int`): The number of classes.
        hidden_size (`int`): The size of the vector embeddings.
        dropout_prob (`float`): The probability of dropping a label.
    """

    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # One extra embedding row is reserved for the "dropped" label
        # whenever CFG dropout is enabled.
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Replace dropped labels with the null-class index ``num_classes``.

        When ``force_drop_ids`` is given, positions equal to 1 are dropped
        deterministically; otherwise dropping is Bernoulli(``dropout_prob``).
        """
        if force_drop_ids is not None:
            drop_ids = torch.tensor(force_drop_ids == 1)
        else:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(drop_ids, self.num_classes, labels)

    def forward(self, labels, force_drop_ids=None):
        should_drop = self.dropout_prob > 0
        # Random dropout only applies in training; forced drops always apply.
        if force_drop_ids is not None or (self.training and should_drop):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
class CombinedTimestepLabelEmbeddings(nn.Module):
    """Sum of a timestep embedding and a class-label embedding, shape (N, D)."""

    def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1):
        super().__init__()

        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob)

    def forward(self, timestep, class_labels, hidden_dtype=None):
        # Sinusoidal projection is always fp32; cast to the model dtype
        # before running it through the embedding MLP.
        t_emb = self.timestep_embedder(self.time_proj(timestep).to(dtype=hidden_dtype))  # (N, D)
        cls_emb = self.class_embedder(class_labels)  # (N, D)
        return t_emb + cls_emb  # (N, D)
src/multiview_consist_edit/models/hack_poseguider.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.init as init
5
+ from einops import rearrange
6
+ import numpy as np
7
+
8
class Hack_PoseGuider(nn.Module):
    """CNN pose encoder producing a residual for the UNet's input latent.

    A ``(B, 3, H, W)`` pose image is downsampled 8x by three stride-2
    convolutions and projected to ``noise_latent_channels`` channels so it can
    be added directly onto the UNet's post-``conv_in`` latent. The final 1x1
    projection is zero-initialized, so an untrained guider contributes nothing.
    """

    def __init__(self, noise_latent_channels=320):
        super(Hack_PoseGuider, self).__init__()

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3, padding=1),
            nn.BatchNorm2d(3),
            nn.ReLU(),
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),

            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),

            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),

            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )

        # Final projection layer into the UNet latent channel count.
        self.final_proj = nn.Conv2d(in_channels=128, out_channels=noise_latent_channels, kernel_size=1)

        # Initialize layers
        self._initialize_weights()

        # Learnable global gain on the pose residual (starts at 2).
        self.scale = nn.Parameter(torch.ones(1) * 2)

    def _initialize_weights(self):
        # He initialization for the conv stack, zero biases. The final
        # projection is zeroed so training starts from a no-op guider.
        for m in self.conv_layers:
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels
                init.normal_(m.weight, mean=0.0, std=np.sqrt(2. / n))
                if m.bias is not None:
                    init.zeros_(m.bias)

        init.zeros_(self.final_proj.weight)
        if self.final_proj.bias is not None:
            init.zeros_(self.final_proj.bias)

    def forward(self, x):
        """Encode a pose image; returns (B, noise_latent_channels, H/8, W/8)."""
        x = self.conv_layers(x)
        x = self.final_proj(x)

        return x * self.scale

    @classmethod
    def from_pretrained(cls, pretrained_model_path, noise_latent_channels=320):
        """Build a guider and load weights from ``pretrained_model_path``.

        Args:
            pretrained_model_path: path to a ``torch.save``-d state dict.
            noise_latent_channels: latent channel count of the target UNet
                (kept at the historical default of 320 for compatibility).

        Raises:
            FileNotFoundError: if the checkpoint path does not exist. (The
                original code only printed a warning here and then crashed
                inside ``torch.load`` anyway.)
        """
        if not os.path.exists(pretrained_model_path):
            raise FileNotFoundError(f"There is no model file in {pretrained_model_path}")
        print(f"loaded PoseGuider's pretrained weights from {pretrained_model_path} ...")

        state_dict = torch.load(pretrained_model_path, map_location="cpu")
        model = cls(noise_latent_channels=noise_latent_channels)

        # strict=False tolerates checkpoints saved with extra/missing keys.
        m, u = model.load_state_dict(state_dict, strict=False)
        params = [p.numel() for n, p in model.named_parameters()]
        print(f"### PoseGuider's Parameters: {sum(params) / 1e6} M")

        return model
src/multiview_consist_edit/models/hack_unet2d.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.utils.checkpoint
7
+ # from diffusers import UNet2DConditionModel
8
+ from diffusers.models.unet_2d_condition import UNet2DConditionModel,UNet2DConditionOutput,logger
9
+
10
+
11
class Hack_UNet2DConditionModel(UNet2DConditionModel):
    # "Hacked" UNet: identical to diffusers' UNet2DConditionModel.forward
    # except for the extra required `latent_pose` argument, which is added to
    # the latent right after `conv_in` (see step 2 below).
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        latent_pose: torch.Tensor, # new add

        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
        mid_block_additional_residual: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNet2DConditionOutput, Tuple]:
        r"""
        The [`UNet2DConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
            latent_pose (`torch.Tensor`):
                Pose-guidance latent added element-wise to the sample right
                after `conv_in`; assumed to match the post-`conv_in` latent
                shape `(batch, block_out_channels[0], h, w)` — TODO confirm
                against the PoseGuider output.
            encoder_attention_mask (`torch.Tensor`):
                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
                which adds large negative values to the attention scores corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
            added_cond_kwargs: (`dict`, *optional*):
                A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that
                are passed along to the UNet blocks.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        """
        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
        # expects mask of shape:
        #   [batch, key_tokens]
        # adds singleton query_tokens dimension:
        #   [batch,                    1, key_tokens]
        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
        if attention_mask is not None:
            # assume that mask is expressed as:
            #   (1 = keep,      0 = discard)
            # convert mask into a bias that can be added to attention scores:
            #       (keep = +0,     discard = -10000.0)
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None:
            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)
        aug_emb = None

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

                # `Timesteps` does not contain any weights and will always return f32 tensors
                # there might be better ways to encapsulate this.
                class_labels = class_labels.to(dtype=sample.dtype)

            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)

            if self.config.class_embeddings_concat:
                emb = torch.cat([emb, class_emb], dim=-1)
            else:
                emb = emb + class_emb

        # Model-variant-specific auxiliary embeddings (SDXL, Kandinsky, ...);
        # each branch validates the `added_cond_kwargs` keys it needs.
        if self.config.addition_embed_type == "text":
            aug_emb = self.add_embedding(encoder_hidden_states)
        elif self.config.addition_embed_type == "text_image":
            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )

            image_embs = added_cond_kwargs.get("image_embeds")
            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
            aug_emb = self.add_embedding(text_embs, image_embs)
        elif self.config.addition_embed_type == "text_time":
            # SDXL - style
            if "text_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                )
            text_embeds = added_cond_kwargs.get("text_embeds")
            if "time_ids" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                )
            time_ids = added_cond_kwargs.get("time_ids")
            time_embeds = self.add_time_proj(time_ids.flatten())
            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))

            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
            add_embeds = add_embeds.to(emb.dtype)
            aug_emb = self.add_embedding(add_embeds)
        elif self.config.addition_embed_type == "image":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            aug_emb = self.add_embedding(image_embs)
        elif self.config.addition_embed_type == "image_hint":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
                )
            image_embs = added_cond_kwargs.get("image_embeds")
            hint = added_cond_kwargs.get("hint")
            aug_emb, hint = self.add_embedding(image_embs, hint)
            sample = torch.cat([sample, hint], dim=1)

        emb = emb + aug_emb if aug_emb is not None else emb

        if self.time_embed_act is not None:
            emb = self.time_embed_act(emb)

        # Optional projection of the cross-attention conditioning.
        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
            # Kadinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
                )

            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
            # Kandinsky 2.2 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
                )
            image_embeds = added_cond_kwargs.get("image_embeds")
            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
        # 2. pre-process
        sample = self.conv_in(sample)

        # add latent_pose
        # This is the only change versus the stock diffusers forward: the pose
        # latent is injected as a residual before the down blocks.
        sample = sample + latent_pose

        # 2.5 GLIGEN position net
        if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
            cross_attention_kwargs = cross_attention_kwargs.copy()
            gligen_args = cross_attention_kwargs.pop("gligen")
            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}

        # 3. down
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # ControlNet supplies both mid and down residuals; T2I-Adapter only down.
        is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                # For t2i-adapter CrossAttnDownBlock2D
                additional_residuals = {}
                if is_adapter and len(down_block_additional_residuals) > 0:
                    additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)

                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                    encoder_attention_mask=encoder_attention_mask,
                    **additional_residuals,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)

                if is_adapter and len(down_block_additional_residuals) > 0:
                    sample += down_block_additional_residuals.pop(0)

            down_block_res_samples += res_samples

        if is_controlnet:
            new_down_block_res_samples = ()

            for down_block_res_sample, down_block_additional_residual in zip(
                down_block_res_samples, down_block_additional_residuals
            ):
                down_block_res_sample = down_block_res_sample + down_block_additional_residual
                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
                encoder_attention_mask=encoder_attention_mask,
            )
            # To support T2I-Adapter-XL
            if (
                is_adapter
                and len(down_block_additional_residuals) > 0
                and sample.shape == down_block_additional_residuals[0].shape
            ):
                sample += down_block_additional_residuals.pop(0)

        if is_controlnet:
            sample = sample + mid_block_additional_residual

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    upsample_size=upsample_size,
                    scale=lora_scale,
                )

        # 6. post-process
        if self.conv_norm_out:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if not return_dict:
            return (sample,)

        return UNet2DConditionOutput(sample=sample)
src/multiview_consist_edit/models/mv_attn_processor.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers.utils import USE_PEFT_BACKEND
2
+ from typing import Callable, Optional
3
+ import torch
4
+ from diffusers.models.attention_processor import Attention
5
+ from diffusers.utils.import_utils import is_xformers_available
6
+ if is_xformers_available():
7
+ import xformers
8
+ import xformers.ops
9
+ else:
10
+ xformers = None
11
+
12
class MVXFormersAttnProcessor:
    r"""
    xFormers memory-efficient attention processor with per-view key
    re-weighting for multi-view consistent generation.

    ``weight_matrix`` is indexed as ``weight_matrix[b, i, :]`` where ``b`` is
    the batch index and ``i`` the query view, giving one scalar weight per key
    view (presumably derived from camera relationships — verify against the
    caller that sets it).

    Args:
        attention_op (`Callable`, *optional*, defaults to `None`):
            The base
            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
            operator.
    """

    def __init__(self, weight_matrix=None, attention_op: Optional[Callable] = None):
        # BUG FIX: the original used `if weight_matrix:`, which raises
        # "Boolean value of Tensor with more than one element is ambiguous"
        # for any real (batch, views) tensor. Compare against None instead.
        if weight_matrix is not None:
            self.bs = weight_matrix.shape[0]
            self.frame_length = weight_matrix.shape[1]
            self.weight_matrix = weight_matrix
        self.attention_op = attention_op

    def update_weight_matrix(self, weight_matrix):
        """Install a new weight matrix (e.g. for a new batch of viewpoints)."""
        self.bs = weight_matrix.shape[0]
        self.frame_length = weight_matrix.shape[1]
        self.weight_matrix = weight_matrix

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        temb: Optional[torch.Tensor] = None,
        garment_fea_attn = True,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        """Run weighted multi-view attention.

        All views of a sample are concatenated along the token dimension;
        when ``garment_fea_attn`` is True the sequence additionally carries
        two garment-feature frames whose attention weight is fixed to 1.
        """

        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        # Flatten (B, C, H, W) to (B, H*W, C) for attention.
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, key_tokens, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
        if attention_mask is not None:
            # expand our mask's singleton query_tokens dimension:
            #   [batch*heads,            1, key_tokens] ->
            #   [batch*heads, query_tokens, key_tokens]
            # so that it can be added as a bias onto the attention scores that xformers computes:
            #   [batch*heads, query_tokens, key_tokens]
            # we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
            _, query_tokens, _ = hidden_states.shape
            attention_mask = attention_mask.expand(-1, query_tokens, -1)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query).contiguous()
        key = attn.head_to_batch_dim(key).contiguous()
        value = attn.head_to_batch_dim(value).contiguous()

        attn_out = torch.empty_like(query)

        if garment_fea_attn:
            frame_length = self.frame_length + 2  # 2 for two garments
        else:
            frame_length = self.frame_length
        token_num_per_frame = query.shape[1] // frame_length
        heads_num = attn.heads
        # Attention is computed per (batch, query-view) pair: the query slice
        # for view `i` attends to ALL keys, with each key view scaled by
        # weight_matrix[b, i, :] (garment frames get weight 1).
        for b in range(self.bs):
            for i in range(self.frame_length):
                curr_q = query[heads_num*b:heads_num*(b+1), token_num_per_frame*i:token_num_per_frame*(i+1), :]
                weight = self.weight_matrix[b, i, :]
                if garment_fea_attn:
                    weight = torch.cat([weight, torch.tensor([1, 1], dtype=weight.dtype, device=weight.device)], dim=0)  # garment's attn weight set 1
                weight = weight.repeat_interleave(token_num_per_frame)
                curr_k = key[heads_num*b:heads_num*(b+1)]
                curr_v = value[heads_num*b:heads_num*(b+1)]
                weight = weight.unsqueeze(0).unsqueeze(-1)
                # Scaling K is equivalent to biasing attention logits per view.
                curr_k = weight * curr_k
                hidden_states = xformers.ops.memory_efficient_attention(
                    curr_q, curr_k, curr_v, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
                )
                attn_out[heads_num*b:heads_num*(b+1), token_num_per_frame*i:token_num_per_frame*(i+1), :] = hidden_states
        hidden_states = attn_out
        hidden_states = hidden_states.to(query.dtype)
        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
src/multiview_consist_edit/models/resnet.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # *************************************************************************
2
+ # This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
3
+ # difications”). All Bytedance Inc.'s Modifications are Copyright (2023) B-
4
+ # ytedance Inc..
5
+ # *************************************************************************
6
+
7
+ # Adapted from https://github.com/guoyww/AnimateDiff
8
+
9
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
10
+ # `TemporalConvLayer` Copyright 2023 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved.
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+
27
+ from einops import rearrange
28
+
29
+
30
class InflatedConv3d(nn.Conv2d):
    """A plain 2D convolution applied frame-wise to 5-D video tensors.

    Input/output layout is (batch, channel, frame, height, width); the frame
    axis is folded into the batch axis so the inherited Conv2d runs on every
    frame independently, then the original layout is restored.
    """

    def forward(self, x):
        batch, channels, frames, height, width = x.shape

        # (b, c, f, h, w) -> (b*f, c, h, w): frames become extra batch items.
        frames_as_batch = x.permute(0, 2, 1, 3, 4).reshape(
            batch * frames, channels, height, width
        )
        out = super().forward(frames_as_batch)

        # (b*f, c', h', w') -> (b, c', f, h', w'): restore the video layout.
        out_channels, out_h, out_w = out.shape[1:]
        return out.reshape(batch, frames, out_channels, out_h, out_w).permute(0, 2, 1, 3, 4)
39
+
40
+
41
class Upsample3D(nn.Module):
    """Nearest-neighbour 2x spatial upsampling for 5-D video tensors.

    Input layout is (batch, channel, frame, height, width). Only height and
    width are upsampled (scale factors [1, 2, 2]); the frame axis is left
    untouched. An optional frame-wise ``InflatedConv3d`` refines the result.

    Fix over the original: removed the dead local ``conv = None`` left over
    from the 2D diffusers implementation (it was assigned and never used).
    """

    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_conv_transpose = use_conv_transpose
        self.name = name

        if use_conv_transpose:
            # Transposed-conv upsampling from the 2D version is not ported.
            raise NotImplementedError
        elif use_conv:
            # NOTE(review): if both flags are False, self.conv is never set and
            # forward() will raise AttributeError — presumably callers always
            # pass use_conv=True; confirm against the up-block construction.
            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1)

    def forward(self, hidden_states, output_size=None):
        """Upsample ``hidden_states``; ``output_size`` overrides the 2x factor."""
        assert hidden_states.shape[1] == self.channels

        if self.use_conv_transpose:
            raise NotImplementedError

        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
        dtype = hidden_states.dtype
        if dtype == torch.bfloat16:
            hidden_states = hidden_states.to(torch.float32)

        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
        if hidden_states.shape[0] >= 64:
            hidden_states = hidden_states.contiguous()

        # if `output_size` is passed we force the interpolation output
        # size and do not make use of `scale_factor=2`
        if output_size is None:
            hidden_states = F.interpolate(hidden_states, scale_factor=[1.0, 2.0, 2.0], mode="nearest")
        else:
            hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest")

        # If the input is bfloat16, we cast back to bfloat16
        if dtype == torch.bfloat16:
            hidden_states = hidden_states.to(dtype)

        hidden_states = self.conv(hidden_states)

        return hidden_states
85
+
86
+
87
class Downsample3D(nn.Module):
    """Strided-conv 2x spatial downsampling for 5-D video tensors.

    Input layout is (batch, channel, frame, height, width). A stride-2
    frame-wise ``InflatedConv3d`` halves height and width; the frame axis is
    left untouched.

    Fix over the original: removed a duplicated channel assert in ``forward``.
    """

    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.padding = padding
        stride = 2
        self.name = name

        if use_conv:
            self.conv = InflatedConv3d(self.channels, self.out_channels, 3, stride=stride, padding=padding)
        else:
            # Average-pool downsampling from the 2D version is not ported.
            raise NotImplementedError

    def forward(self, hidden_states):
        assert hidden_states.shape[1] == self.channels
        if self.use_conv and self.padding == 0:
            # Asymmetric zero-padding used by the 2D version is not ported.
            raise NotImplementedError

        hidden_states = self.conv(hidden_states)

        return hidden_states
111
+
112
+
113
class ResnetBlock3D(nn.Module):
    """Pre-norm ResNet block inflated to video tensors (b, c, f, h, w).

    GroupNorm -> act -> frame-wise conv, twice, with a timestep-embedding
    injection between the two convs and a residual connection (optionally
    through a 1x1 shortcut conv when channel counts differ).

    Fixes over the original:
    - removed the dead store ``self.pre_norm = pre_norm`` (it was immediately
      overwritten with ``True``; the argument is kept for config compatibility
      but the block always pre-normalizes),
    - unknown ``non_linearity`` now raises ``ValueError`` immediately instead
      of leaving ``self.nonlinearity`` unset and failing later in forward().
    """

    def __init__(
        self,
        *,
        in_channels,
        out_channels=None,
        conv_shortcut=False,
        dropout=0.0,
        temb_channels=512,
        groups=32,
        groups_out=None,
        pre_norm=True,
        eps=1e-6,
        non_linearity="swish",
        time_embedding_norm="default",
        output_scale_factor=1.0,
        use_in_shortcut=None,
    ):
        super().__init__()
        # Always pre-norm (the `pre_norm` argument is accepted but ignored,
        # matching the original behavior).
        self.pre_norm = True
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut
        self.time_embedding_norm = time_embedding_norm
        self.output_scale_factor = output_scale_factor

        if groups_out is None:
            groups_out = groups

        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)

        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)

        if temb_channels is not None:
            if self.time_embedding_norm == "default":
                # Additive timestep embedding.
                time_emb_proj_out_channels = out_channels
            elif self.time_embedding_norm == "scale_shift":
                # FiLM-style: embedding is split into (scale, shift) halves.
                time_emb_proj_out_channels = out_channels * 2
            else:
                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")

            self.time_emb_proj = torch.nn.Linear(temb_channels, time_emb_proj_out_channels)
        else:
            self.time_emb_proj = None

        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

        if non_linearity == "swish":
            self.nonlinearity = lambda x: F.silu(x)
        elif non_linearity == "mish":
            self.nonlinearity = Mish()
        elif non_linearity == "silu":
            self.nonlinearity = nn.SiLU()
        else:
            # Fail fast instead of deferring to an AttributeError in forward().
            raise ValueError(f"unknown non_linearity : {non_linearity}")

        self.use_in_shortcut = self.in_channels != self.out_channels if use_in_shortcut is None else use_in_shortcut

        self.conv_shortcut = None
        if self.use_in_shortcut:
            # 1x1 conv matches the residual's channel count to the output.
            self.conv_shortcut = InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, input_tensor, temb):
        """Run the block.

        Args:
            input_tensor: (batch, in_channels, frames, height, width) features.
            temb: (batch, temb_channels) timestep embedding, or None.

        Returns:
            (batch, out_channels, frames, height, width) tensor, scaled by
            1 / output_scale_factor.
        """
        hidden_states = input_tensor

        hidden_states = self.norm1(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)

        hidden_states = self.conv1(hidden_states)

        if temb is not None:
            # Project to channel dim and broadcast over (frame, height, width).
            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]

        if temb is not None and self.time_embedding_norm == "default":
            hidden_states = hidden_states + temb

        hidden_states = self.norm2(hidden_states)

        if temb is not None and self.time_embedding_norm == "scale_shift":
            scale, shift = torch.chunk(temb, 2, dim=1)
            hidden_states = hidden_states * (1 + scale) + shift

        hidden_states = self.nonlinearity(hidden_states)

        hidden_states = self.dropout(hidden_states)
        hidden_states = self.conv2(hidden_states)

        if self.conv_shortcut is not None:
            input_tensor = self.conv_shortcut(input_tensor)

        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor

        return output_tensor
208
+
209
+
210
class Mish(torch.nn.Module):
    """Mish activation: x * tanh(softplus(x))."""

    def forward(self, hidden_states):
        softplus = torch.nn.functional.softplus(hidden_states)
        return hidden_states * softplus.tanh()
src/multiview_consist_edit/models/unet.py ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # *************************************************************************
2
+ # This file may have been modified by Bytedance Inc. (“Bytedance Inc.'s Mo-
3
+ # difications”). All Bytedance Inc.'s Modifications are Copyright (2023) B-
4
+ # ytedance Inc..
5
+ # *************************************************************************
6
+
7
+ # Adapted from https://github.com/guoyww/AnimateDiff
8
+
9
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ from dataclasses import dataclass
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import os
26
+ import json
27
+ import pdb
28
+
29
+ import torch
30
+ import torch.nn as nn
31
+ import torch.utils.checkpoint
32
+
33
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
34
+ from diffusers.models.modeling_utils import ModelMixin
35
+ from diffusers.utils import BaseOutput, logging
36
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
37
+ from .unet_3d_blocks import (
38
+ CrossAttnDownBlock3D,
39
+ CrossAttnUpBlock3D,
40
+ DownBlock3D,
41
+ UNetMidBlock3DCrossAttn,
42
+ UpBlock3D,
43
+ get_down_block,
44
+ get_up_block,
45
+ )
46
+ from .resnet import InflatedConv3d
47
+
48
+
49
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
50
+
51
+
52
@dataclass
class UNet3DConditionOutput(BaseOutput):
    """Return container for UNet3DConditionModel.forward."""

    # Predicted sample; 5-D video tensor (batch, channel, frame, height, width).
    sample: torch.FloatTensor
55
+
56
+
57
class UNet3DConditionModel(ModelMixin, ConfigMixin):
    """A 2D Stable-Diffusion UNet inflated along a frame axis for video,
    with optional AnimateDiff-style motion modules.

    Samples are 5-D tensors (batch, channel, frame, height, width); every
    spatial conv is an ``InflatedConv3d`` that runs frame-wise.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: Optional[int] = None,
        in_channels: int = 4,
        out_channels: int = 4,
        center_input_sample: bool = False,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "DownBlock3D",
        ),
        mid_block_type: str = "UNetMidBlock3DCrossAttn",
        up_block_types: Tuple[str] = (
            "UpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D"
        ),
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        layers_per_block: int = 2,
        downsample_padding: int = 1,
        mid_block_scale_factor: float = 1,
        act_fn: str = "silu",
        norm_num_groups: int = 32,
        norm_eps: float = 1e-5,
        cross_attention_dim: int = 1280,
        attention_head_dim: Union[int, Tuple[int]] = 8,
        dual_cross_attention: bool = False,
        use_linear_projection: bool = False,
        class_embed_type: Optional[str] = None,
        num_class_embeds: Optional[int] = None,
        upcast_attention: bool = False,
        resnet_time_scale_shift: str = "default",

        # Additional — AnimateDiff-style motion-module options.
        # NOTE(review): `motion_module_kwargs={}` is a shared mutable default;
        # safe only as long as no block mutates it.
        use_motion_module = False,
        motion_module_resolutions = ( 1,2,4,8 ),
        motion_module_mid_block = False,
        motion_module_decoder_only = False,
        motion_module_type = None,
        motion_module_kwargs = {},
        unet_use_cross_frame_attention = None,
        unet_use_temporal_attention = None,
        encoder_hid_dim: Optional[int] = None,
    ):
        super().__init__()

        self.sample_size = sample_size
        time_embed_dim = block_out_channels[0] * 4

        # input
        self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))

        # time: sinusoidal projection followed by a small MLP.
        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
        timestep_input_dim = block_out_channels[0]

        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)

        # Optional projection of encoder hidden states into the cross-attn dim.
        if encoder_hid_dim is not None:
            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
        else:
            self.encoder_hid_proj = None

        # class embedding
        if class_embed_type is None and num_class_embeds is not None:
            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
        elif class_embed_type == "timestep":
            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
        elif class_embed_type == "identity":
            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
        else:
            self.class_embedding = None

        self.down_blocks = nn.ModuleList([])
        self.mid_block = None
        self.up_blocks = nn.ModuleList([])

        # Broadcast scalar settings to one value per down block.
        if isinstance(only_cross_attention, bool):
            only_cross_attention = [only_cross_attention] * len(down_block_types)

        if isinstance(attention_head_dim, int):
            attention_head_dim = (attention_head_dim,) * len(down_block_types)

        # down
        output_channel = block_out_channels[0]
        for i, down_block_type in enumerate(down_block_types):
            # Downsampling resolution factor at this depth (1, 2, 4, 8); used
            # to decide whether a motion module is inserted here.
            res = 2 ** i
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final_block = i == len(block_out_channels) - 1

            down_block = get_down_block(
                down_block_type,
                num_layers=layers_per_block,
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=time_embed_dim,
                add_downsample=not is_final_block,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=attention_head_dim[i],
                downsample_padding=downsample_padding,
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
                upcast_attention=upcast_attention,
                resnet_time_scale_shift=resnet_time_scale_shift,

                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                unet_use_temporal_attention=unet_use_temporal_attention,

                # Encoder motion modules are skipped when decoder_only is set.
                use_motion_module=use_motion_module and (res in motion_module_resolutions) and (not motion_module_decoder_only),
                motion_module_type=motion_module_type,
                motion_module_kwargs=motion_module_kwargs,
            )
            self.down_blocks.append(down_block)

        # mid
        if mid_block_type == "UNetMidBlock3DCrossAttn":
            self.mid_block = UNetMidBlock3DCrossAttn(
                in_channels=block_out_channels[-1],
                temb_channels=time_embed_dim,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                output_scale_factor=mid_block_scale_factor,
                resnet_time_scale_shift=resnet_time_scale_shift,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=attention_head_dim[-1],
                resnet_groups=norm_num_groups,
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                upcast_attention=upcast_attention,

                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                unet_use_temporal_attention=unet_use_temporal_attention,

                use_motion_module=use_motion_module and motion_module_mid_block,
                motion_module_type=motion_module_type,
                motion_module_kwargs=motion_module_kwargs,
            )
        else:
            raise ValueError(f"unknown mid_block_type : {mid_block_type}")

        # count how many layers upsample the videos
        self.num_upsamplers = 0

        # up
        reversed_block_out_channels = list(reversed(block_out_channels))
        reversed_attention_head_dim = list(reversed(attention_head_dim))
        only_cross_attention = list(reversed(only_cross_attention))
        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
            # Mirror of the encoder's resolution factor (8, 4, 2, 1).
            res = 2 ** (3 - i)
            is_final_block = i == len(block_out_channels) - 1

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]

            # add upsample block for all BUT final layer
            if not is_final_block:
                add_upsample = True
                self.num_upsamplers += 1
            else:
                add_upsample = False

            up_block = get_up_block(
                up_block_type,
                num_layers=layers_per_block + 1,
                in_channels=input_channel,
                out_channels=output_channel,
                prev_output_channel=prev_output_channel,
                temb_channels=time_embed_dim,
                add_upsample=add_upsample,
                resnet_eps=norm_eps,
                resnet_act_fn=act_fn,
                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim,
                attn_num_head_channels=reversed_attention_head_dim[i],
                dual_cross_attention=dual_cross_attention,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention[i],
                upcast_attention=upcast_attention,
                resnet_time_scale_shift=resnet_time_scale_shift,

                unet_use_cross_frame_attention=unet_use_cross_frame_attention,
                unet_use_temporal_attention=unet_use_temporal_attention,

                use_motion_module=use_motion_module and (res in motion_module_resolutions),
                motion_module_type=motion_module_type,
                motion_module_kwargs=motion_module_kwargs,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
        self.conv_act = nn.SiLU()
        self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)

    def set_attention_slice(self, slice_size):
        r"""
        Enable sliced attention computation.

        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
        in several steps. This is useful to save some memory in exchange for a small speed decrease.

        Args:
            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
                must be a multiple of `slice_size`.
        """
        sliceable_head_dims = []

        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
            # Collect head dims from every submodule that supports slicing.
            if hasattr(module, "set_attention_slice"):
                sliceable_head_dims.append(module.sliceable_head_dim)

            for child in module.children():
                fn_recursive_retrieve_slicable_dims(child)

        # retrieve number of attention layers
        for module in self.children():
            fn_recursive_retrieve_slicable_dims(module)

        num_slicable_layers = len(sliceable_head_dims)

        if slice_size == "auto":
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = [dim // 2 for dim in sliceable_head_dims]
        elif slice_size == "max":
            # make smallest slice possible
            slice_size = num_slicable_layers * [1]

        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size

        if len(slice_size) != len(sliceable_head_dims):
            raise ValueError(
                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
            )

        for i in range(len(slice_size)):
            size = slice_size[i]
            dim = sliceable_head_dims[i]
            if size is not None and size > dim:
                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

        # Recursively walk through all the children.
        # Any children which exposes the set_attention_slice method
        # gets the message
        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
            if hasattr(module, "set_attention_slice"):
                module.set_attention_slice(slice_size.pop())

            for child in module.children():
                fn_recursive_set_attention_slice(child, slice_size)

        # pop() consumes from the end, so reverse to match collection order.
        reversed_slice_size = list(reversed(slice_size))
        for module in self.children():
            fn_recursive_set_attention_slice(module, reversed_slice_size)

    def _set_gradient_checkpointing(self, module, value=False):
        # Toggle gradient checkpointing on the 3D down/up blocks only.
        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
            module.gradient_checkpointing = value

    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNet3DConditionOutput, Tuple]:
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.

        Returns:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
            returning a tuple, the first element is the sample tensor.
        """
        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # prepare attention_mask: convert {0, 1} mask to additive bias.
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)
        emb = self.time_embedding(t_emb)

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
            emb = emb + class_emb

        if self.encoder_hid_proj is not None:
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)

        # pre-process
        sample = self.conv_in(sample)

        # down: collect residuals for the skip connections.
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, encoder_hidden_states=encoder_hidden_states)

            down_block_res_samples += res_samples

        # mid
        sample = self.mid_block(
            sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
        )

        # up
        for i, upsample_block in enumerate(self.up_blocks):
            is_final_block = i == len(self.up_blocks) - 1

            # Pop this block's skip residuals off the stack.
            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size, encoder_hidden_states=encoder_hidden_states,
                )

        # post-process
        sample = self.conv_norm_out(sample)
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

        if not return_dict:
            return (sample,)

        return UNet3DConditionOutput(sample=sample)

    @classmethod
    def from_pretrained_2d(cls, pretrained_model_path, subfolder=None, unet_additional_kwargs=None):
        """Build a 3D UNet from a pretrained 2D UNet checkpoint directory.

        Reads ``config.json``, rewrites the block types to their 3D variants,
        then loads the safetensors weights non-strictly (newly added temporal /
        motion parameters stay at their initial values).
        """
        if subfolder is not None:
            pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
        print(f"loaded temporal unet's pretrained weights from {pretrained_model_path} ...")

        config_file = os.path.join(pretrained_model_path, 'config.json')
        if not os.path.isfile(config_file):
            raise RuntimeError(f"{config_file} does not exist")
        with open(config_file, "r") as f:
            config = json.load(f)
        config["_class_name"] = cls.__name__
        config["down_block_types"] = [
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "CrossAttnDownBlock3D",
            "DownBlock3D"
        ]
        config["up_block_types"] = [
            "UpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D",
            "CrossAttnUpBlock3D"
        ]
        config["mid_block_type"] = "UNetMidBlock3DCrossAttn"
        from diffusers.utils import WEIGHTS_NAME
        # Used to load checkpoints saved by `accelerate` (safetensors format);
        # the imported WEIGHTS_NAME is deliberately overridden below.
        import safetensors
        WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
        model = cls.from_config(config, **unet_additional_kwargs)
        model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
        if not os.path.isfile(model_file):
            raise RuntimeError(f"{model_file} does not exist")
        # state_dict = torch.load(model_file, map_location="cpu")
        state_dict = safetensors.torch.load_file(
            model_file, device="cpu"
        )

        # Non-strict load: report counts of missing/unexpected keys only.
        m, u = model.load_state_dict(state_dict, strict=False)
        print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};")
        # print(f"### missing keys:\n{m}\n### unexpected keys:\n{u}\n")

        params = [p.numel() if "temporal" in n else 0 for n, p in model.named_parameters()]
        print(f"### Temporal Module Parameters: {sum(params) / 1e6} M")

        return model
src/multiview_consist_edit/parse_tool/postprocess_parse.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Batch-run the human-parsing model over every generated try-on image under a
# root directory, writing a "parse_<name>.png" segmentation map next to each
# image. Images whose names contain 'cond' or 'parse' are skipped.
import sys
# sys.path.append('./')
from PIL import Image
from preprocess.humanparsing.run_parsing import Parsing
from preprocess.openpose.run_openpose import OpenPose
import os
import torch
from torchvision import transforms
from torchvision.transforms.functional import to_pil_image
import argparse

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='script')

    # Add arguments: positional root directory containing per-garment subdirs.
    parser.add_argument('root', type=str)

    # Parse arguments.
    args = parser.parse_args()

    # root = '/GPUFS/sysu_gbli2_1/hzj/animate/output/image_output_tryon_1025_22000_test_multi_3_all2_mvg_back/'
    root = args.root
    # Parsing(0) — presumably 0 selects the GPU index; verify against
    # preprocess.humanparsing.run_parsing.
    parsing_model = Parsing(0)
    cloth_ids = os.listdir(root)

    for cloth_subroot in cloth_ids[:]:
        print(cloth_subroot)
        images = os.listdir(os.path.join(root, cloth_subroot))

        for image in images:
            # Skip conditioning images and already-produced parse maps.
            if 'cond' in image or 'parse' in image:
                continue
            human_img_path = os.path.join(root, cloth_subroot, image)
            human_img = Image.open(human_img_path)
            # The parser expects 384x512 input; upscale its output back to the
            # generated-image resolution (576x768).
            model_parse, _ = parsing_model(human_img.resize((384,512)))
            model_parse = model_parse.resize((576,768))
            model_parse_path = os.path.join(root, cloth_subroot, 'parse_'+image.replace('jpg','png'))
            # print(model_parse_path)
            model_parse.save(model_parse_path)
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/__init__.py ADDED
File without changes
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/datasets.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : datasets.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import numpy as np
16
+ import random
17
+ import torch
18
+ import cv2
19
+ from torch.utils import data
20
+ from utils.transforms import get_affine_transform
21
+
22
+
23
class LIPDataSet(data.Dataset):
    """LIP human-parsing dataset (train / val / trainval / test splits).

    Reads image / segmentation pairs listed in ``<dataset>_id.txt`` under
    ``root`` and returns an affine-cropped image (plus the parsing label and
    augmentation metadata for non-test splits).
    """

    def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
                 rotation_factor=30, ignore_label=255, transform=None):
        """
        root: dataset root directory.
        dataset: split name; also the prefix of the id-list / image / segmentation folders.
        crop_size: output (height, width) of the affine crop.
        scale_factor / rotation_factor: augmentation ranges (train/trainval only).
        ignore_label: label value treated as ignore (kept for reference).
        transform: optional callable applied to the cropped image.
        """
        self.root = root
        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
        self.crop_size = np.asarray(crop_size)
        self.ignore_label = ignore_label
        self.scale_factor = scale_factor
        self.rotation_factor = rotation_factor
        self.flip_prob = 0.5
        self.transform = transform
        self.dataset = dataset

        list_path = os.path.join(self.root, self.dataset + '_id.txt')
        train_list = [i_id.strip() for i_id in open(list_path)]

        self.train_list = train_list
        self.number_samples = len(self.train_list)

    def __len__(self):
        return self.number_samples

    def _box2cs(self, box):
        """Convert an (x, y, w, h) box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Return the box center and its extent expanded to the crop aspect ratio."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
        return center, scale

    def __getitem__(self, index):
        train_item = self.train_list[index]

        im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
        parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')

        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
        h, w, _ = im.shape
        # Bug fix: np.long was removed in NumPy 1.24; np.int64 is the portable
        # equivalent. (This placeholder is only ever used before the 'test'
        # split returns early, so the dtype change cannot affect outputs.)
        parsing_anno = np.zeros((h, w), dtype=np.int64)

        # Get person center and scale for the whole frame.
        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
        r = 0

        if self.dataset != 'test':
            # Get parsing annotation
            parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
            if self.dataset == 'train' or self.dataset == 'trainval':
                # Random scale / rotation augmentation.
                sf = self.scale_factor
                rf = self.rotation_factor
                s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
                r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0

                if random.random() <= self.flip_prob:
                    # Horizontal flip; swap paired left/right body-part labels.
                    im = im[:, ::-1, :]
                    parsing_anno = parsing_anno[:, ::-1]
                    person_center[0] = im.shape[1] - person_center[0] - 1
                    right_idx = [15, 17, 19]
                    left_idx = [14, 16, 18]
                    for i in range(0, 3):
                        right_pos = np.where(parsing_anno == right_idx[i])
                        left_pos = np.where(parsing_anno == left_idx[i])
                        parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
                        parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]

        trans = get_affine_transform(person_center, s, r, self.crop_size)
        input = cv2.warpAffine(
            im,
            trans,
            (int(self.crop_size[1]), int(self.crop_size[0])),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(0, 0, 0))

        if self.transform:
            input = self.transform(input)

        meta = {
            'name': train_item,
            'center': person_center,
            'height': h,
            'width': w,
            'scale': s,
            'rotation': r
        }

        if self.dataset == 'val' or self.dataset == 'test':
            return input, meta
        else:
            # Warp the label map with nearest-neighbour sampling; out-of-image
            # regions are filled with 255 (the ignore label).
            label_parsing = cv2.warpAffine(
                parsing_anno,
                trans,
                (int(self.crop_size[1]), int(self.crop_size[0])),
                flags=cv2.INTER_NEAREST,
                borderMode=cv2.BORDER_CONSTANT,
                borderValue=(255))

            label_parsing = torch.from_numpy(label_parsing)

            return input, label_parsing, meta
130
+
131
+
132
class LIPDataValSet(data.Dataset):
    """LIP human-parsing evaluation dataset.

    Returns each image affine-cropped to ``crop_size``; when ``flip`` is True
    the item is a stacked (original, horizontally-flipped) pair so the caller
    can average flipped predictions.
    """

    def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
        """
        root: dataset root directory.
        dataset: split name (prefix of the id-list / image folders).
        crop_size: output (height, width) of the affine crop.
        transform: callable applied to the cropped image (must be set before use).
        flip: if True, also return the horizontally flipped crop.
        """
        # (The original assigned self.root and crop_size twice; kept once.)
        self.root = root
        self.crop_size = np.asarray(crop_size)
        self.transform = transform
        self.flip = flip
        self.dataset = dataset
        self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]

        list_path = os.path.join(self.root, self.dataset + '_id.txt')
        val_list = [i_id.strip() for i_id in open(list_path)]

        self.val_list = val_list
        self.number_samples = len(self.val_list)

    def __len__(self):
        return len(self.val_list)

    def _box2cs(self, box):
        """Convert an (x, y, w, h) box to a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Return the box center and its extent expanded to the crop aspect ratio."""
        center = np.zeros((2), dtype=np.float32)
        center[0] = x + w * 0.5
        center[1] = y + h * 0.5
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)

        return center, scale

    def __getitem__(self, index):
        val_item = self.val_list[index]
        # Load the image and crop the whole frame at the target aspect ratio.
        im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
        im = cv2.imread(im_path, cv2.IMREAD_COLOR)
        h, w, _ = im.shape
        person_center, s = self._box2cs([0, 0, w - 1, h - 1])
        r = 0
        trans = get_affine_transform(person_center, s, r, self.crop_size)
        input = cv2.warpAffine(
            im,
            trans,
            (int(self.crop_size[1]), int(self.crop_size[0])),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(0, 0, 0))
        input = self.transform(input)
        if self.flip:
            # Only build the flipped copy when it is actually requested
            # (the original computed it unconditionally).
            batch_input_im = torch.stack([input, input.flip(dims=[-1])])
        else:
            batch_input_im = input

        meta = {
            'name': val_item,
            'center': person_center,
            'height': h,
            'width': w,
            'scale': s,
            'rotation': r
        }

        return batch_input_im, meta
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/simple_extractor_dataset.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : dataset.py
8
+ @Time : 8/30/19 9:12 PM
9
+ @Desc : Dataset Definition
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import pdb
16
+
17
+ import cv2
18
+ import numpy as np
19
+ from PIL import Image
20
+ from torch.utils import data
21
+ from utils.transforms import get_affine_transform
22
+
23
+
24
class SimpleFolderDataset(data.Dataset):
    """Dataset over a folder of images, a single image file, or one in-memory
    PIL image, each affine-cropped to ``input_size`` for the parsing network."""

    def __init__(self, root, input_size=[512, 512], transform=None):
        self.root = root
        self.transform = transform
        self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
        self.input_size = np.asarray(input_size)
        self.is_pil_image = False
        if isinstance(root, Image.Image):
            # A single PIL image held in memory.
            self.is_pil_image = True
            self.file_list = [root]
        elif os.path.isfile(root):
            # A single image file on disk.
            self.root = os.path.dirname(root)
            self.file_list = [os.path.basename(root)]
        else:
            # A directory of images.
            self.file_list = os.listdir(self.root)

    def __len__(self):
        return len(self.file_list)

    def _box2cs(self, box):
        """Convert an (x, y, w, h) box into a (center, scale) pair."""
        x, y, w, h = box[:4]
        return self._xywh2cs(x, y, w, h)

    def _xywh2cs(self, x, y, w, h):
        """Center of the box plus its extent padded to the dataset aspect ratio."""
        center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
        if w > self.aspect_ratio * h:
            h = w * 1.0 / self.aspect_ratio
        elif w < self.aspect_ratio * h:
            w = h * self.aspect_ratio
        return center, np.array([w, h], dtype=np.float32)

    def __getitem__(self, index):
        entry = self.file_list[index]
        if self.is_pil_image:
            # PIL gives RGB; reorder channels to BGR to match cv2.imread.
            img = np.asarray(entry)[:, :, [2, 1, 0]]
        else:
            img = cv2.imread(os.path.join(self.root, entry), cv2.IMREAD_COLOR)
        h, w, _ = img.shape

        # Crop the full frame at the target aspect ratio.
        person_center, box_scale = self._box2cs([0, 0, w - 1, h - 1])
        rotation = 0
        trans = get_affine_transform(person_center, box_scale, rotation, self.input_size)
        warped = cv2.warpAffine(
            img,
            trans,
            (int(self.input_size[1]), int(self.input_size[0])),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=(0, 0, 0))

        warped = self.transform(warped)
        meta = {
            'center': person_center,
            'height': h,
            'width': w,
            'scale': box_scale,
            'rotation': rotation
        }

        return warped, meta
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/datasets/target_generation.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+
5
def generate_edge_tensor(label, edge_width=3):
    """Compute a binary edge map from a label map (CUDA only).

    A pixel is an edge when it differs from a vertical / horizontal / diagonal
    neighbour and neither pixel carries the ignore label 255. The edge map is
    then dilated with an ``edge_width`` x ``edge_width`` box kernel.

    label: (h, w) or (n, h, w) integer label map; cast to float32 on CUDA.
    Returns a float CUDA tensor with values in {0, 1}, squeezed of size-1 dims.
    """
    label = label.type(torch.cuda.FloatTensor)
    if len(label.shape) == 2:
        label = label.unsqueeze(0)
    n, h, w = label.shape
    edge = torch.zeros(label.shape, dtype=torch.float).cuda()
    # Vertical neighbours (writes through a view into `edge`).
    edge_right = edge[:, 1:h, :]
    edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
               & (label[:, :h - 1, :] != 255)] = 1

    # Horizontal neighbours.
    edge_up = edge[:, :, :w - 1]
    edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
            & (label[:, :, :w - 1] != 255)
            & (label[:, :, 1:w] != 255)] = 1

    # Diagonal (up-right).
    edge_upright = edge[:, :h - 1, :w - 1]
    edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
                 & (label[:, :h - 1, :w - 1] != 255)
                 & (label[:, 1:h, 1:w] != 255)] = 1

    # Diagonal (bottom-right).
    edge_bottomright = edge[:, :h - 1, 1:w]
    edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
                     & (label[:, :h - 1, 1:w] != 255)
                     & (label[:, 1:h, :w - 1] != 255)] = 1

    # Dilate with a box kernel. Bug fix: the original hard-coded padding=1,
    # which only preserves the spatial size for the default edge_width=3;
    # edge_width // 2 preserves it for any odd edge_width (identical for 3).
    kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
    with torch.no_grad():
        edge = edge.unsqueeze(1)
        edge = F.conv2d(edge, kernel, stride=1, padding=edge_width // 2)
        edge[edge != 0] = 1
        edge = edge.squeeze()
    return edge
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
2
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3
+ from .misc import GlobalAvgPool2d, SingleGPU
4
+ from .residual import IdentityResidualBlock
5
+ from .dense import DenseModule
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/bn.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
class ABN(nn.Module):
    """Batch normalization fused with a configurable activation.

    Plain (non in-place) reference module: standard ``functional.batch_norm``
    followed by the selected activation function.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """
        num_features: number of input/output channels.
        eps: numerical-stability constant added to the variance.
        momentum: running-statistics update factor.
        affine: if True, learn a per-channel scale (weight) and shift (bias).
        activation: one of 'relu', 'leaky_relu', 'elu' or 'none'.
        slope: negative slope used when activation == 'leaky_relu'.
        """
        super(ABN, self).__init__()
        self.num_features = num_features
        self.affine = affine
        self.eps = eps
        self.momentum = momentum
        self.activation = activation
        self.slope = slope
        if affine:
            self.weight = nn.Parameter(torch.ones(num_features))
            self.bias = nn.Parameter(torch.zeros(num_features))
        else:
            # Registered as None so state_dict / named_parameters stay consistent.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        self.register_buffer('running_mean', torch.zeros(num_features))
        self.register_buffer('running_var', torch.ones(num_features))
        self.reset_parameters()

    def reset_parameters(self):
        """Reset running statistics and, if affine, the scale/shift to identity."""
        nn.init.constant_(self.running_mean, 0)
        nn.init.constant_(self.running_var, 1)
        if self.affine:
            nn.init.constant_(self.weight, 1)
            nn.init.constant_(self.bias, 0)

    def forward(self, x):
        normed = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
                                       self.training, self.momentum, self.eps)
        if self.activation == ACT_RELU:
            return functional.relu(normed, inplace=True)
        if self.activation == ACT_LEAKY_RELU:
            return functional.leaky_relu(normed, negative_slope=self.slope, inplace=True)
        if self.activation == ACT_ELU:
            return functional.elu(normed, inplace=True)
        return normed

    def __repr__(self):
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        rep += ', slope={slope})' if self.activation == "leaky_relu" else ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
class InPlaceABN(ABN):
    """InPlace Activated Batch Normalization.

    Same interface and parameters as :class:`ABN`, but BN and activation are
    applied in place through the compiled ``inplace_abn`` autograd function,
    saving the activation buffer.
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
        """Create an InPlace Activated Batch Normalization module.

        Parameters
        ----------
        num_features : int
            Number of feature channels in the input and output.
        eps : float
            Small constant to prevent numerical issues.
        momentum : float
            Momentum factor applied to compute running statistics.
        affine : bool
            If `True` apply learned scale and shift transformation after normalization.
        activation : str
            Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
        slope : float
            Negative slope for the `leaky_relu` activation.
        """
        super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)

    def forward(self, x):
        # inplace_abn also returns the (updated) running stats; only the
        # activated tensor is exposed to callers.
        x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
                              self.training, self.momentum, self.eps, self.activation, self.slope)
        return x
111
+
112
+
113
class InPlaceABNSync(ABN):
    """InPlace Activated Batch Normalization with cross-GPU synchronization.

    This assumes it is replicated across processes using the same mechanism as
    ``nn.DistributedDataParallel``; batch statistics are synchronized inside
    the compiled ``inplace_abn_sync`` autograd function.
    """

    def forward(self, x):
        # inplace_abn_sync also returns the (updated) running stats; only the
        # activated tensor is exposed to callers.
        x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
                                   self.training, self.momentum, self.eps, self.activation, self.slope)
        return x

    def __repr__(self):
        # NOTE(review): byte-identical to ABN.__repr__ (the class name is taken
        # from self.__class__, which the inherited version already handles) —
        # this override could be removed.
        rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
              ' affine={affine}, activation={activation}'
        if self.activation == "leaky_relu":
            rep += ', slope={slope})'
        else:
            rep += ')'
        return rep.format(name=self.__class__.__name__, **self.__dict__)
131
+
132
+
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
class DeeplabV3(nn.Module):
    """DeepLabV3 ASPP head: parallel atrous convolutions plus an image-level
    pooling branch, fused into ``out_channels`` feature maps."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels=256,
                 dilations=(12, 24, 36),
                 norm_act=ABN,
                 pooling_size=None):
        """
        in_channels: channels of the incoming backbone feature map.
        out_channels: channels of the fused output.
        hidden_channels: channels of each parallel ASPP branch.
        dilations: atrous rates of the three 3x3 branches.
        norm_act: factory for fused normalization+activation layers.
        pooling_size: if set (and in eval mode), use a sliding average pool of
            this size instead of global image pooling.
        """
        super(DeeplabV3, self).__init__()
        self.pooling_size = pooling_size

        # One 1x1 branch plus three dilated 3x3 branches, all -> hidden_channels.
        self.map_convs = nn.ModuleList([
            nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
            nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
        ])
        self.map_bn = norm_act(hidden_channels * 4)

        self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
        self.global_pooling_bn = norm_act(hidden_channels)

        # Reduction convolutions projecting both paths to out_channels.
        self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
        self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
        self.red_bn = norm_act(out_channels)

        self.reset_parameters(self.map_bn.activation, self.map_bn.slope)

    def reset_parameters(self, activation, slope):
        """Xavier-init all convolutions (gain matched to the activation) and
        reset every ABN layer to the identity transform."""
        gain = nn.init.calculate_gain(activation, slope)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight.data, gain)
                if hasattr(m, "bias") and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, ABN):
                if hasattr(m, "weight") and m.weight is not None:
                    nn.init.constant_(m.weight, 1)
                if hasattr(m, "bias") and m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        # Map convolutions: concatenate the four parallel branches.
        out = torch.cat([m(x) for m in self.map_convs], dim=1)
        out = self.map_bn(out)
        out = self.red_conv(out)

        # Global pooling branch.
        pool = self._global_pooling(x)
        pool = self.global_pooling_conv(pool)
        pool = self.global_pooling_bn(pool)
        pool = self.pool_red_conv(pool)
        if self.training or self.pooling_size is None:
            # Global pooling produced a 1x1 map; broadcast it spatially so it
            # can be added to the convolutional path.
            pool = pool.repeat(1, 1, x.size(2), x.size(3))

        out += pool
        out = self.red_bn(out)
        return out

    def _global_pooling(self, x):
        """Global average pool (training / no pooling_size), otherwise a
        stride-1 sliding average pool padded back to the input size."""
        if self.training or self.pooling_size is None:
            pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
            pool = pool.view(x.size(0), x.size(1), 1, 1)
        else:
            # Clamp the pooling window to the feature-map size.
            pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
                            min(try_index(self.pooling_size, 1), x.shape[3]))
            # Asymmetric padding restores the exact input size after the
            # stride-1 avg pool (one extra pixel on one side for even kernels).
            padding = (
                (pooling_size[1] - 1) // 2,
                (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
                (pooling_size[0] - 1) // 2,
                (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
            )

            pool = functional.avg_pool2d(x, pooling_size, stride=1)
            pool = functional.pad(pool, pad=padding, mode="replicate")
        return pool
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/dense.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
class DenseModule(nn.Module):
    """DenseNet-style block: each layer sees the concatenation of the input
    and all previous layers' outputs and contributes ``growth`` channels."""

    def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
        super(DenseModule, self).__init__()
        self.in_channels = in_channels
        self.growth = growth
        self.layers = layers

        self.convs1 = nn.ModuleList()
        self.convs3 = nn.ModuleList()
        width = in_channels
        bottleneck = growth * bottleneck_factor
        for _ in range(layers):
            # 1x1 bottleneck reducing the growing concatenation.
            self.convs1.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(width)),
                ("conv", nn.Conv2d(width, bottleneck, 1, bias=False))
            ])))
            # 3x3 (possibly dilated) conv producing `growth` new channels.
            self.convs3.append(nn.Sequential(OrderedDict([
                ("bn", norm_act(bottleneck)),
                ("conv", nn.Conv2d(bottleneck, growth, 3, padding=dilation, bias=False,
                                   dilation=dilation))
            ])))
            width += growth

    @property
    def out_channels(self):
        """Channel count of the concatenated output."""
        return self.in_channels + self.growth * self.layers

    def forward(self, x):
        feats = [x]
        for conv1, conv3 in zip(self.convs1, self.convs3):
            stacked = torch.cat(feats, dim=1)
            feats.append(conv3(conv1(stacked)))
        return torch.cat(feats, dim=1)
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/functions.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pdb
from os import path
import torch
import torch.distributed as dist
import torch.autograd as autograd
import torch.cuda.comm as comm
from torch.autograd.function import once_differentiable
from torch.utils.cpp_extension import load

# JIT-compile the in-place ABN C++/CUDA backend from ./src on first import.
_src_path = path.join(path.dirname(path.abspath(__file__)), "src")
_backend = load(name="inplace_abn",
                extra_cflags=["-O3"],
                sources=[path.join(_src_path, f) for f in [
                    "inplace_abn.cpp",
                    "inplace_abn_cpu.cpp",
                    "inplace_abn_cuda.cu",
                    "inplace_abn_cuda_half.cu"
                ]],
                extra_cuda_cflags=["--expt-extended-lambda"])

# Activation names understood by ABN modules and the backend dispatchers.
ACT_RELU = "relu"
ACT_LEAKY_RELU = "leaky_relu"
ACT_ELU = "elu"
ACT_NONE = "none"
26
+
27
+
28
def _check(fn, *args, **kwargs):
    """Invoke *fn* and raise RuntimeError if it reports failure (falsy return)."""
    if not fn(*args, **kwargs):
        raise RuntimeError("CUDA Error encountered in {}".format(fn))
32
+
33
+
34
def _broadcast_shape(x):
    """Shape like *x*'s but with every non-channel dimension collapsed to 1,
    suitable for broadcasting per-channel statistics over the tensor."""
    return [s if i == 1 else 1 for i, s in enumerate(x.size())]
42
+
43
+
44
def _reduce(x):
    """Sum *x* over every dimension except the channel dimension (dim 1)."""
    shape = x.size()
    if len(shape) == 2:
        # (n, c): just reduce the batch dimension.
        return x.sum(dim=0)
    # (n, c, ...): flatten trailing dims, then reduce them and the batch.
    flat = x.contiguous().view((shape[0], shape[1], -1))
    return flat.sum(2).sum(0)
50
+
51
+
52
def _count_samples(x):
    """Number of normalization samples per channel: the product of all
    dimensions of *x* except the channel dimension (dim 1)."""
    count = 1
    for axis, size in enumerate(x.size()):
        if axis == 1:
            continue
        count *= size
    return count
58
+
59
+
60
def _act_forward(ctx, x):
    """Apply the activation configured on *ctx* to *x* in place via the
    compiled backend.

    NOTE(review): ACT_RELU has no branch here (only leaky_relu / elu / none),
    so relu falls through silently — confirm relu is never used with the
    in-place path.
    """
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_forward(x, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_forward(x)
    elif ctx.activation == ACT_NONE:
        pass
67
+
68
+
69
def _act_backward(ctx, x, dx):
    """Backward pass of the in-place activation via the compiled backend,
    given the activated output *x* and the incoming gradient *dx*.

    NOTE(review): as in _act_forward, ACT_RELU has no branch here — confirm
    the relu path is handled elsewhere or unused.
    """
    if ctx.activation == ACT_LEAKY_RELU:
        _backend.leaky_relu_backward(x, dx, ctx.slope)
    elif ctx.activation == ACT_ELU:
        _backend.elu_backward(x, dx)
    elif ctx.activation == ACT_NONE:
        pass
76
+
77
+
78
class InPlaceABN(autograd.Function):
    """Autograd function implementing single-process in-place activated BN.

    The input is normalized, affine-transformed and activated *in place*; the
    activated output itself is saved for backward (instead of the input),
    which is what saves the extra activation buffer.
    """

    @staticmethod
    def forward(ctx, x, weight, bias, running_mean, running_var,
                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
        # Save context
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope
        ctx.affine = weight is not None and bias is not None

        # Prepare inputs
        count = _count_samples(x)
        x = x.contiguous()
        # The backend expects real tensors; pass empty ones when not affine.
        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
        bias = bias.contiguous() if ctx.affine else x.new_empty(0)

        if ctx.training:
            mean, var = _backend.mean_var(x)

            # Update running stats (count / (count - 1) gives the unbiased
            # variance estimate).
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))

            # Mark in-place modified tensors
            ctx.mark_dirty(x, running_mean, running_var)
        else:
            mean, var = running_mean.contiguous(), running_var.contiguous()
            ctx.mark_dirty(x)

        # BN forward + activation, both applied to x in place.
        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
        _act_forward(ctx, x)

        # Output: the activated tensor is saved for backward.
        ctx.var = var
        ctx.save_for_backward(x, var, weight, bias)
        ctx.mark_non_differentiable(running_mean, running_var)
        return x, running_mean, running_var

    @staticmethod
    @once_differentiable
    def backward(ctx, dz, _drunning_mean, _drunning_var):
        # z is the (in-place) activated output saved in forward.
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
        else:
            # TODO: implement simplified CUDA backward for inference mode
            edz = dz.new_zeros(dz.size(1))
            eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        # dweight = eydz * weight.sign() if ctx.affine else None
        dweight = eydz if ctx.affine else None
        if dweight is not None:
            # Equivalent to multiplying by weight.sign() (see comment above).
            dweight[weight < 0] *= -1
        dbias = edz if ctx.affine else None

        # One gradient slot per forward argument; non-tensor args get None.
        return dx, dweight, dbias, None, None, None, None, None, None, None
143
+
144
+
145
class InPlaceABNSync(autograd.Function):
    """Autograd function implementing in-place activated BN with batch
    statistics synchronized across distributed processes (all_reduce of
    mean/var in forward and of the gradient reductions in backward)."""

    @classmethod
    def forward(cls, ctx, x, weight, bias, running_mean, running_var,
                training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
        # Save context
        ctx.training = training
        ctx.momentum = momentum
        ctx.eps = eps
        ctx.activation = activation
        ctx.slope = slope
        ctx.affine = weight is not None and bias is not None

        # Prepare inputs
        ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1

        # count = _count_samples(x)
        batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)

        x = x.contiguous()
        # The backend expects real tensors; pass empty ones when not affine.
        weight = weight.contiguous() if ctx.affine else x.new_empty(0)
        bias = bias.contiguous() if ctx.affine else x.new_empty(0)

        if ctx.training:
            mean, var = _backend.mean_var(x)
            if ctx.world_size > 1:
                # get global batch size
                if equal_batches:
                    batch_size *= ctx.world_size
                else:
                    dist.all_reduce(batch_size, dist.ReduceOp.SUM)

                # Fraction of the global batch owned by this process.
                ctx.factor = x.shape[0] / float(batch_size.item())

                # Global mean: weighted sum of the per-process means.
                mean_all = mean.clone() * ctx.factor
                dist.all_reduce(mean_all, dist.ReduceOp.SUM)

                # Global variance via the law of total variance.
                var_all = (var + (mean - mean_all) ** 2) * ctx.factor
                dist.all_reduce(var_all, dist.ReduceOp.SUM)

                mean = mean_all
                var = var_all

            # Update running stats (count / (count - 1) gives the unbiased
            # variance estimate).
            running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
            count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
            running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))

            # Mark in-place modified tensors
            ctx.mark_dirty(x, running_mean, running_var)
        else:
            mean, var = running_mean.contiguous(), running_var.contiguous()
            ctx.mark_dirty(x)

        # BN forward + activation, both applied to x in place.
        _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
        _act_forward(ctx, x)

        # Output: the activated tensor is saved for backward.
        ctx.var = var
        ctx.save_for_backward(x, var, weight, bias)
        ctx.mark_non_differentiable(running_mean, running_var)
        return x, running_mean, running_var

    @staticmethod
    @once_differentiable
    def backward(ctx, dz, _drunning_mean, _drunning_var):
        # z is the activated output saved in forward.
        z, var, weight, bias = ctx.saved_tensors
        dz = dz.contiguous()

        # Undo activation
        _act_backward(ctx, z, dz)

        if ctx.training:
            edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
            # Keep process-local copies: parameter grads use the local sums,
            # while dx needs the globally reduced ones.
            edz_local = edz.clone()
            eydz_local = eydz.clone()

            if ctx.world_size > 1:
                edz *= ctx.factor
                dist.all_reduce(edz, dist.ReduceOp.SUM)

                eydz *= ctx.factor
                dist.all_reduce(eydz, dist.ReduceOp.SUM)
        else:
            edz_local = edz = dz.new_zeros(dz.size(1))
            eydz_local = eydz = dz.new_zeros(dz.size(1))

        dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
        # dweight = eydz_local * weight.sign() if ctx.affine else None
        dweight = eydz_local if ctx.affine else None
        if dweight is not None:
            # Equivalent to multiplying by weight.sign() (see comment above).
            dweight[weight < 0] *= -1
        dbias = edz_local if ctx.affine else None

        # NOTE(review): forward takes 11 inputs (including equal_batches) but
        # only 10 gradients are returned here — confirm autograd accepts this.
        return dx, dweight, dbias, None, None, None, None, None, None, None
240
+
241
+
242
# Functional entry points used by the nn.Module wrappers in bn.py.
inplace_abn = InPlaceABN.apply
inplace_abn_sync = InPlaceABNSync.apply

__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/misc.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
class GlobalAvgPool2d(nn.Module):
    """Average all spatial positions away, mapping (N, C, H, W) -> (N, C)."""

    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, inputs):
        n, c = inputs.size(0), inputs.size(1)
        return inputs.view(n, c, -1).mean(dim=2)
13
+
14
class SingleGPU(nn.Module):
    """Wrapper that moves inputs onto the default CUDA device before
    forwarding them to the wrapped module."""

    def __init__(self, module):
        super(SingleGPU, self).__init__()
        self.module = module

    def forward(self, input):
        # non_blocking=True lets the host-to-device copy overlap with compute
        # when the input tensor lives in pinned memory.
        return self.module(input.cuda(non_blocking=True))
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/residual.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
6
+ import torch.nn.functional as functional
7
+
8
+
9
+ class ResidualBlock(nn.Module):
10
+ """Configurable residual block
11
+
12
+ Parameters
13
+ ----------
14
+ in_channels : int
15
+ Number of input channels.
16
+ channels : list of int
17
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
18
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
19
+ `3 x 3` then `1 x 1` convolutions.
20
+ stride : int
21
+ Stride of the first `3 x 3` convolution
22
+ dilation : int
23
+ Dilation to apply to the `3 x 3` convolutions.
24
+ groups : int
25
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26
+ bottleneck blocks.
27
+ norm_act : callable
28
+ Function to create normalization / activation Module.
29
+ dropout: callable
30
+ Function to create Dropout Module.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ channels,
36
+ stride=1,
37
+ dilation=1,
38
+ groups=1,
39
+ norm_act=ABN,
40
+ dropout=None):
41
+ super(ResidualBlock, self).__init__()
42
+
43
+ # Check parameters for inconsistencies
44
+ if len(channels) != 2 and len(channels) != 3:
45
+ raise ValueError("channels must contain either two or three values")
46
+ if len(channels) == 2 and groups != 1:
47
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
48
+
49
+ is_bottleneck = len(channels) == 3
50
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
51
+
52
+ if not is_bottleneck:
53
+ bn2 = norm_act(channels[1])
54
+ bn2.activation = ACT_NONE
55
+ layers = [
56
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57
+ dilation=dilation)),
58
+ ("bn1", norm_act(channels[0])),
59
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60
+ dilation=dilation)),
61
+ ("bn2", bn2)
62
+ ]
63
+ if dropout is not None:
64
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65
+ else:
66
+ bn3 = norm_act(channels[2])
67
+ bn3.activation = ACT_NONE
68
+ layers = [
69
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70
+ ("bn1", norm_act(channels[0])),
71
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72
+ groups=groups, dilation=dilation)),
73
+ ("bn2", norm_act(channels[1])),
74
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75
+ ("bn3", bn3)
76
+ ]
77
+ if dropout is not None:
78
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79
+ self.convs = nn.Sequential(OrderedDict(layers))
80
+
81
+ if need_proj_conv:
82
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83
+ self.proj_bn = norm_act(channels[-1])
84
+ self.proj_bn.activation = ACT_NONE
85
+
86
+ def forward(self, x):
87
+ if hasattr(self, "proj_conv"):
88
+ residual = self.proj_conv(x)
89
+ residual = self.proj_bn(residual)
90
+ else:
91
+ residual = x
92
+ x = self.convs(x) + residual
93
+
94
+ if self.convs.bn1.activation == ACT_LEAKY_RELU:
95
+ return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96
+ elif self.convs.bn1.activation == ACT_ELU:
97
+ return functional.elu(x, inplace=True)
98
+ else:
99
+ return x
100
+
101
+
102
+ class IdentityResidualBlock(nn.Module):
103
+ def __init__(self,
104
+ in_channels,
105
+ channels,
106
+ stride=1,
107
+ dilation=1,
108
+ groups=1,
109
+ norm_act=ABN,
110
+ dropout=None):
111
+ """Configurable identity-mapping residual block
112
+
113
+ Parameters
114
+ ----------
115
+ in_channels : int
116
+ Number of input channels.
117
+ channels : list of int
118
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
119
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120
+ `3 x 3` then `1 x 1` convolutions.
121
+ stride : int
122
+ Stride of the first `3 x 3` convolution
123
+ dilation : int
124
+ Dilation to apply to the `3 x 3` convolutions.
125
+ groups : int
126
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127
+ bottleneck blocks.
128
+ norm_act : callable
129
+ Function to create normalization / activation Module.
130
+ dropout: callable
131
+ Function to create Dropout Module.
132
+ """
133
+ super(IdentityResidualBlock, self).__init__()
134
+
135
+ # Check parameters for inconsistencies
136
+ if len(channels) != 2 and len(channels) != 3:
137
+ raise ValueError("channels must contain either two or three values")
138
+ if len(channels) == 2 and groups != 1:
139
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
140
+
141
+ is_bottleneck = len(channels) == 3
142
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
143
+
144
+ self.bn1 = norm_act(in_channels)
145
+ if not is_bottleneck:
146
+ layers = [
147
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148
+ dilation=dilation)),
149
+ ("bn2", norm_act(channels[0])),
150
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151
+ dilation=dilation))
152
+ ]
153
+ if dropout is not None:
154
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155
+ else:
156
+ layers = [
157
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158
+ ("bn2", norm_act(channels[0])),
159
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160
+ groups=groups, dilation=dilation)),
161
+ ("bn3", norm_act(channels[1])),
162
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163
+ ]
164
+ if dropout is not None:
165
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166
+ self.convs = nn.Sequential(OrderedDict(layers))
167
+
168
+ if need_proj_conv:
169
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170
+
171
+ def forward(self, x):
172
+ if hasattr(self, "proj_conv"):
173
+ bn1 = self.bn1(x)
174
+ shortcut = self.proj_conv(bn1)
175
+ else:
176
+ shortcut = x.clone()
177
+ bn1 = self.bn1(x)
178
+
179
+ out = self.convs(bn1)
180
+ out.add_(shortcut)
181
+
182
+ return out
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
100
+ return {mean, var};
101
+ }
102
+
103
+ /**********
104
+ * forward
105
+ **********/
106
+
107
+ template<typename T>
108
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109
+ bool affine, float eps, int num, int chn, int sp) {
110
+ int plane = blockIdx.x;
111
+
112
+ T _mean = mean[plane];
113
+ T _var = var[plane];
114
+ T _weight = affine ? abs(weight[plane]) + eps : T(1);
115
+ T _bias = affine ? bias[plane] : T(0);
116
+
117
+ T mul = rsqrt(_var + eps) * _weight;
118
+
119
+ for (int batch = 0; batch < num; ++batch) {
120
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121
+ T _x = x[(batch * chn + plane) * sp + n];
122
+ T _y = (_x - _mean) * mul + _bias;
123
+
124
+ x[(batch * chn + plane) * sp + n] = _y;
125
+ }
126
+ }
127
+ }
128
+
129
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130
+ bool affine, float eps) {
131
+ CHECK_CUDA_INPUT(x);
132
+ CHECK_CUDA_INPUT(mean);
133
+ CHECK_CUDA_INPUT(var);
134
+ CHECK_CUDA_INPUT(weight);
135
+ CHECK_CUDA_INPUT(bias);
136
+
137
+ // Extract dimensions
138
+ int64_t num, chn, sp;
139
+ get_dims(x, num, chn, sp);
140
+
141
+ // Run kernel
142
+ dim3 blocks(chn);
143
+ dim3 threads(getNumThreads(sp));
144
+ auto stream = at::cuda::getCurrentCUDAStream();
145
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146
+ forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147
+ x.data<scalar_t>(),
148
+ mean.data<scalar_t>(),
149
+ var.data<scalar_t>(),
150
+ weight.data<scalar_t>(),
151
+ bias.data<scalar_t>(),
152
+ affine, eps, num, chn, sp);
153
+ }));
154
+
155
+ return x;
156
+ }
157
+
158
+ /***********
159
+ * edz_eydz
160
+ ***********/
161
+
162
+ template<typename T>
163
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164
+ T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165
+ int plane = blockIdx.x;
166
+
167
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168
+ T _bias = affine ? bias[plane] : 0.f;
169
+
170
+ Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171
+ __syncthreads();
172
+
173
+ if (threadIdx.x == 0) {
174
+ edz[plane] = res.v1;
175
+ eydz[plane] = res.v2;
176
+ }
177
+ }
178
+
179
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180
+ bool affine, float eps) {
181
+ CHECK_CUDA_INPUT(z);
182
+ CHECK_CUDA_INPUT(dz);
183
+ CHECK_CUDA_INPUT(weight);
184
+ CHECK_CUDA_INPUT(bias);
185
+
186
+ // Extract dimensions
187
+ int64_t num, chn, sp;
188
+ get_dims(z, num, chn, sp);
189
+
190
+ auto edz = at::empty({chn}, z.options());
191
+ auto eydz = at::empty({chn}, z.options());
192
+
193
+ // Run kernel
194
+ dim3 blocks(chn);
195
+ dim3 threads(getNumThreads(sp));
196
+ auto stream = at::cuda::getCurrentCUDAStream();
197
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198
+ edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199
+ z.data<scalar_t>(),
200
+ dz.data<scalar_t>(),
201
+ weight.data<scalar_t>(),
202
+ bias.data<scalar_t>(),
203
+ edz.data<scalar_t>(),
204
+ eydz.data<scalar_t>(),
205
+ affine, eps, num, chn, sp);
206
+ }));
207
+
208
+ return {edz, eydz};
209
+ }
210
+
211
+ /***********
212
+ * backward
213
+ ***********/
214
+
215
+ template<typename T>
216
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217
+ const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218
+ int plane = blockIdx.x;
219
+
220
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221
+ T _bias = affine ? bias[plane] : 0.f;
222
+ T _var = var[plane];
223
+ T _edz = edz[plane];
224
+ T _eydz = eydz[plane];
225
+
226
+ T _mul = _weight * rsqrt(_var + eps);
227
+ T count = T(num * sp);
228
+
229
+ for (int batch = 0; batch < num; ++batch) {
230
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231
+ T _dz = dz[(batch * chn + plane) * sp + n];
232
+ T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233
+
234
+ dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235
+ }
236
+ }
237
+ }
238
+
239
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241
+ CHECK_CUDA_INPUT(z);
242
+ CHECK_CUDA_INPUT(dz);
243
+ CHECK_CUDA_INPUT(var);
244
+ CHECK_CUDA_INPUT(weight);
245
+ CHECK_CUDA_INPUT(bias);
246
+ CHECK_CUDA_INPUT(edz);
247
+ CHECK_CUDA_INPUT(eydz);
248
+
249
+ // Extract dimensions
250
+ int64_t num, chn, sp;
251
+ get_dims(z, num, chn, sp);
252
+
253
+ auto dx = at::zeros_like(z);
254
+
255
+ // Run kernel
256
+ dim3 blocks(chn);
257
+ dim3 threads(getNumThreads(sp));
258
+ auto stream = at::cuda::getCurrentCUDAStream();
259
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260
+ backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261
+ z.data<scalar_t>(),
262
+ dz.data<scalar_t>(),
263
+ var.data<scalar_t>(),
264
+ weight.data<scalar_t>(),
265
+ bias.data<scalar_t>(),
266
+ edz.data<scalar_t>(),
267
+ eydz.data<scalar_t>(),
268
+ dx.data<scalar_t>(),
269
+ affine, eps, num, chn, sp);
270
+ }));
271
+
272
+ return dx;
273
+ }
274
+
275
+ /**************
276
+ * activations
277
+ **************/
278
+
279
+ template<typename T>
280
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281
+ // Create thrust pointers
282
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284
+
285
+ auto stream = at::cuda::getCurrentCUDAStream();
286
+ thrust::transform_if(thrust::cuda::par.on(stream),
287
+ th_dz, th_dz + count, th_z, th_dz,
288
+ [slope] __device__ (const T& dz) { return dz * slope; },
289
+ [] __device__ (const T& z) { return z < 0; });
290
+ thrust::transform_if(thrust::cuda::par.on(stream),
291
+ th_z, th_z + count, th_z,
292
+ [slope] __device__ (const T& z) { return z / slope; },
293
+ [] __device__ (const T& z) { return z < 0; });
294
+ }
295
+
296
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297
+ CHECK_CUDA_INPUT(z);
298
+ CHECK_CUDA_INPUT(dz);
299
+
300
+ int64_t count = z.numel();
301
+
302
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303
+ leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
304
+ }));
305
+ }
306
+
307
+ template<typename T>
308
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309
+ // Create thrust pointers
310
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312
+
313
+ auto stream = at::cuda::getCurrentCUDAStream();
314
+ thrust::transform_if(thrust::cuda::par.on(stream),
315
+ th_dz, th_dz + count, th_z, th_z, th_dz,
316
+ [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317
+ [] __device__ (const T& z) { return z < 0; });
318
+ thrust::transform_if(thrust::cuda::par.on(stream),
319
+ th_z, th_z + count, th_z,
320
+ [] __device__ (const T& z) { return log1p(z); },
321
+ [] __device__ (const T& z) { return z < 0; });
322
+ }
323
+
324
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325
+ CHECK_CUDA_INPUT(z);
326
+ CHECK_CUDA_INPUT(dz);
327
+
328
+ int64_t count = z.numel();
329
+
330
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331
+ elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
332
+ }));
333
+ }
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda_fp16.h>
4
+
5
+ #include <vector>
6
+
7
+ #include "utils/checks.h"
8
+ #include "utils/cuda.cuh"
9
+ #include "inplace_abn.h"
10
+
11
+ #include <ATen/cuda/CUDAContext.h>
12
+
13
+ // Operations for reduce
14
// Element fetch functor for the sum reduction: returns one fp16 element,
// widened to fp32, addressed as (batch, channel plane, spatial offset).
struct SumOpH {
  __device__ SumOpH(const half *t, int c, int s)
    : tensor(t), chn(c), sp(s) {}
  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
    return __half2float(tensor[(batch * chn + plane) * sp + n]);
  }
  const half *tensor;  // fp16 input laid out contiguously as (num, chn, sp)
  const int chn;       // number of channels
  const int sp;        // spatial size per channel
};
24
+
25
// Element functor for the variance reduction: squared deviation of one
// fp16 element from a precomputed per-channel mean, evaluated in fp32.
struct VarOpH {
  __device__ VarOpH(float m, const half *t, int c, int s)
    : mean(m), tensor(t), chn(c), sp(s) {}
  __device__ __forceinline__ float operator()(int batch, int plane, int n) {
    const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
    return (t - mean) * (t - mean);
  }
  const float mean;    // per-channel mean from the preceding sum pass
  const half *tensor;  // fp16 input laid out as (num, chn, sp)
  const int chn;
  const int sp;
};
37
+
38
// Element functor for the gradient reductions: produces the pair
// (dz, y * dz) where y is the normalized activation reconstructed from the
// in-place forward output via y = (z - bias) / weight.
struct GradOpH {
  __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
    : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
  __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
    float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
    float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
    return Pair<float>(_dz, _y * _dz);
  }
  const float weight;  // effective scale used by the forward (|gamma| + eps when affine)
  const float bias;    // effective shift (beta when affine, else 0)
  const half *z;       // saved forward output (fp16)
  const half *dz;      // incoming gradient (fp16)
  const int chn;
  const int sp;
};
53
+
54
+ /***********
55
+ * mean_var
56
+ ***********/
57
+
58
// Per-channel mean and (biased) variance of a half tensor, accumulated in
// fp32. One block handles one channel plane; two reduce passes share the
// block, separated by a barrier so every thread sees the finished mean.
__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
  int plane = blockIdx.x;                       // channel owned by this block
  float norm = 1.f / static_cast<float>(num * sp);

  float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
  __syncthreads();                              // mean must be final before the variance pass
  float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;

  if (threadIdx.x == 0) {                       // single writer per channel
    mean[plane] = _mean;
    var[plane] = _var;
  }
}
71
+
72
// Host wrapper: per-channel mean/variance of a half tensor, returned as two
// fp32 tensors of shape {chn}. Launches one block per channel.
std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
  CHECK_CUDA_INPUT(x);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(x, num, chn, sp);

  // Prepare output tensors (statistics kept in fp32 for accuracy)
  auto mean = at::empty({chn}, x.options().dtype(at::kFloat));
  auto var = at::empty({chn}, x.options().dtype(at::kFloat));

  // Run kernel
  // NOTE(review): num/chn/sp are int64_t but the kernel takes int — this
  // silently narrows; confirm per-dimension sizes stay below 2^31.
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(x.data<at::Half>()),
      mean.data<float>(),
      var.data<float>(),
      num, chn, sp);

  return {mean, var};
}
95
+
96
+ /**********
97
+ * forward
98
+ **********/
99
+
100
// In-place batch-norm forward (optionally affine) for half tensors:
//   x <- (x - mean) * (|weight| + eps) * rsqrt(var + eps) + bias
// One block per channel; arithmetic in fp32, storage in fp16.
__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
                                 bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  const float _mean = mean[plane];
  const float _var = var[plane];
  // abs(...)+eps keeps the effective scale strictly positive so the backward
  // can divide by it to reconstruct y from the overwritten buffer.
  const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
  const float _bias = affine ? bias[plane] : 0.f;

  const float mul = rsqrt(_var + eps) * _weight;

  for (int batch = 0; batch < num; ++batch) {
    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
      half *x_ptr = x + (batch * chn + plane) * sp + n;
      float _x = __half2float(*x_ptr);
      float _y = (_x - _mean) * mul + _bias;

      *x_ptr = __float2half(_y);   // overwrite the input with the normalized output
    }
  }
}
121
+
122
// Host wrapper for the fused in-place BN forward on half tensors. The
// statistics and affine parameters are fp32; x is normalized in place and
// also returned for convenience.
at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
                          bool affine, float eps) {
  CHECK_CUDA_INPUT(x);
  CHECK_CUDA_INPUT(mean);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(x, num, chn, sp);

  // Run kernel: one block per channel, block size fitted to the spatial size.
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  forward_kernel_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(x.data<at::Half>()),
      mean.data<float>(),
      var.data<float>(),
      weight.data<float>(),
      bias.data<float>(),
      affine, eps, num, chn, sp);

  return x;
}
148
+
149
// First backward pass: per-channel reduction of sum(dz) and sum(y * dz),
// with y rebuilt from the in-place forward output. One block per channel.
__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
                                  float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  // Must mirror the forward's effective scale/shift exactly to invert it.
  float _weight = affine ? abs(weight[plane]) + eps : 1.f;
  float _bias = affine ? bias[plane] : 0.f;

  Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
  __syncthreads();

  if (threadIdx.x == 0) {   // single writer per channel
    edz[plane] = res.v1;    // sum of dz over the channel
    eydz[plane] = res.v2;   // sum of y * dz over the channel
  }
}
164
+
165
// Host wrapper computing the per-channel gradient statistics edz = sum(dz)
// and eydz = sum(y * dz) needed by backward_cuda_h, returned as fp32
// tensors of shape {chn}.
std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
                                        bool affine, float eps) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(z, num, chn, sp);

  // Per-channel accumulators, kept in fp32.
  auto edz = at::empty({chn}, z.options().dtype(at::kFloat));
  auto eydz = at::empty({chn}, z.options().dtype(at::kFloat));

  // Run kernel: one block per channel.
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(z.data<at::Half>()),
      reinterpret_cast<half*>(dz.data<at::Half>()),
      weight.data<float>(),
      bias.data<float>(),
      edz.data<float>(),
      eydz.data<float>(),
      affine, eps, num, chn, sp);

  return {edz, eydz};
}
194
+
195
// Second backward pass: input gradient
//   dx = (dz - edz/count - y * eydz/count) * (|weight| + eps) * rsqrt(var + eps)
// with y reconstructed from the stored output z. One block per channel.
__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
                                  const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
  int plane = blockIdx.x;

  // Same effective scale/shift as the forward, so y = (z - bias) / weight.
  float _weight = affine ? abs(weight[plane]) + eps : 1.f;
  float _bias = affine ? bias[plane] : 0.f;
  float _var = var[plane];
  float _edz = edz[plane];        // per-channel sum of dz
  float _eydz = eydz[plane];      // per-channel sum of y * dz

  float _mul = _weight * rsqrt(_var + eps);
  float count = float(num * sp);  // elements reduced per channel

  for (int batch = 0; batch < num; ++batch) {
    for (int n = threadIdx.x; n < sp; n += blockDim.x) {
      float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
      float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;

      dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
    }
  }
}
217
+
218
// Host wrapper for the BN backward on half tensors: allocates dx with the
// same dtype/shape as z and fills it from the precomputed per-channel
// statistics (var, edz, eydz).
at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
                           at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);
  CHECK_CUDA_INPUT(var);
  CHECK_CUDA_INPUT(weight);
  CHECK_CUDA_INPUT(bias);
  CHECK_CUDA_INPUT(edz);
  CHECK_CUDA_INPUT(eydz);

  // Extract dimensions
  int64_t num, chn, sp;
  get_dims(z, num, chn, sp);

  // Output gradient buffer (every element is written by the kernel).
  auto dx = at::zeros_like(z);

  // Run kernel: one block per channel.
  dim3 blocks(chn);
  dim3 threads(getNumThreads(sp));
  auto stream = at::cuda::getCurrentCUDAStream();
  backward_kernel_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(z.data<at::Half>()),
      reinterpret_cast<half*>(dz.data<at::Half>()),
      var.data<float>(),
      weight.data<float>(),
      bias.data<float>(),
      edz.data<float>(),
      eydz.data<float>(),
      reinterpret_cast<half*>(dx.data<at::Half>()),
      affine, eps, num, chn, sp);

  return dx;
}
251
+
252
// fp16 in-place leaky-ReLU backward, grid-stride loop: wherever the stored
// output z is negative, scale the gradient by `slope` and recover the input
// as z / slope. Arithmetic is done in fp32 to limit rounding error.
// NOTE(review): the loop index is `int` while `count` is int64_t — this
// overflows for tensors with >= 2^31 elements; confirm sizes stay below that.
__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
    float _z = __half2float(z[i]);
    if (_z < 0) {
      dz[i] = __float2half(__half2float(dz[i]) * slope);
      z[i] = __float2half(_z / slope);
    }
  }
}
261
+
262
// Host launcher for the fp16 in-place leaky-ReLU backward. Both z and dz
// are modified in place on the current CUDA stream.
void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
  CHECK_CUDA_INPUT(z);
  CHECK_CUDA_INPUT(dz);

  int64_t count = z.numel();
  dim3 threads(getNumThreads(count));
  // Enough blocks to cover every element once; the kernel grid-strides, so
  // any truncation here only costs extra loop iterations, not correctness.
  dim3 blocks = (count + threads.x - 1) / threads.x;
  auto stream = at::cuda::getCurrentCUDAStream();
  leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
      reinterpret_cast<half*>(z.data<at::Half>()),
      reinterpret_cast<half*>(dz.data<at::Half>()),
      slope, count);
}
275
+
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <ATen/ATen.h>

// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
#ifndef AT_CHECK
#define AT_CHECK AT_ASSERT
#endif

// Device / layout guards used by every extension entry point.
#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")

// Composite guards: correct device *and* contiguous memory.
// NOTE(review): these expand to two statements — do not use them as the
// sole body of an unbraced if/else.
#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <ATen/ATen.h>

/*
 * Functions to share code between CPU and GPU: the same source compiles
 * under nvcc (__CUDACC__ defined) and a host compiler; these macros select
 * the right qualifiers and accumulation primitive for each.
 */

#ifdef __CUDACC__
// CUDA versions

#define HOST_DEVICE __host__ __device__
#define INLINE_HOST_DEVICE __host__ __device__ inline
#define FLOOR(x) floor(x)

#if __CUDA_ARCH__ >= 600
// Recent compute capabilities have block-level atomicAdd for all data types, so we use that
#define ACCUM(x,y) atomicAdd_block(&(x),(y))
#else
// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
// and use the known atomicCAS-based implementation for double
template<typename data_t>
__device__ inline data_t atomic_add(data_t *address, data_t val) {
  return atomicAdd(address, val);
}

// Double specialization: classic CAS retry loop — keep swapping the 64-bit
// word until no concurrent update raced in between read and exchange.
template<>
__device__ inline double atomic_add(double *address, double val) {
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);  // value before this thread's add, per atomicAdd convention
}

#define ACCUM(x,y) atomic_add(&(x),(y))
#endif // #if __CUDA_ARCH__ >= 600

#else
// CPU versions

#define HOST_DEVICE
#define INLINE_HOST_DEVICE inline
#define FLOOR(x) std::floor(x)
#define ACCUM(x,y) (x) += (y)

#endif // #ifdef __CUDACC__
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ /*
4
+ * General settings and functions
5
+ */
6
// Warp width and the hardware cap on threads per block.
const int WARP_SIZE = 32;
const int MAX_BLOCK_SIZE = 1024;

// Smallest power-of-two block size (>= 32) that covers nElem work items,
// capped at MAX_BLOCK_SIZE.
static int getNumThreads(int nElem) {
  int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
  for (int i = 0; i < 6; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return MAX_BLOCK_SIZE;
}
18
+
19
+ /*
20
+ * Reduction utilities
21
+ */
22
// Butterfly warp shuffle that compiles on both pre- and post-CUDA-9
// toolkits (the _sync variant is mandatory from CUDA 9 onward).
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
                                           unsigned int mask = 0xffffffff) {
#if CUDART_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}

// Index of the most significant set bit (floor(log2(val)) for val > 0).
__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
33
+
34
// Two-component accumulator reduced with operator+=; used to carry the
// (sum(dz), sum(y*dz)) pair through the warp/block reductions.
template<typename T>
struct Pair {
  T v1, v2;
  __device__ Pair() {}
  __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
  __device__ Pair(T v) : v1(v), v2(v) {}
  __device__ Pair(int v) : v1(v), v2(v) {}  // lets templated code zero-init from a literal
  __device__ Pair &operator+=(const Pair<T> &a) {
    v1 += a.v1;
    v2 += a.v2;
    return *this;
  }
};
47
+
48
// Sum `val` across all 32 lanes of a warp with butterfly shuffles
// (getMSB(32) = 5 exchange steps); every lane ends with the full sum.
// The pre-SM30 fallback stages values through shared memory instead.
template<typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
  }
#else
  __shared__ T values[MAX_BLOCK_SIZE];
  values[threadIdx.x] = val;
  __threadfence_block();
  const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;  // first lane of this warp's segment
  for (int i = 1; i < WARP_SIZE; i++) {
    val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
  }
#endif
  return val;
}

// Component-wise warp sum for Pair accumulators.
template<typename T>
static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
  value.v1 = warpSum(value.v1);
  value.v2 = warpSum(value.v2);
  return value;
}
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/AugmentCE2P.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : AugmentCE2P.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import pdb
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ # Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
21
+ # By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
22
+ from modules import InPlaceABNSync
23
+ import numpy as np
24
+
25
# BN alias used throughout this file: an InPlaceABNSync with its fused
# activation disabled, i.e. plain (synchronized, in-place) batch norm.
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')

affine_par = True

# Preprocessing metadata attached to the model by
# initialize_pretrained_model(); note the BGR channel order and the
# correspondingly reversed mean/std lists.
pretrained_settings = {
    'resnet101': {
        'imagenet': {
            'input_space': 'BGR',
            'input_size': [3, 224, 224],
            'input_range': [0, 1],
            'mean': [0.406, 0.456, 0.485],
            'std': [0.225, 0.224, 0.229],
            'num_classes': 1000
        }
    },
}
41
+
42
+
43
def conv3x3(in_planes, out_planes, stride=1):
    """3x3 convolution with padding 1 and no bias."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
47
+
48
+
49
class Bottleneck(nn.Module):
    """ResNet bottleneck block (1x1 -> 3x3 -> 1x1) with dilation support.

    The 3x3 conv uses padding == dilation * multi_grid, so spatial size is
    preserved when stride == 1. ``BatchNorm2d`` here is the module-level
    alias for InPlaceABNSync with activation disabled.
    """
    # Output channels are planes * expansion.
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
        # NOTE(review): `fist_dilation` (sic) is accepted but never used;
        # kept only for signature compatibility with existing callers.
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * 4)
        # Out-of-place ReLU inside the trunk; in-place only after the
        # residual add — presumably to keep buffers the in-place ABN
        # backward still needs (TODO confirm against inplace_abn docs).
        self.relu = nn.ReLU(inplace=False)
        self.relu_inplace = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.dilation = dilation
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the identity path when shape/stride changed.
        if self.downsample is not None:
            residual = self.downsample(x)

        out = out + residual
        out = self.relu_inplace(out)

        return out
88
+
89
+
90
class CostomAdaptiveAvgPool2D(nn.Module):
    """Adaptive average pooling built from explicit per-cell windows.

    For a fixed (H_out, W_out), output cell (i, j) averages the input patch
    rows [floor(i*H/H_out), ceil((i+1)*H/H_out)) and columns
    [floor(j*W/W_out), ceil((j+1)*W/W_out)) — the same window rule as
    ``nn.AdaptiveAvgPool2d``, expressed with plain slicing + avg_pool2d.
    """

    def __init__(self, output_size):
        super(CostomAdaptiveAvgPool2D, self).__init__()
        # Target (H_out, W_out) of the pooled map.
        self.output_size = output_size

    def forward(self, x):
        h_in, w_in = x.shape[-2:]
        h_out, w_out = self.output_size

        rows = []
        for i in range(h_out):
            top = int(np.floor(i * h_in / h_out))
            bottom = int(np.ceil((i + 1) * h_in / h_out))
            cells = []
            for j in range(w_out):
                left = int(np.floor(j * w_in / w_out))
                right = int(np.ceil((j + 1) * w_in / w_out))
                # Average the whole window via a full-size pooling kernel.
                window = x[:, :, top:bottom, left:right]
                cells.append(F.avg_pool2d(window, [bottom - top, right - left]))
            rows.append(torch.concat(cells, -1))
        return torch.concat(rows, -2)
124
+
125
+
126
class PSPModule(nn.Module):
    """
    Reference:
        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*

    Pools the input to several grid sizes, projects each to out_features,
    upsamples back, concatenates with the input, and fuses with a 3x3 conv.
    """

    def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
        super(PSPModule, self).__init__()

        self.stages = []
        tmp = []
        for size in sizes:
            # Sizes 3 and 6 use the hand-rolled adaptive pool instead of
            # nn.AdaptiveAvgPool2d — presumably for export compatibility
            # (TODO confirm); both compute the same windows.
            if size == 3 or size == 6:
                tmp.append(self._make_stage_custom(features, out_features, size))
            else:
                tmp.append(self._make_stage(features, out_features, size))
        self.stages = nn.ModuleList(tmp)
        # self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
        self.bottleneck = nn.Sequential(
            nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
                      bias=False),
            InPlaceABNSync(out_features),
        )

    def _make_stage(self, features, out_features, size):
        # pool to (size, size) -> 1x1 projection -> in-place ABN
        prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
        conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
        bn = InPlaceABNSync(out_features)
        return nn.Sequential(prior, conv, bn)

    def _make_stage_custom(self, features, out_features, size):
        # Same pipeline but with the explicit-window pooling implementation.
        prior = CostomAdaptiveAvgPool2D(output_size=(size, size))
        conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
        bn = InPlaceABNSync(out_features)
        return nn.Sequential(prior, conv, bn)

    def forward(self, feats):
        h, w = feats.size(2), feats.size(3)
        # Upsample each pooled branch back to the input resolution, then
        # append the untouched input itself before fusing.
        priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
                  self.stages] + [feats]
        bottle = self.bottleneck(torch.cat(priors, 1))
        return bottle
168
+
169
+
170
class ASPPModule(nn.Module):
    """
    Reference:
        Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*

    Five parallel branches — global average pooling, a 1x1 conv, and three
    3x3 atrous convs — concatenated and fused by a 1x1 bottleneck.
    """

    def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
        super(ASPPModule, self).__init__()

        # Image-level branch: global pool -> 1x1 conv -> ABN.
        self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
                                   nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
                                             bias=False),
                                   InPlaceABNSync(inner_features))
        self.conv2 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(inner_features))
        # Atrous branches: padding == dilation keeps spatial size unchanged.
        self.conv3 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
            InPlaceABNSync(inner_features))
        self.conv4 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
            InPlaceABNSync(inner_features))
        self.conv5 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
            InPlaceABNSync(inner_features))

        self.bottleneck = nn.Sequential(
            nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(out_features),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, h, w = x.size()

        # Broadcast the 1x1 image-level feature back to full resolution.
        feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)

        feat2 = self.conv2(x)
        feat3 = self.conv3(x)
        feat4 = self.conv4(x)
        feat5 = self.conv5(x)
        out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)

        bottle = self.bottleneck(out)
        return bottle
215
+
216
+
217
class Edge_Module(nn.Module):
    """
    Edge Learning Branch

    Predicts an edge map from three backbone stages. Each stage is first
    projected to mid_fea channels; a single shared conv4 then produces a
    per-stage edge logit, and conv5 fuses the three logits.
    """

    def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
        super(Edge_Module, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(mid_fea)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(mid_fea)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(mid_fea)
        )
        # conv4 is deliberately applied to all three scales (shared weights).
        self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
        self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)

    def forward(self, x1, x2, x3):
        # All outputs are resized to x1's (highest) resolution.
        _, _, h, w = x1.size()

        edge1_fea = self.conv1(x1)
        edge1 = self.conv4(edge1_fea)
        edge2_fea = self.conv2(x2)
        edge2 = self.conv4(edge2_fea)
        edge3_fea = self.conv3(x3)
        edge3 = self.conv4(edge3_fea)

        edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
        edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
        edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
        edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)

        edge = torch.cat([edge1, edge2, edge3], dim=1)
        edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
        edge = self.conv5(edge)

        # Returns (fused edge logits, concatenated edge features for fusion).
        return edge, edge_fea
260
+
261
+
262
class Decoder_Module(nn.Module):
    """
    Parsing Branch Decoder Module.

    Upsamples the 512-channel context feature to the low-level feature's
    resolution, concatenates with a 48-channel projection of the low-level
    feature (256 + 48 = 304), refines, and predicts per-class logits.
    """

    def __init__(self, num_classes):
        super(Decoder_Module, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(256)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(48)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(256),
            nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(256)
        )

        self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)

    def forward(self, xt, xl):
        # xt: high-level context feature; xl: low-level backbone feature
        # whose spatial size defines the output resolution.
        _, _, h, w = xl.size()
        xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
        xl = self.conv2(xl)
        x = torch.cat([xt, xl], dim=1)
        x = self.conv3(x)
        seg = self.conv4(x)
        # Returns (class logits, refined 256-channel feature for fusion).
        return seg, x
294
+
295
+
296
class ResNet(nn.Module):
    """CE2P parsing network on a dilated ResNet backbone.

    Backbone: deep-stem (three 3x3 convs) ResNet whose layer4 uses
    dilation instead of stride (output stride 16). Heads: PSP context
    encoding, a decoder producing parsing logits, an edge branch, and a
    fusion head combining parsing and edge features.
    """

    def __init__(self, block, layers, num_classes):
        # Deep stem ends at 128 channels, so layer construction starts there.
        self.inplanes = 128
        super(ResNet, self).__init__()
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=False)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=False)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=False)

        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        # stride=1 + dilation=2 keeps layer4 at the same resolution as layer3.
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))

        self.context_encoding = PSPModule(2048, 512)

        self.edge = Edge_Module()
        self.decoder = Decoder_Module(num_classes)

        # NOTE(review): 'fushion' (sic) is kept — renaming the attribute
        # would change state_dict keys and break existing checkpoints.
        self.fushion = nn.Sequential(
            nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(256),
            nn.Dropout2d(0.1),
            nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
        )

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
        # 1x1 projection on the identity path when shape/stride changes.
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BatchNorm2d(planes * block.expansion, affine=affine_par))

        layers = []
        # Cycle through the multi-grid rates when a tuple is given; else 1.
        generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
        layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
                            multi_grid=generate_multi_grid(0, multi_grid)))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)
        x2 = self.layer1(x)
        x3 = self.layer2(x2)
        x4 = self.layer3(x3)
        x5 = self.layer4(x4)
        x = self.context_encoding(x5)
        parsing_result, parsing_fea = self.decoder(x, x2)
        # Edge Branch
        edge_result, edge_fea = self.edge(x2, x3, x4)
        # Fusion Branch
        x = torch.cat([parsing_fea, edge_fea], dim=1)
        fusion_result = self.fushion(x)
        # [[decoder logits, fused logits], edge logits]
        return [[parsing_result, fusion_result], edge_result]
365
+
366
+
367
def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
    """Attach preprocessing metadata to ``model`` and optionally load weights.

    Args:
        model: network whose state_dict keys match the checkpoint
            (minus the ImageNet ``fc.*`` classifier head).
        settings: dict providing input_space/input_size/input_range/mean/std.
        pretrained: checkpoint path, or None to skip weight loading.
    """
    model.input_space = settings['input_space']
    model.input_size = settings['input_size']
    model.input_range = settings['input_range']
    model.mean = settings['mean']
    model.std = settings['std']

    if pretrained is not None:
        saved_state_dict = torch.load(pretrained)
        new_params = model.state_dict().copy()
        # Copy every checkpoint entry except the ImageNet classifier head.
        # (Cleanup: was `if not i_parts[0] == 'fc'` plus a no-op
        # split/'.'.join round-trip of the key.)
        for key in saved_state_dict:
            if key.split('.')[0] != 'fc':
                new_params[key] = saved_state_dict[key]
        model.load_state_dict(new_params)
382
+
383
+
384
def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
    """Build the CE2P ResNet-101 parser, optionally loading ImageNet weights."""
    settings = pretrained_settings['resnet101']['imagenet']
    net = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
    initialize_pretrained_model(net, settings, pretrained)
    return net
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import absolute_import
2
+ from networks.AugmentCE2P import resnet101
3
+
4
+ __factory = {
5
+ 'resnet101': resnet101,
6
+ }
7
+
8
+
9
def init_model(name, *args, **kwargs):
    """Instantiate a registered backbone factory by name.

    Any extra positional/keyword arguments are forwarded to the factory.

    Raises:
        KeyError: if ``name`` is not a registered architecture.
    """
    # Membership test directly on the dict (no redundant .keys() call).
    if name not in __factory:
        raise KeyError("Unknown model arch: {}".format(name))
    return __factory[name](*args, **kwargs)
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : mobilenetv2.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import math
16
+ import functools
17
+
18
+ from modules import InPlaceABN, InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['mobilenetv2']
23
+
24
+
25
def conv_bn(inp, oup, stride):
    """3x3 conv (padding 1, no bias) -> BN -> ReLU6."""
    layers = [
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)
31
+
32
+
33
def conv_1x1_bn(inp, oup):
    """Pointwise 1x1 conv (no bias) -> BN -> ReLU6."""
    layers = [
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)
39
+
40
+
41
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted residual: expand (1x1) -> depthwise 3x3 -> project.

    The residual connection is used only when stride == 1 and the channel
    count is unchanged; with expand_ratio == 1 the expansion conv is omitted.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expand_ratio == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear (no activation after the projection)
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
80
+
81
+
82
class MobileNetV2(nn.Module):
    """MobileNetV2 classifier built from inverted residual blocks.

    ``width_mult`` scales the channel counts; ``input_size`` must be a
    multiple of 32. Normalization layers come from the module-level
    ``BatchNorm2d`` alias (a partial of InPlaceABNSync).
    """

    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        interverted_residual_setting = [
            # t, c, n, s  (expand ratio, channels, repeats, first stride)
            [1, 16, 1, 1],
            [6, 24, 2, 2],  # layer 2
            [6, 32, 3, 2],  # layer 3
            [6, 64, 4, 2],
            [6, 96, 3, 1],  # layer 4
            [6, 160, 3, 2],
            [6, 320, 1, 1],  # layer 5
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        self.features = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                # Only the first block of each group applies the stride.
                if i == 0:
                    self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
                else:
                    self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        self.features.append(conv_1x1_bn(input_channel, self.last_channel))
        # make it nn.Sequential
        self.features = nn.Sequential(*self.features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        # Global average pool over H and W before the classifier.
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Kaiming-style init scaled by fan-out.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            # Fix: was `isinstance(m, BatchNorm2d)`, but BatchNorm2d is a
            # functools.partial, not a class — isinstance() raises TypeError
            # on any non-Conv2d module, so construction always crashed.
            # All BN layers here are InPlaceABNSync instances.
            elif isinstance(m, InPlaceABNSync):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                n = m.weight.size(1)
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
146
+
147
+
148
def mobilenetv2(pretrained=False, **kwargs):
    """Constructs a MobileNet_V2 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = MobileNetV2(n_class=1000, **kwargs)
    if pretrained:
        # NOTE(review): neither `load_url` nor `model_urls` is defined or
        # imported in this file, so pretrained=True raises NameError as
        # written — confirm the intended source (torch.utils.model_zoo
        # .load_url plus a URL table) before relying on this branch.
        model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
    return model
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/resnet.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnet.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import torch.nn as nn
16
+ import math
17
+ from torch.utils.model_zoo import load_url
18
+
19
+ from modules import InPlaceABNSync
20
+
21
# InPlaceABNSync with no activation stands in for a plain BatchNorm2d.
# NOTE(review): this is a functools.partial, not a class, so it cannot be
# used as the second argument of isinstance().
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')

# Public API of this module.
__all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101']

# ImageNet-pretrained checkpoints from the MIT Scene Parsing project.
model_urls = {
    'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
    'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
    'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
}
30
+
31
+
32
def conv3x3(in_planes, out_planes, stride=1):
    """Return a 3x3 convolution with padding=1 and no bias."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )
36
+
37
+
38
class BasicBlock(nn.Module):
    """Two-3x3-conv residual block used by ResNet-18/34."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # Attribute names kept identical so pretrained state_dicts still load.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Identity shortcut, or a projection when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut
        return self.relu(out)
68
+
69
+
70
class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual bottleneck used by ResNet-50/101."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        # Attribute names kept identical so pretrained state_dicts still load.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Identity shortcut, or a projection when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += shortcut
        return self.relu(out)
107
+
108
+
109
class ResNet(nn.Module):
    """ResNet backbone with a "deep stem" (three stacked 3x3 convs, 3->64->
    64->128 channels) as used by the MIT Scene Parsing pretrained checkpoints.

    Args:
        block: residual block class (``BasicBlock`` or ``Bottleneck``).
        layers: number of blocks in each of the four stages.
        num_classes: size of the final fully-connected classifier.
    """

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 128
        super(ResNet, self).__init__()
        # Deep stem: 3x3/s2 -> 3x3 -> 3x3 instead of a single 7x7 conv.
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # He init with fan_out = k_h * k_w * out_channels.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            # BUGFIX: the original checked ``isinstance(m, BatchNorm2d)``, but
            # BatchNorm2d is a functools.partial, and isinstance() with a
            # partial raises ``TypeError: isinstance() arg 2 must be a type``
            # as soon as the loop reaches any non-Conv2d module.  Check the
            # underlying class instead.
            elif isinstance(m, InPlaceABNSync):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one residual stage of ``blocks`` blocks; the first block
        carries the stride and (if needed) a 1x1 projection shortcut."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Classifier head; assumes a 7x7 feature map at this point
        # (i.e. roughly 224x224 input) -- the 7x7 AvgPool is not adaptive.
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
173
+
174
+
175
def resnet18(pretrained=False, **kwargs):
    """Construct a ResNet-18 model.

    Args:
        pretrained (bool): when True, load ImageNet weights from
            ``model_urls['resnet18']`` (strict load).
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return model
    model.load_state_dict(load_url(model_urls['resnet18']))
    return model
184
+
185
+
186
def resnet50(pretrained=False, **kwargs):
    """Construct a ResNet-50 model.

    Args:
        pretrained (bool): when True, load ImageNet weights from
            ``model_urls['resnet50']`` non-strictly (missing/unexpected
            keys are tolerated).
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return model
    model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
    return model
195
+
196
+
197
def resnet101(pretrained=False, **kwargs):
    """Construct a ResNet-101 model.

    Args:
        pretrained (bool): when True, load ImageNet weights from
            ``model_urls['resnet101']`` non-strictly (missing/unexpected
            keys are tolerated).
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return model
    model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
    return model
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/backbone/resnext.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnext.py.py
8
+ @Time : 8/11/19 8:58 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import functools
14
+ import torch.nn as nn
15
+ import math
16
+ from torch.utils.model_zoo import load_url
17
+
18
+ from modules import InPlaceABNSync
19
+
20
# InPlaceABNSync with no activation stands in for a plain BatchNorm2d.
# NOTE(review): this is a functools.partial, not a class, so it cannot be
# used as the second argument of isinstance().
BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')

# Public API: only ResNeXt-101 is wired up in this module.
__all__ = ['ResNeXt', 'resnext101']

# ImageNet-pretrained checkpoints from the MIT Scene Parsing project.
model_urls = {
    'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
    'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
}
28
+
29
+
30
def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding=1."""
    return nn.Conv2d(in_planes, out_planes,
                     kernel_size=3,
                     stride=stride,
                     padding=1,
                     bias=False)
34
+
35
+
36
class GroupBottleneck(nn.Module):
    """ResNeXt bottleneck: 1x1 -> grouped 3x3 -> 1x1, with expansion 2."""

    expansion = 2

    def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
        super(GroupBottleneck, self).__init__()
        # Attribute names kept identical so pretrained state_dicts still load.
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, groups=groups, bias=False)
        self.bn2 = BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
        self.bn3 = BatchNorm2d(planes * 2)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Identity shortcut, or a projection when shapes differ.
        shortcut = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        out += shortcut
        return self.relu(out)
73
+
74
+
75
class ResNeXt(nn.Module):
    """ResNeXt backbone with a deep 3x3 stem, built from grouped bottlenecks.

    Args:
        block: residual block class (``GroupBottleneck``).
        layers: number of blocks in each of the four stages.
        groups: cardinality of the grouped 3x3 convolutions.
        num_classes: size of the final fully-connected classifier.
    """

    def __init__(self, block, layers, groups=32, num_classes=1000):
        self.inplanes = 128
        super(ResNeXt, self).__init__()
        # Deep stem: 3x3/s2 -> 3x3 -> 3x3, 3 -> 64 -> 64 -> 128 channels.
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
        self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
        self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
        self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(1024 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # He init; fan_out accounts for grouped convolutions.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
                m.weight.data.normal_(0, math.sqrt(2. / n))
            # BUGFIX: the original checked ``isinstance(m, BatchNorm2d)``, but
            # BatchNorm2d is a functools.partial, and isinstance() with a
            # partial raises ``TypeError: isinstance() arg 2 must be a type``
            # as soon as the loop reaches any non-Conv2d module.  Check the
            # underlying class instead.
            elif isinstance(m, InPlaceABNSync):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, groups=1):
        """Build one residual stage; the first block carries the stride and
        (if needed) a 1x1 projection shortcut."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, groups, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=groups))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Classifier head; assumes a 7x7 feature map at this point
        # (i.e. roughly 224x224 input) -- the 7x7 AvgPool is not adaptive.
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
139
+
140
+
141
def resnext101(pretrained=False, **kwargs):
    """Construct a ResNeXt-101 model.

    Args:
        pretrained (bool): when True, load pretrained weights from
            ``model_urls['resnext101']`` non-strictly.  (The original
            docstring said "ResNet-101" / "Places"; the url key is an
            ImageNet checkpoint.)
    """
    model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
    return model
src/multiview_consist_edit/parse_tool/preprocess/humanparsing/networks/context_encoding/aspp.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : aspp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
class ASPPModule(nn.Module):
    """Atrous Spatial Pyramid Pooling.

    Five parallel branches -- image-level pooling, a 1x1 conv, and three
    dilated 3x3 convs -- are concatenated and fused by a 1x1 bottleneck.

    Reference:
        Chen, Liang-Chieh, et al. "Rethinking Atrous Convolution for
        Semantic Image Segmentation."
    """

    def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
        super(ASPPModule, self).__init__()

        # Attribute names and Sequential layouts kept identical so
        # pretrained state_dicts still load.
        d1, d2, d3 = dilations
        # Branch 1: global context (pool to 1x1, then project).
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(features, inner_features, kernel_size=1, padding=0,
                      dilation=1, bias=False),
            InPlaceABNSync(inner_features))
        # Branch 2: plain 1x1 projection.
        self.conv2 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=1, padding=0,
                      dilation=1, bias=False),
            InPlaceABNSync(inner_features))
        # Branches 3-5: dilated 3x3 convs (padding == dilation keeps size).
        self.conv3 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=d1,
                      dilation=d1, bias=False),
            InPlaceABNSync(inner_features))
        self.conv4 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=d2,
                      dilation=d2, bias=False),
            InPlaceABNSync(inner_features))
        self.conv5 = nn.Sequential(
            nn.Conv2d(features, inner_features, kernel_size=3, padding=d3,
                      dilation=d3, bias=False),
            InPlaceABNSync(inner_features))

        # Fuse the concatenated branches down to ``out_features``.
        self.bottleneck = nn.Sequential(
            nn.Conv2d(inner_features * 5, out_features, kernel_size=1,
                      padding=0, dilation=1, bias=False),
            InPlaceABNSync(out_features),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        _, _, height, width = x.size()

        # Upsample the pooled branch back to the input's spatial size.
        global_feat = F.interpolate(self.conv1(x), size=(height, width),
                                    mode='bilinear', align_corners=True)
        stacked = torch.cat(
            (global_feat, self.conv2(x), self.conv3(x), self.conv4(x), self.conv5(x)),
            1)
        return self.bottleneck(stacked)