UniBioTransfer

Runtime error

App Files Files Community

scy639 committed on Apr 23

Commit

8c90776

verified ·

1 Parent(s): dcab600

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +9 -0
Dataset_custom.py +317 -0
LICENSE +23 -0
LatentDiffusion.yaml +83 -0
Mediapipe_Result_Cache.py +36 -0
MoE.py +141 -0
Other_dependencies/arcface/add.txt +1 -0
Other_dependencies/arcface/model_ir_se50.pth +3 -0
Other_dependencies/face_parsing/79999_iter.pth +3 -0
Other_dependencies/face_parsing/add.txt +1 -0
Other_dependencies/mp_models/blaze_face_short_range.tflite +3 -0
Other_dependencies/mp_models/face_landmarker_v2_with_blendshapes.task +3 -0
app.py +239 -0
checkpoints/pretrained.json +1072 -0
download_checkpoints.py +29 -0
eval_tool/lpips/__init__.py +0 -0
eval_tool/lpips/lpips.py +35 -0
eval_tool/lpips/networks.py +96 -0
eval_tool/lpips/utils.py +30 -0
examples/face/ref-semantic_mask.png +0 -0
examples/face/ref.png +3 -0
examples/face/tgt-semantic_mask.png +0 -0
examples/face/tgt.png +3 -0
examples/hair/ref-semantic_mask.png +0 -0
examples/hair/ref.png +3 -0
examples/hair/tgt-semantic_mask.png +0 -0
examples/hair/tgt.png +3 -0
examples/head/ref-semantic_mask.png +0 -0
examples/head/ref.png +3 -0
examples/head/tgt-semantic_mask.png +0 -0
examples/head/tgt.png +3 -0
examples/inputs.txt +5 -0
examples/motion/ref-semantic_mask.png +0 -0
examples/motion/ref.png +3 -0
examples/motion/tgt-semantic_mask.png +0 -0
examples/motion/tgt.png +3 -0
gen_lmk_and_mask.py +41 -0
gen_semantic_mask.py +90 -0
get_mask.py +68 -0
global_.py +9 -0
hf_model.py +247 -0
imports.py +8 -0
infer.py +366 -0
infer_hf.py +279 -0
init_model.py +178 -0
ldm/lr_scheduler.py +99 -0
ldm/models/autoencoder.py +443 -0
ldm/models/diffusion/__init__.py +0 -0
ldm/models/diffusion/bank.py +76 -0
ldm/models/diffusion/classifier.py +267 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Other_dependencies/mp_models/face_landmarker_v2_with_blendshapes.task filter=lfs diff=lfs merge=lfs -text
+examples/face/ref.png filter=lfs diff=lfs merge=lfs -text
+examples/face/tgt.png filter=lfs diff=lfs merge=lfs -text
+examples/hair/ref.png filter=lfs diff=lfs merge=lfs -text
+examples/hair/tgt.png filter=lfs diff=lfs merge=lfs -text
+examples/head/ref.png filter=lfs diff=lfs merge=lfs -text
+examples/head/tgt.png filter=lfs diff=lfs merge=lfs -text
+examples/motion/ref.png filter=lfs diff=lfs merge=lfs -text
+examples/motion/tgt.png filter=lfs diff=lfs merge=lfs -text

Dataset_custom.py ADDED Viewed

	@@ -0,0 +1,317 @@

+from imports import *
+from pathlib import Path
+import numpy as np
+import cv2
+from PIL import Image
+import torch
+import torch.utils.data as data
+import torchvision.transforms as T
+from einops import rearrange
+import albumentations
+from util_face import *
+from util_4dataset import *
+from util_cv2 import cv2_resize_auto_interpolation
+from Mediapipe_Result_Cache import Mediapipe_Result_Cache
+def resize_A(img, dataset_name, size=(512, 512), interpolation=None):
+    """Resize *img* to *size*, returning the same container type that was passed in.
+    Args:
+        img: a PIL image or an array exposing ``.shape`` (rows, cols first).
+        dataset_name: accepted for call-site compatibility; not used here.
+        size: target size forwarded to ``cv2_resize_auto_interpolation``.
+        interpolation: forwarded unchanged to ``cv2_resize_auto_interpolation``.
+    Returns:
+        The resized image; a PIL image if a PIL image was given, otherwise an array.
+    """
+    is_pil = isinstance(img, Image.Image)
+    if is_pil:
+        img = np.array(img)
+    target = tuple(size)
+    current = tuple(img.shape[:2])
+    # The resize used to be skipped whenever the image was 512x512, regardless of the
+    # requested size. Skip only when the image already matches `size`. Whether the
+    # helper reads `size` as (w, h) or (h, w) is not visible here, so the shortcut is
+    # taken only when both readings agree (square targets such as the default).
+    if current != target or current != target[::-1]:
+        img = cv2_resize_auto_interpolation(img, size, interpolation=interpolation)
+    if is_pil:
+        img = Image.fromarray(img)
+    return img
+def un_norm_clip(x1):
+    """Undo the per-channel normalisation on a copy of *x1* (input is left untouched).
+    Accepts a 3-D tensor (channels first) or a 4-D batch; a 3-D input is
+    temporarily given a leading batch axis and returned as 3-D again.
+    """
+    # (std, mean) per channel, in channel order 0, 1, 2
+    channel_stats = (
+        (0.26862954, 0.48145466),
+        (0.26130258, 0.4578275),
+        (0.27577711, 0.40821073),
+    )
+    out = x1 * 1.0  # multiplying by 1.0 yields a new tensor, so x1 is not modified
+    was_single = len(out.shape) == 3
+    if was_single:
+        out = out.unsqueeze(0)
+    for channel, (std, mean) in enumerate(channel_stats):
+        out[:, channel, :, :] = out[:, channel, :, :] * std + mean
+    return out.squeeze(0) if was_single else out
+def un_norm(x):
+    """Map values from the [-1, 1] convention to [0, 1]: add one, then halve."""
+    shifted = x + 1.0
+    return shifted / 2.0
+def _dilate(_mask, kernel_size, iterations):
+    """Dilate *_mask* with a square all-ones kernel and return the result as a bool array.
+    The mask is cast to uint8 for ``cv2.dilate`` and cast back to bool afterwards.
+    """
+    structuring_element = np.ones((kernel_size, kernel_size), np.uint8)
+    grown = cv2.dilate(_mask.astype(np.uint8), structuring_element, iterations=iterations)
+    return grown.astype(bool)
+def dilate_4_task0(sm_mask):
+    """Build a boolean mask from three dilated label groups of a semantic mask.
+    Each group of label ids is selected with ``np.isin``, dilated with its own
+    kernel size / iteration count, and the three results are OR-ed together.
+    """
+    labels = np.array(sm_mask)
+    # (label ids, kernel size, iterations) for each group
+    recipes = (
+        ([2, 3, 10, 5], 7, 1),
+        ([3, 10], 10, 3),
+        ([1], 7, 2),
+    )
+    first, second, third = (
+        _dilate(np.isin(labels, ids), kernel_size, iterations)
+        for ids, kernel_size, iterations in recipes
+    )
+    return first | second | third
+class Dataset_custom(data.Dataset):
+    """Inference-time dataset that pairs each target image with a reference image.
+    Item ``i`` is built from ``paths_tgt[i]`` and ``paths_ref[i]`` plus their semantic
+    masks (located through ``path_img_2_path_mask``). ``task`` selects which label ids
+    are used and which encoder inputs are emitted. Judging by the ``enInputs`` keys,
+    task 0 is face and task 1 is hair; tasks 2 and 3 are presumably motion and head
+    (TODO confirm against the caller).
+    """
+    # Mean/std passed to get_tensor() for the non-CLIP tensors.
+    mean = (0.5, 0.5, 0.5)
+    std = (0.5, 0.5, 0.5)
+    # Task ids that __init__ accepts.
+    SUPPORTED_TASKS = (0, 1, 2, 3)
+    def get_img4clip(
+        self,
+        img,
+        sm_mask,
+        preserve,
+        for_clip=True,
+        add_semantic_head=False,
+        mask_after_npisin=None,
+        for_inpaint512=False,
+    ):
+        """Mask *img* down to selected semantic regions and return ``(image_tensor, mask)``.
+        Args:
+            img: PIL image or ndarray (ndarrays are converted with ``Image.fromarray``).
+            sm_mask: semantic label mask for *img*.
+            preserve: label ids to keep; only used when *mask_after_npisin* is None
+                (and by ``add_colorSM`` when *add_semantic_head* is set).
+            for_clip: True selects ``get_tensor_clip`` and a 224 output; False selects
+                ``get_tensor(mean, std)`` and a 512 output.
+            add_semantic_head: route image and mask through ``add_colorSM`` first; the
+                returned mask is then rebuilt from the mask computed before that call.
+            mask_after_npisin: ready-made boolean mask that bypasses the label lookup.
+            for_inpaint512: for task 0, additionally removes the forehead region.
+        """
+        sm_mask = np.array(sm_mask)
+        if mask_after_npisin is None:
+            # dilate_4_task0() was a switched-off alternative here; only the plain
+            # label lookup is used.
+            mask = np.isin(sm_mask, preserve)
+            if self.task == 0 and for_inpaint512:
+                forehead_mask = get_forehead_mask(sm_mask)
+                mask = mask & ~forehead_mask
+        else:
+            mask = mask_after_npisin
+        if isinstance(img, np.ndarray):
+            img = Image.fromarray(img)
+        if add_semantic_head:
+            mask_before_colorSM = mask
+            img, mask = add_colorSM(img, sm_mask, preserve, None)
+        mask = mask_after_npisin__2__tensor(mask)
+        if for_clip:
+            image_tensor = get_tensor_clip()(img)
+        else:
+            image_tensor = get_tensor(mean=self.mean, std=self.std)(img)
+        # Mask at 512x512 first; the final size is produced further down.
+        image_tensor = T.Resize([512, 512])(image_tensor)
+        image_tensor = image_tensor * mask
+        # Round-trip through an un-normalised HWC array so albumentations can resize it.
+        if for_clip:
+            image_tensor = 255.0 * rearrange(un_norm_clip(image_tensor), "c h w -> h w c").cpu().numpy()
+            _size = 224
+        else:
+            image_tensor = 255.0 * rearrange(un_norm(image_tensor), "c h w -> h w c").cpu().numpy()
+            _size = 512
+        image_tensor = albumentations.Resize(height=_size, width=_size)(image=image_tensor)
+        image_tensor = Image.fromarray(image_tensor["image"].astype(np.uint8))
+        if for_clip:
+            image_tensor = get_tensor_clip()(image_tensor)
+        else:
+            image_tensor = get_tensor(mean=self.mean, std=self.std)(image_tensor)
+            # Re-apply the mask after re-normalising (only on the non-CLIP path).
+            image_tensor = image_tensor * mask
+        if add_semantic_head:
+            mask = mask_after_npisin__2__tensor(mask_before_colorSM)
+        return image_tensor, mask
+    def __init__(
+        self,
+        state,
+        task,
+        paths_tgt,
+        paths_ref,
+        name="custom",
+    ):
+        """Store the image path lists and per-task settings.
+        Args:
+            state: must be ``"test"``.
+            task: one of ``SUPPORTED_TASKS``.
+            paths_tgt: target image paths.
+            paths_ref: reference image paths, same length as *paths_tgt*.
+            name: dataset name, forwarded to ``resize_A``.
+        Raises:
+            ValueError: if *task* is not supported (previously this surfaced as an
+                UnboundLocalError a few lines later).
+            AssertionError: if *state* or the path lists are invalid.
+        """
+        if task not in self.SUPPORTED_TASKS:
+            raise ValueError(f"Unsupported task {task!r}; expected one of {self.SUPPORTED_TASKS}")
+        # Every supported task reads landmarks from the cache. The other per-task
+        # flags that used to be assigned here were never read and have been dropped.
+        READ_mediapipe_result_from_cache = 1
+        self.READ_mediapipe_result_from_cache = READ_mediapipe_result_from_cache
+        assert state == "test"
+        self.state = state
+        self.image_size = 512
+        self.kernel = np.ones((1, 1), np.uint8)
+        self.name = name
+        assert paths_tgt is not None and paths_ref is not None, "paths_tgt and paths_ref are required"
+        assert len(paths_tgt) == len(paths_ref), "paths_tgt and paths_ref must be the same length"
+        self.paths_tgt = list(paths_tgt)
+        self.paths_ref = list(paths_ref)
+        if READ_mediapipe_result_from_cache:
+            self.mediapipe_Result_Cache = Mediapipe_Result_Cache()
+        self.task = task
+    def __getitem__(self, index):
+        """Build the model inputs for the ``index``-th (target, reference) pair.
+        Returns:
+            ``(image_tensor_resize, prior_image_tensor, ret, out_stem)`` where ``ret``
+            is the dict of conditioning inputs and ``out_stem`` is
+            ``"<target stem>-<reference stem>"``.
+        Raises:
+            FileNotFoundError: if the reference image cannot be read by OpenCV.
+            RuntimeError: if no cached Mediapipe landmarks exist for the image.
+        """
+        task = self.task
+        path_tgt = self.paths_tgt[index]
+        path_ref = self.paths_ref[index]
+        img_tgt = Image.open(path_tgt).convert("RGB")
+        img_tgt = resize_A(img_tgt, self.name)
+        mask_path = path_img_2_path_mask(path_tgt)
+        # --- target mask: which label ids count as the region of interest ---
+        if self.task == 0:
+            preserve = [1, 2, 3, 10, 5, 6, 7, 9]
+            sm_mask_tgt = Image.open(mask_path).convert("L")
+            sm_mask_tgt = np.array(sm_mask_tgt)
+            mask_tgt = np.isin(sm_mask_tgt, preserve)
+            # The forehead is excluded from the task-0 region.
+            forehead_mask = get_forehead_mask(sm_mask_tgt)
+            mask_tgt = mask_tgt & ~forehead_mask
+        elif self.task == 1:
+            preserve = [4]
+            mask_tgt = path_img_2_mask(path_tgt, preserve)
+        elif self.task == 3:
+            preserve = [1, 2, 3, 10, 4, 5, 6, 7, 9]
+            mask_tgt = path_img_2_mask(path_tgt, preserve)
+        elif self.task == 2:
+            preserve = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21]
+            sm_mask_tgt = Image.open(mask_path).convert("L")
+            sm_mask_tgt = np.array(sm_mask_tgt)
+            mask_tgt = np.isin(sm_mask_tgt, preserve)
+        converted_mask = np.zeros_like(mask_tgt)
+        converted_mask[mask_tgt] = 255
+        mask_tgt = Image.fromarray(converted_mask).convert("L")
+        # Inverted here so that multiplying the image by it blanks the selected region.
+        mask_tensor = 1 - get_tensor(normalize=False, toTensor=True)(mask_tgt)
+        image_tensor = get_tensor(mean=self.mean, std=self.std)(img_tgt)
+        image_tensor_resize = T.Resize([self.image_size, self.image_size])(image_tensor)
+        mask_tensor_resize = T.Resize([self.image_size, self.image_size])(mask_tensor)
+        if task == 2:
+            # Task 2 keeps the whole target image.
+            inpaint_tensor_resize = image_tensor_resize
+        else:
+            inpaint_tensor_resize = image_tensor_resize * mask_tensor_resize
+        # Flip the mask back before it is returned as "inpaint_mask".
+        mask_tensor_resize = 1 - mask_tensor_resize
+        # --- reference image and its semantic mask ---
+        mask_path_ref = path_img_2_path_mask(path_ref)
+        sm_mask_ref = Image.open(mask_path_ref).convert("L")
+        sm_mask_ref = np.array(sm_mask_ref)
+        img_ref = cv2.imread(str(path_ref))
+        if img_ref is None:
+            # cv2.imread signals failure by returning None instead of raising.
+            raise FileNotFoundError(f"Could not read reference image: {path_ref}")
+        img_ref = cv2.cvtColor(img_ref, cv2.COLOR_BGR2RGB)
+        img_ref = resize_A(img_ref, self.name)
+        if task != 2:
+            ref_image_tensor, ref_mask_tensor = self.get_img4clip(
+                img_ref, sm_mask_ref, preserve, for_clip=True, add_semantic_head=0
+            )
+            if task == 3:
+                # Same label set as task 0, used only for the face-ID input.
+                ref_image_faceOnly_tensor, _ = self.get_img4clip(
+                    img_ref,
+                    sm_mask_ref,
+                    [1, 2, 3, 10, 5, 6, 7, 9],
+                    for_clip=False,
+                    add_semantic_head=0,
+                )
+        else:
+            ref_image_tensor = inpaint_tensor_resize
+        ret = {
+            "inpaint_image": inpaint_tensor_resize,
+            "inpaint_mask": mask_tensor_resize,
+            "ref_imgs": ref_image_tensor,
+            "task": self.task,
+        }
+        # --- per-task encoder inputs ---
+        if self.task == 0:
+            ret["enInputs"] = {
+                "face_ID-in": ref_image_tensor,
+                "face-clip-in": ref_image_tensor,
+            }
+        elif self.task == 1:
+            ret["enInputs"] = {
+                "hair-clip-in": ref_image_tensor,
+            }
+        elif self.task == 2:
+            tgt_nonBg_tensor, _ = self.get_img4clip(img_tgt, sm_mask_tgt, preserve)
+            ret["enInputs"] = {
+                "face_ID-in": tgt_nonBg_tensor,
+                "head-clip-in": tgt_nonBg_tensor,
+            }
+        elif self.task == 3:
+            ret["enInputs"] = {
+                "face_ID-in": ref_image_faceOnly_tensor,
+                "head-clip-in": ref_image_tensor,
+            }
+        if (REFNET.ENABLE and REFNET.task2layerNum[task] > 0) or CH14:
+            if task != 2:
+                ref_imgs_4unet, ref_mask_4unet = self.get_img4clip(
+                    img_ref, sm_mask_ref, preserve, for_clip=False, add_semantic_head=0
+                )
+            else:
+                # Task 2 feeds the unmasked target (all-ones mask) instead of the reference.
+                ref_imgs_4unet, ref_mask_4unet = self.get_img4clip(
+                    img_tgt,
+                    sm_mask_tgt,
+                    "any",
+                    for_clip=False,
+                    add_semantic_head=0,
+                    mask_after_npisin=np.ones_like(sm_mask_tgt).astype(bool),
+                )
+            ref_imgs_4unet = T.Resize([self.image_size, self.image_size])(ref_imgs_4unet)
+            ref_mask_512 = T.Resize([self.image_size, self.image_size])(ref_mask_4unet)
+            ret["ref_imgs_4unet"] = ref_imgs_4unet
+            ret["ref_mask_512"] = ref_mask_512
+        if self.READ_mediapipe_result_from_cache:
+            # In test state, task 2 takes its landmarks from the reference image;
+            # every other case uses the target image.
+            if self.state == "test":
+                if task == 2:
+                    _p_lmk = path_ref
+                else:
+                    _p_lmk = path_tgt
+            else:
+                _p_lmk = path_tgt
+            ret["mediapipe_lmkAll"] = self.mediapipe_Result_Cache.get(_p_lmk)
+            if ret["mediapipe_lmkAll"] is None:
+                raise RuntimeError(
+                    f"Missing Mediapipe cache for input image: {_p_lmk}. "
+                    "Precompute landmarks and ensure cache exists before inference."
+                )
+        if self.state == "test":
+            # The literal string "None" (not the None object) is returned as a placeholder.
+            prior_image_tensor = "None"
+            out_stem = f"{Path(path_tgt).stem}-{Path(path_ref).stem}"
+            if task == 2:
+                ref512, _ = self.get_img4clip(
+                    img_ref, sm_mask_ref, preserve, for_clip=False, add_semantic_head=0
+                )
+                ref512 = T.Resize([self.image_size, self.image_size])(ref512)
+                ret["ref512"] = ref512
+            ret = (image_tensor_resize, prior_image_tensor, ret, out_stem)
+        return ret
+    def __len__(self):
+        """Number of (target, reference) pairs."""
+        return len(self.paths_tgt)

LICENSE ADDED Viewed

	@@ -0,0 +1,23 @@

+MIT License
+Copyright (c) 2024 Sanoojan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

LatentDiffusion.yaml ADDED Viewed

	@@ -0,0 +1,83 @@

+model:
+  base_learning_rate: 4.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "inpaint"
+    cond_stage_key: "image"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: true   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    u_cond_percent: 0.2
+    scale_factor: 0.18215
+    use_ema: False
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-1 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+        add_conv_in_front_of_unet: False
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+      other_params:
+        clip_weight: 1.0
+        arcface_path: "Other_dependencies/arcface/model_ir_se50.pth"
+        multi_scale_ID: False # True was used for the previous training; there is an issue with that setting
+        Additional_config:
+          Reconstruct_initial: False # scy:
+          Target_CLIP_feat: True
+          Source_CLIP_feat: True
+          Reconstruct_DDIM_steps: 4

Mediapipe_Result_Cache.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from imports import *
+import json,random,os
+import numpy as np
+class Mediapipe_Result_Cache:
+    """
+    On-disk store of Mediapipe results, one ``.npy`` file per image.
+    Convention: when a cache entry exists, it must not be None.
+    None results are never cached; get/set both assert against None values.
+    """
+    DIR = Path('data/mediapipe_result')
+    def __init__(self):
+        """No per-instance state; all paths derive from the class-level DIR."""
+        pass
+    def get_path(self, img_path):
+        """Return the ``.npy`` cache path for *img_path*, creating its folder if missing.
+        The image's parent directory is flattened into a single folder name by
+        replacing every '/' with '|', so the parent must not already contain '|'.
+        """
+        source = Path(img_path)
+        flat_parent = str(source.parent)
+        assert '|' not in flat_parent
+        cache_folder = self.DIR / flat_parent.replace('/', '|')
+        cache_folder.mkdir(parents=True, exist_ok=True)
+        return cache_folder / (source.name + '.npy')
+    def get(self, img_path):
+        """Load the cached array for *img_path*; return None when no entry exists."""
+        cache_file = self.get_path(img_path)
+        if not cache_file.exists():
+            return None
+        cached = np.load(cache_file)
+        assert cached is not None
+        return cached
+    def set(self, img_path, lmks):
+        """Save *lmks* as the cache entry for *img_path*; *lmks* must not be None."""
+        assert lmks is not None
+        np.save(self.get_path(img_path), lmks)

MoE.py ADDED Viewed

	@@ -0,0 +1,141 @@

+from imports import *
+import global_
+import torch,copy
+import torch.nn as nn
+from ldm.modules.attention import FeedForward,CrossAttention
+from ldm.modules.diffusionmodules.openaimodel import UNetModel,ResBlock,TimestepEmbedSequential
+# import torch.nn.functional as F
+# ---------------- Configs ----------------
+# Module-level list, created empty; nothing in the visible part of this file
+# appends to it or reads it.
+CONV2D_PARAM_STATS = []
+def average_module_weight(src_modules: list):
+    """Average the state dicts of several modules, entry by entry.
+    Args:
+        src_modules: modules whose ``state_dict()`` share the keys of the first one.
+    Returns:
+        A dict mapping each key of the first module's state dict to the element-wise
+        mean over all modules, or None when *src_modules* is empty. Every entry keeps
+        its original dtype.
+    """
+    if not src_modules:
+        return None
+    state_dicts = [module.state_dict() for module in src_modules]
+    count = len(state_dicts)
+    avg_state_dict = {}
+    for key, reference in state_dicts[0].items():
+        # Integer/bool entries (e.g. BatchNorm's num_batches_tracked) cannot be divided
+        # in place: "/=" on such a tensor raises RuntimeError. Accumulate those in
+        # float64 and cast back; floating-point entries are summed in their own dtype.
+        if reference.is_floating_point() or reference.is_complex():
+            acc_dtype = reference.dtype
+        else:
+            acc_dtype = torch.float64
+        total = torch.zeros_like(reference, dtype=acc_dtype)
+        for state_dict in state_dicts:
+            total += state_dict[key]
+        total /= count
+        avg_state_dict[key] = total.to(reference.dtype)
+    return avg_state_dict
+class ModuleDict_W(nn.Module): # Wrapper of ModuleDict
+    """``nn.ModuleDict`` wrapper addressed by integer task id.
+    ``nn.ModuleDict`` only takes string keys, so ids are stored as ``str(int(id))``
+    internally while callers keep using integers.
+    """
+    def __init__(self, modules: list, keys: list):
+        """Register ``modules[i]`` under ``keys[i]``; both lists must have equal length."""
+        super().__init__()
+        assert len(keys) == len(modules), f"{len(keys)=} {len(modules)=}"
+        self._keys = [int(k) for k in keys]
+        self._moduleDict = nn.ModuleDict({str(int(k)): m for k, m in zip(self._keys, modules)})
+    def __getitem__(self, k: int):
+        """Return the module registered for task id *k*."""
+        _k = str(int(k))
+        return self._moduleDict[_k]
+    def keys(self):
+        """Return a fresh list of the integer task ids that still have a module."""
+        return list(self._keys)
+    def forward(self, *args, **kwargs):
+        """Call the module of the task currently set in ``global_.task``."""
+        cur_task = global_.task
+        assert cur_task in self._keys, f"Current task {cur_task} not in available tasks {self._keys}"
+        return self._moduleDict[str(int(cur_task))](*args, **kwargs)
+    def offload_unused_tasks(self, unused_tasks, method: str):
+        """Delete (``'del'``) or move to CPU (``'cpu'``) the modules of *unused_tasks*.
+        Task ids that are not registered are ignored.
+        Raises:
+            ValueError: if *method* is neither ``'del'`` nor ``'cpu'``.
+        """
+        for i in unused_tasks:
+            task_id = int(i)
+            _k = str(task_id)
+            if _k not in self._moduleDict:
+                continue
+            if method == 'del':
+                del self._moduleDict[_k]
+                # Keep keys() and forward()'s membership check in step with what exists.
+                self._keys = [k for k in self._keys if k != task_id]
+            elif method == 'cpu':
+                self._moduleDict[_k].to('cpu')
+            else:
+                # A bare `raise` here had no active exception to re-raise.
+                raise ValueError(f"Unknown offload method {method!r}; expected 'del' or 'cpu'")
+class TaskSpecific_MoE(nn.Module):
+    """Keeps one sub-module per task and runs the one selected by ``global_.task``."""
+    def __init__(
+        self,
+        module:nn.Module,# a single Module (deep-copied once per task) or a list with one Module per task
+        tasks:tuple,
+    ):
+        super().__init__()
+        self.cur_task = None
+        self.tasks = tasks
+        if not isinstance(module, (nn.Module, list)):
+            raise ValueError(f"got {type(module)}")
+        if isinstance(module, nn.Module):
+            # One independent copy per task, so the tasks do not share weights.
+            per_task_modules = [copy.deepcopy(module) for _ in self.tasks]
+        else:
+            assert len(module) == len(self.tasks), f"got {len(module)} and {len(self.tasks)}"
+            per_task_modules = module
+        self.tasks_2_module = ModuleDict_W(per_task_modules, self.tasks)
+    def forward(self, *args, **kwargs) -> torch.Tensor:
+        """Forward all arguments to the module of the task set in ``global_.task``."""
+        active_task = global_.task
+        assert active_task in self.tasks, f"Current task {active_task} not in available tasks {self.tasks}"
+        selected = self.tasks_2_module[active_task]
+        return selected(*args, **kwargs)
+    def set_task(self, task):
+        """Disabled: always fails the assertion; the task is read from ``global_.task``."""
+        assert 0, 'set_task is disabled for now; update to gg.task instead'
+        self.cur_task = task
+def is_task_specific_(name:str):
+    """Return True when parameter name *name* contains any task-specific marker substring."""
+    markers = (
+        '._moduleDict.',
+        'tasks_2_module',
+        'task_ffn',
+        'task_proj',
+        'task_conv',
+        'task_gate_mlps',
+        'task_lora',
+        'encoder_clip_',
+        'proj_out_source__',
+        'ID_proj_out',
+        'landmark_proj_out',
+        'learnable_vector',
+    )
+    return any(marker in name for marker in markers)
+def tp_param_need_sync(name: str, p: torch.nn.Parameter):
+    """Classify a parameter and return a pair of booleans.
+    The pair is presumably ``(needs_sync, is_task_specific)`` (TODO confirm with callers):
+    - task-specific names (per ``is_task_specific_``) give ``(False, True)``;
+    - names under the listed pretrained components, or parameters with
+      ``requires_grad`` False, give ``(False, False)``;
+    - everything else gives ``(True, False)``.
+    """
+    if is_task_specific_(name):
+        return False, True
+    excluded_components = (
+        'first_stage_model',
+        'face_ID_model',
+        'encoder_clip_face.tokenizer',
+        'encoder_clip_face.model',
+    )
+    # p.requires_grad is only consulted when no excluded component matches.
+    skipped = any(part in name for part in excluded_components) or not p.requires_grad
+    return (not skipped), False
+def offload_unused_tasks(parent: nn.Module, active_task: int, method: str, ):
+    """Walk *parent* recursively and offload the parameters of every inactive task.
+    Children whose class name is one of the known task-aware wrappers have their
+    per-task containers processed; all other children are recursed into.
+    *method* is ``'del'`` (drop the module) or ``'cpu'`` (move it to the CPU).
+    """
+    inactive_tasks = [candidate for candidate in TASKS if candidate != active_task]
+    task_aware_classes = (
+        'TaskSpecific_MoE',
+        'FFN_TaskSpecific_Plus_Shared',
+        'Linear_TaskSpecific_Plus_Shared',
+        'Conv_TaskSpecific_Plus_Shared',
+        'FFN_Shared_Plus_TaskLoRA',
+        'Linear_Shared_Plus_TaskLoRA',
+        'Conv_Shared_Plus_TaskLoRA',
+    )
+    # Attribute names under which those wrappers keep their per-task modules.
+    container_attrs = (
+        'tasks_2_module',
+        'task_ffn', 'task_proj', 'task_conv',
+        'task_lora_in', 'task_lora_out', 'task_lora',
+    )
+    for _name, sub in parent.named_children():
+        is_task_aware = hasattr(sub, '__class__') and sub.__class__.__name__ in task_aware_classes
+        if not is_task_aware:
+            offload_unused_tasks(sub, active_task, method)
+            continue
+        for attr in container_attrs:
+            if not hasattr(sub, attr):
+                continue
+            container = getattr(sub, attr)
+            if isinstance(container, nn.ModuleList):
+                # ModuleList containers are indexed directly by task id.
+                for task_idx in inactive_tasks:
+                    if method == 'del':
+                        container[task_idx] = None
+                    elif method == 'cpu':
+                        container[task_idx].to('cpu')
+                    else:
+                        raise Exception
+            elif isinstance(container, ModuleDict_W):
+                container.offload_unused_tasks(inactive_tasks, method)
+def offload_unused_tasks__LD(modelMOE, task_keep: int, method: str, ):
+    """Delete or offload the parameters of every task except *task_keep* to save CUDA memory.
+    *method* is ``'del'`` or ``'cpu'``; the work is delegated to ``offload_unused_tasks``.
+    """
+    offload_unused_tasks(parent=modelMOE, active_task=task_keep, method=method)

Other_dependencies/arcface/add.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Add arcface model

Other_dependencies/arcface/model_ir_se50.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a035c768259b98ab1ce0e646312f48b9e1e218197a0f80ac6765e88f8b6ddf28
+size 175367323

Other_dependencies/face_parsing/79999_iter.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:468e13ca13a9b43cc0881a9f99083a430e9c0a38abd935431d1c28ee94b26567
+size 53289463

Other_dependencies/face_parsing/add.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Add face parsing model

Other_dependencies/mp_models/blaze_face_short_range.tflite ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4578f35940bf5a1a655214a1cce5cab13eba73c1297cd78e1a04c2380b0152f
+size 229746

Other_dependencies/mp_models/face_landmarker_v2_with_blendshapes.task ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:64184e229b263107bc2b804c6625db1341ff2bb731874b0bcc2fe6544e0bc9ff
+size 3758596

app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+"""
+Hugging Face Space demo for UniBioTransfer.
+Gradio interface for face/hair/motion/head transfer.
+ZeroGPU Compatible:
+- Model initialized on CPU (no GPU memory during startup)
+- Inference wrapped with @spaces.GPU decorator
+- Thread-safe global variable access with Lock
+"""
+import threading
+import torch
+from PIL import Image
+import numpy as np
+# ==========================================
+# Compatibility layer: local testing vs. the HF ZeroGPU environment
+# ==========================================
+try:
+    import spaces
+    print("Detected spaces library (Hugging Face environment).")
+except ImportError:
+    print("Local environment detected. Mocking spaces.GPU...")
+    class spaces:
+        """Minimal local stand-in for the ``spaces`` package (only ``GPU`` is provided)."""
+        @staticmethod
+        def GPU(func=None, **_options):
+            """No-op replacement for the ``spaces.GPU`` decorator.
+            Supports both the bare form ``@spaces.GPU`` and the parameterised form
+            ``@spaces.GPU(duration=...)``; options are accepted and ignored, and the
+            decorated function is returned unchanged.
+            """
+            if func is None:
+                # Parameterised use: hand back a decorator that leaves the function as is.
+                return lambda wrapped: wrapped
+            return func
+from infer_hf import UniBioTransferPipeline
+# Lock and the process-wide singleton pipeline.
+# global_pipeline starts as None and is assigned on the first get_pipeline() call.
+inference_lock = threading.Lock()
+global_pipeline :UniBioTransferPipeline = None
+def get_pipeline(task):
+    """
+    Singleton accessor: the model is initialised only once (on the CPU); later calls
+    only switch the task on the existing pipeline.
+    The device is hard-coded to "cpu" so that global initialisation under ZeroGPU
+    never touches the GPU.
+    """
+    global global_pipeline
+    if global_pipeline is not None:
+        # The model is already in memory; switching the task id is all that is needed.
+        print(f"Switching existing pipeline to task: {task}")
+        global_pipeline.set_task(task)
+        return global_pipeline
+    print("Initializing pipeline once on CPU...")
+    global_pipeline = UniBioTransferPipeline.from_pretrained(
+        repo_id="scy639/UniBioTransfer",
+        task=task,
+        device="cpu",
+    )
+    return global_pipeline
+# Core: all forward-pass logic that needs the GPU is wrapped in this function
+@spaces.GPU
+def run_gpu_inference(pipeline:UniBioTransferPipeline, tgt_pil, ref_pil, ddim_steps, scale, seed, num_images):
+    """
+    Call *pipeline* on the target/reference pair and return its result.
+    Per the original author's note: this is where ZeroGPU allocates compute, so moving
+    to "cuda" is safe once inside this function; on a local server the decorator does
+    nothing, but the pipeline's own .to("cuda") still applies.
+    """
+    sampling_options = {
+        "ddim_steps": ddim_steps,
+        "scale": scale,
+        "seed": seed,
+        "num_images": num_images,
+    }
+    return pipeline(tgt_pil, ref_pil, **sampling_options)
+def inference(task, tgt_img, ref_img, ddim_steps, seed, num_images):
+    """
+    Run one transfer request for the demo.
+    Args:
+        task: task name chosen in the UI.
+        tgt_img, ref_img: input images as arrays accepted by ``Image.fromarray``;
+            either may be None when nothing was uploaded.
+        ddim_steps, seed, num_images: UI values, cast to int before use.
+    Returns:
+        ``(results, status_message)``; ``results`` is None when an input is missing
+        or when inference raised, in which case the message carries the traceback.
+    """
+    if tgt_img is None or ref_img is None:
+        return None, "Please upload both target and reference images."
+    try:
+        tgt_pil = Image.fromarray(tgt_img).convert("RGB")
+        ref_pil = Image.fromarray(ref_img).convert("RGB")
+        # get_pipeline() switches the task on the shared pipeline, so it must run under
+        # the same lock as the forward pass. Taken outside the lock, a concurrent
+        # request could switch the task between this request's switch and its run.
+        with inference_lock:
+            pipeline = get_pipeline(task)
+            results = run_gpu_inference(
+                pipeline,
+                tgt_pil,
+                ref_pil,
+                int(ddim_steps),
+                float(3),  # guidance scale is fixed at 3
+                int(seed),
+                int(num_images)
+            )
+        return results, f"Success! Task: {task} transfer completed."
+    except Exception as e:
+        # Top-level UI boundary: report the failure in the status box instead of crashing.
+        import traceback
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        print(f"{error_msg}")
+        return None, error_msg
+def create_demo():
+    """Create Gradio demo interface."""
+    import gradio as gr
+    with gr.Blocks(title="UniBioTransfer") as demo:
+        gr.Markdown(
+            """
+            # UniBioTransfer
+            Perform face transfer, hair transfer, motion transfer (face reenactment), and head transfer.
+            - **Face Transfer**: Transfer face identity from reference to target
+            - **Hair Transfer**: Transfer hairstyle from reference to target
+            - **Motion Transfer**: Transfer motion(expression+head pose) from reference to target
+            - **Head Transfer**: Transfer entire head from reference to target
+            [Code](https://github.com/scy639/UniBioTransfer)
+            [Project Page](https://scy639.github.io/UniBioTransfer.github.io/)
+            [Paper](https://arxiv.org/abs/2603.19637)
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                task_dropdown = gr.Dropdown(
+                    choices=["face", "hair", "motion", "head"],
+                    value="face",
+                    label="Task",
+                    info="Select the transfer type",
+                )
+                with gr.Row():
+                    tgt_image = gr.Image(
+                        label="Target Image",
+                        type="numpy",
+                        height=300,
+                    )
+                    ref_image = gr.Image(
+                        label="Reference Image",
+                        type="numpy",
+                        height=300,
+                    )
+                with gr.Row():
+                    ddim_steps = gr.Slider(
+                        minimum=4,
+                        maximum=50,
+                        value=50,
+                        step=1,
+                        label="DDIM Steps",
+                        info="More steps = better quality but slower",
+                    )
+                    # scale = gr.Slider(
+                    #     minimum=1.0,
+                    #     maximum=10.0,
+                    #     value=3.0,
+                    #     step=0.5,
+                    #     label="CFG Scale",
+                    #     info="Guidance scale for conditioning",
+                    # )
+                seed = gr.Number(
+                    value=42,
+                    label="Random Seed",
+                    info="For reproducibility",
+                )
+                num_images = gr.Slider(
+                    minimum=1,
+                    maximum=32,
+                    value=4,
+                    step=1,
+                    label="Number of output images",
+                    info="Multi-output with different initial noise",
+                )
+                run_btn = gr.Button("Run Inference", variant="primary")
+            with gr.Column():
+                output_gallery = gr.Gallery(
+                    label="Results",
+                    height=800,
+                    columns=2,
+                )
+                status_text = gr.Textbox(
+                    label="Status",
+                    lines=3,
+                )
+        gr.Markdown(
+"""
+### Usage
+1. Upload a **target image** (the person whose face/hair/motion/head will be modified)
+2. Upload a **reference image** (the source of the attribute to transfer)
+3. Select the **task** type
+4. Click "Run Inference"
+### Requirements
+- Works best when the heads in the two input images have similar sizes.
+"""
+        )
+        run_btn.click(
+            fn=inference,
+            inputs=[task_dropdown, tgt_image, ref_image, ddim_steps, seed, num_images],
+            outputs=[output_gallery, status_text],
+        )
+        task_dropdown.change(
+            fn=lambda t: f"Task switched to: {t} transfer",
+            inputs=[task_dropdown],
+            outputs=[status_text],
+        )
+        gr.Examples(
+            examples=[
+                ["face", "examples/face/tgt.png", "examples/face/ref.png",       20, 42, 4],
+                ["hair", "examples/hair/tgt.png", "examples/hair/ref.png",       20, 42, 4],
+                ["motion", "examples/motion/tgt.png", "examples/motion/ref.png", 20, 42, 4],
+                ["head", "examples/head/tgt.png", "examples/head/ref.png",       20, 42, 4],
+            ],
+            inputs=[task_dropdown, tgt_image, ref_image, ddim_steps, seed, num_images],
+            label="Examples",
+        )
+    return demo
+# Script entry point: build the demo with create_demo() and start it.
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()

checkpoints/pretrained.json ADDED Viewed

	@@ -0,0 +1,1072 @@

+{
+  ".model.diffusion_model.input_blocks.0.0": [
+    4,
+    4,
+    4,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.1.0.in_layers.2": [
+    5,
+    4,
+    8,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.1.0.out_layers.3": [
+    7,
+    4,
+    12,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.1.1.proj_in": [
+    4,
+    4,
+    6,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.1.1.transformer_blocks.0.ff": [
+    [
+      5,
+      4,
+      8,
+      4
+    ],
+    [
+      7,
+      4,
+      12,
+      4
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.1.1.proj_out": [
+    4,
+    4,
+    8,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.2.0.in_layers.2": [
+    14,
+    5,
+    19,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.2.0.out_layers.3": [
+    16,
+    4,
+    15,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.2.1.proj_in": [
+    9,
+    4,
+    11,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.2.1.transformer_blocks.0.ff": [
+    [
+      16,
+      4,
+      14,
+      4
+    ],
+    [
+      17,
+      4,
+      14,
+      4
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.2.1.proj_out": [
+    13,
+    4,
+    11,
+    4
+  ],
+  ".model.diffusion_model.input_blocks.3.0.op": [
+    26,
+    7,
+    31,
+    8
+  ],
+  ".model.diffusion_model.input_blocks.4.0.in_layers.2": [
+    23,
+    6,
+    31,
+    8
+  ],
+  ".model.diffusion_model.input_blocks.4.0.out_layers.3": [
+    27,
+    6,
+    37,
+    8
+  ],
+  ".model.diffusion_model.input_blocks.4.0.skip_connection": [
+    20,
+    6,
+    22,
+    6
+  ],
+  ".model.diffusion_model.input_blocks.4.1.proj_in": [
+    20,
+    6,
+    28,
+    7
+  ],
+  ".model.diffusion_model.input_blocks.4.1.transformer_blocks.0.ff": [
+    [
+      22,
+      6,
+      37,
+      8
+    ],
+    [
+      31,
+      8,
+      39,
+      10
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.4.1.proj_out": [
+    26,
+    8,
+    37,
+    10
+  ],
+  ".model.diffusion_model.input_blocks.5.0.in_layers.2": [
+    27,
+    10,
+    46,
+    11
+  ],
+  ".model.diffusion_model.input_blocks.5.0.out_layers.3": [
+    18,
+    6,
+    36,
+    7
+  ],
+  ".model.diffusion_model.input_blocks.5.1.proj_in": [
+    20,
+    7,
+    29,
+    7
+  ],
+  ".model.diffusion_model.input_blocks.5.1.transformer_blocks.0.ff": [
+    [
+      22,
+      7,
+      41,
+      9
+    ],
+    [
+      26,
+      10,
+      33,
+      12
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.5.1.proj_out": [
+    24,
+    9,
+    33,
+    10
+  ],
+  ".model.diffusion_model.input_blocks.6.0.op": [
+    52,
+    17,
+    76,
+    20
+  ],
+  ".model.diffusion_model.input_blocks.7.0.in_layers.2": [
+    50,
+    14,
+    80,
+    19
+  ],
+  ".model.diffusion_model.input_blocks.7.0.out_layers.3": [
+    56,
+    15,
+    90,
+    22
+  ],
+  ".model.diffusion_model.input_blocks.7.0.skip_connection": [
+    40,
+    13,
+    59,
+    16
+  ],
+  ".model.diffusion_model.input_blocks.7.1.proj_in": [
+    33,
+    12,
+    55,
+    14
+  ],
+  ".model.diffusion_model.input_blocks.7.1.transformer_blocks.0.ff": [
+    [
+      39,
+      11,
+      62,
+      13
+    ],
+    [
+      59,
+      17,
+      82,
+      21
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.7.1.proj_out": [
+    55,
+    17,
+    80,
+    22
+  ],
+  ".model.diffusion_model.input_blocks.8.0.in_layers.2": [
+    73,
+    20,
+    108,
+    27
+  ],
+  ".model.diffusion_model.input_blocks.8.0.out_layers.3": [
+    65,
+    15,
+    95,
+    21
+  ],
+  ".model.diffusion_model.input_blocks.8.1.proj_in": [
+    43,
+    13,
+    69,
+    18
+  ],
+  ".model.diffusion_model.input_blocks.8.1.transformer_blocks.0.ff": [
+    [
+      41,
+      10,
+      68,
+      13
+    ],
+    [
+      56,
+      17,
+      85,
+      21
+    ]
+  ],
+  ".model.diffusion_model.input_blocks.8.1.proj_out": [
+    52,
+    16,
+    78,
+    20
+  ],
+  ".model.diffusion_model.input_blocks.9.0.op": [
+    90,
+    30,
+    157,
+    39
+  ],
+  ".model.diffusion_model.input_blocks.10.0.in_layers.2": [
+    81,
+    21,
+    113,
+    26
+  ],
+  ".model.diffusion_model.input_blocks.10.0.out_layers.3": [
+    80,
+    21,
+    123,
+    28
+  ],
+  ".model.diffusion_model.input_blocks.11.0.in_layers.2": [
+    87,
+    23,
+    118,
+    28
+  ],
+  ".model.diffusion_model.input_blocks.11.0.out_layers.3": [
+    77,
+    20,
+    113,
+    26
+  ],
+  ".model.diffusion_model.middle_block.0.in_layers.2": [
+    84,
+    22,
+    113,
+    26
+  ],
+  ".model.diffusion_model.middle_block.0.out_layers.3": [
+    68,
+    16,
+    99,
+    21
+  ],
+  ".model.diffusion_model.middle_block.1.proj_in": [
+    36,
+    10,
+    59,
+    13
+  ],
+  ".model.diffusion_model.middle_block.1.transformer_blocks.0.ff": [
+    [
+      31,
+      5,
+      45,
+      6
+    ],
+    [
+      55,
+      15,
+      69,
+      17
+    ]
+  ],
+  ".model.diffusion_model.middle_block.1.proj_out": [
+    39,
+    10,
+    61,
+    14
+  ],
+  ".model.diffusion_model.middle_block.2.in_layers.2": [
+    73,
+    17,
+    104,
+    23
+  ],
+  ".model.diffusion_model.middle_block.2.out_layers.3": [
+    62,
+    15,
+    88,
+    20
+  ],
+  ".model.diffusion_model.output_blocks.0.0.in_layers.2": [
+    96,
+    25,
+    135,
+    32
+  ],
+  ".model.diffusion_model.output_blocks.0.0.out_layers.3": [
+    86,
+    21,
+    120,
+    28
+  ],
+  ".model.diffusion_model.output_blocks.0.0.skip_connection": [
+    64,
+    21,
+    106,
+    27
+  ],
+  ".model.diffusion_model.output_blocks.1.0.in_layers.2": [
+    94,
+    27,
+    155,
+    36
+  ],
+  ".model.diffusion_model.output_blocks.1.0.out_layers.3": [
+    86,
+    24,
+    136,
+    31
+  ],
+  ".model.diffusion_model.output_blocks.1.0.skip_connection": [
+    72,
+    23,
+    115,
+    29
+  ],
+  ".model.diffusion_model.output_blocks.2.0.in_layers.2": [
+    84,
+    31,
+    164,
+    39
+  ],
+  ".model.diffusion_model.output_blocks.2.0.out_layers.3": [
+    42,
+    19,
+    123,
+    29
+  ],
+  ".model.diffusion_model.output_blocks.2.0.skip_connection": [
+    72,
+    24,
+    110,
+    28
+  ],
+  ".model.diffusion_model.output_blocks.2.1.conv": [
+    72,
+    25,
+    121,
+    29
+  ],
+  ".model.diffusion_model.output_blocks.3.0.in_layers.2": [
+    85,
+    31,
+    158,
+    38
+  ],
+  ".model.diffusion_model.output_blocks.3.0.out_layers.3": [
+    42,
+    21,
+    117,
+    25
+  ],
+  ".model.diffusion_model.output_blocks.3.0.skip_connection": [
+    71,
+    23,
+    111,
+    28
+  ],
+  ".model.diffusion_model.output_blocks.3.1.proj_in": [
+    42,
+    14,
+    73,
+    18
+  ],
+  ".model.diffusion_model.output_blocks.3.1.transformer_blocks.0.ff": [
+    [
+      37,
+      10,
+      68,
+      13
+    ],
+    [
+      60,
+      18,
+      83,
+      20
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.3.1.proj_out": [
+    51,
+    18,
+    79,
+    21
+  ],
+  ".model.diffusion_model.output_blocks.4.0.in_layers.2": [
+    104,
+    32,
+    159,
+    40
+  ],
+  ".model.diffusion_model.output_blocks.4.0.out_layers.3": [
+    83,
+    24,
+    125,
+    29
+  ],
+  ".model.diffusion_model.output_blocks.4.0.skip_connection": [
+    73,
+    22,
+    101,
+    28
+  ],
+  ".model.diffusion_model.output_blocks.4.1.proj_in": [
+    49,
+    15,
+    77,
+    20
+  ],
+  ".model.diffusion_model.output_blocks.4.1.transformer_blocks.0.ff": [
+    [
+      38,
+      11,
+      70,
+      14
+    ],
+    [
+      63,
+      16,
+      85,
+      20
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.4.1.proj_out": [
+    51,
+    18,
+    81,
+    21
+  ],
+  ".model.diffusion_model.output_blocks.5.0.in_layers.2": [
+    91,
+    33,
+    161,
+    40
+  ],
+  ".model.diffusion_model.output_blocks.5.0.out_layers.3": [
+    83,
+    26,
+    140,
+    32
+  ],
+  ".model.diffusion_model.output_blocks.5.0.skip_connection": [
+    81,
+    24,
+    116,
+    30
+  ],
+  ".model.diffusion_model.output_blocks.5.1.proj_in": [
+    48,
+    16,
+    82,
+    21
+  ],
+  ".model.diffusion_model.output_blocks.5.1.transformer_blocks.0.ff": [
+    [
+      34,
+      12,
+      76,
+      15
+    ],
+    [
+      55,
+      16,
+      81,
+      18
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.5.1.proj_out": [
+    57,
+    19,
+    85,
+    22
+  ],
+  ".model.diffusion_model.output_blocks.5.2.conv": [
+    108,
+    34,
+    159,
+    41
+  ],
+  ".model.diffusion_model.output_blocks.6.0.in_layers.2": [
+    55,
+    18,
+    87,
+    22
+  ],
+  ".model.diffusion_model.output_blocks.6.0.out_layers.3": [
+    32,
+    13,
+    54,
+    15
+  ],
+  ".model.diffusion_model.output_blocks.6.0.skip_connection": [
+    25,
+    9,
+    30,
+    14
+  ],
+  ".model.diffusion_model.output_blocks.6.1.proj_in": [
+    26,
+    9,
+    40,
+    11
+  ],
+  ".model.diffusion_model.output_blocks.6.1.transformer_blocks.0.ff": [
+    [
+      25,
+      8,
+      47,
+      12
+    ],
+    [
+      36,
+      11,
+      47,
+      13
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.6.1.proj_out": [
+    23,
+    10,
+    38,
+    12
+  ],
+  ".model.diffusion_model.output_blocks.7.0.in_layers.2": [
+    55,
+    18,
+    82,
+    20
+  ],
+  ".model.diffusion_model.output_blocks.7.0.out_layers.3": [
+    47,
+    14,
+    65,
+    17
+  ],
+  ".model.diffusion_model.output_blocks.7.0.skip_connection": [
+    40,
+    11,
+    40,
+    12
+  ],
+  ".model.diffusion_model.output_blocks.7.1.proj_in": [
+    27,
+    9,
+    41,
+    11
+  ],
+  ".model.diffusion_model.output_blocks.7.1.transformer_blocks.0.ff": [
+    [
+      27,
+      8,
+      47,
+      11
+    ],
+    [
+      34,
+      11,
+      47,
+      12
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.7.1.proj_out": [
+    33,
+    9,
+    39,
+    12
+  ],
+  ".model.diffusion_model.output_blocks.8.0.in_layers.2": [
+    58,
+    17,
+    82,
+    20
+  ],
+  ".model.diffusion_model.output_blocks.8.0.out_layers.3": [
+    56,
+    15,
+    75,
+    18
+  ],
+  ".model.diffusion_model.output_blocks.8.0.skip_connection": [
+    44,
+    10,
+    47,
+    11
+  ],
+  ".model.diffusion_model.output_blocks.8.1.proj_in": [
+    32,
+    9,
+    43,
+    10
+  ],
+  ".model.diffusion_model.output_blocks.8.1.transformer_blocks.0.ff": [
+    [
+      28,
+      7,
+      47,
+      8
+    ],
+    [
+      35,
+      8,
+      45,
+      8
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.8.1.proj_out": [
+    35,
+    10,
+    44,
+    10
+  ],
+  ".model.diffusion_model.output_blocks.8.2.conv": [
+    65,
+    19,
+    85,
+    22
+  ],
+  ".model.diffusion_model.output_blocks.9.0.in_layers.2": [
+    37,
+    10,
+    35,
+    10
+  ],
+  ".model.diffusion_model.output_blocks.9.0.out_layers.3": [
+    28,
+    6,
+    23,
+    5
+  ],
+  ".model.diffusion_model.output_blocks.9.0.skip_connection": [
+    15,
+    4,
+    4,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.9.1.proj_in": [
+    16,
+    4,
+    6,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.9.1.transformer_blocks.0.ff": [
+    [
+      24,
+      5,
+      23,
+      5
+    ],
+    [
+      23,
+      5,
+      24,
+      6
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.9.1.proj_out": [
+    16,
+    4,
+    14,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.10.0.in_layers.2": [
+    31,
+    9,
+    38,
+    10
+  ],
+  ".model.diffusion_model.output_blocks.10.0.out_layers.3": [
+    20,
+    4,
+    24,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.10.0.skip_connection": [
+    4,
+    4,
+    7,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.10.1.proj_in": [
+    6,
+    4,
+    11,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.10.1.transformer_blocks.0.ff": [
+    [
+      17,
+      4,
+      21,
+      4
+    ],
+    [
+      17,
+      5,
+      21,
+      5
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.10.1.proj_out": [
+    9,
+    4,
+    12,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.11.0.in_layers.2": [
+    7,
+    4,
+    18,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.11.0.out_layers.3": [
+    16,
+    6,
+    22,
+    5
+  ],
+  ".model.diffusion_model.output_blocks.11.0.skip_connection": [
+    4,
+    4,
+    4,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.11.1.proj_in": [
+    9,
+    4,
+    13,
+    4
+  ],
+  ".model.diffusion_model.output_blocks.11.1.transformer_blocks.0.ff": [
+    [
+      19,
+      4,
+      24,
+      4
+    ],
+    [
+      12,
+      4,
+      14,
+      4
+    ]
+  ],
+  ".model.diffusion_model.output_blocks.11.1.proj_out": [
+    7,
+    4,
+    10,
+    4
+  ],
+  ".model.diffusion_model.out.2": [
+    4,
+    4,
+    4,
+    4
+  ],
+  ".model.diffusion_model_refNet.input_blocks.0.0": [
+    4,
+    4,
+    4,
+    4
+  ],
+  ".model.diffusion_model_refNet.input_blocks.1.0.in_layers.2": [
+    17,
+    8,
+    26,
+    8
+  ],
+  ".model.diffusion_model_refNet.input_blocks.1.0.out_layers.3": [
+    21,
+    14,
+    37,
+    12
+  ],
+  ".model.diffusion_model_refNet.input_blocks.1.1.proj_in": [
+    11,
+    8,
+    19,
+    6
+  ],
+  ".model.diffusion_model_refNet.input_blocks.1.1.transformer_blocks.0.ff": [
+    [
+      14,
+      12,
+      24,
+      7
+    ],
+    [
+      17,
+      12,
+      26,
+      7
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.1.1.proj_out": [
+    11,
+    7,
+    20,
+    5
+  ],
+  ".model.diffusion_model_refNet.input_blocks.2.0.in_layers.2": [
+    27,
+    15,
+    40,
+    13
+  ],
+  ".model.diffusion_model_refNet.input_blocks.2.0.out_layers.3": [
+    26,
+    15,
+    38,
+    12
+  ],
+  ".model.diffusion_model_refNet.input_blocks.2.1.proj_in": [
+    15,
+    7,
+    21,
+    6
+  ],
+  ".model.diffusion_model_refNet.input_blocks.2.1.transformer_blocks.0.ff": [
+    [
+      17,
+      13,
+      30,
+      9
+    ],
+    [
+      16,
+      12,
+      27,
+      8
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.2.1.proj_out": [
+    12,
+    7,
+    18,
+    6
+  ],
+  ".model.diffusion_model_refNet.input_blocks.3.0.op": [
+    27,
+    13,
+    43,
+    12
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.0.in_layers.2": [
+    30,
+    19,
+    49,
+    14
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.0.out_layers.3": [
+    32,
+    26,
+    55,
+    15
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.0.skip_connection": [
+    22,
+    10,
+    30,
+    9
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.1.proj_in": [
+    22,
+    14,
+    35,
+    10
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.1.transformer_blocks.0.ff": [
+    [
+      26,
+      25,
+      52,
+      14
+    ],
+    [
+      28,
+      22,
+      51,
+      14
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.4.1.proj_out": [
+    24,
+    15,
+    40,
+    11
+  ],
+  ".model.diffusion_model_refNet.input_blocks.5.0.in_layers.2": [
+    44,
+    30,
+    78,
+    22
+  ],
+  ".model.diffusion_model_refNet.input_blocks.5.0.out_layers.3": [
+    28,
+    29,
+    56,
+    15
+  ],
+  ".model.diffusion_model_refNet.input_blocks.5.1.proj_in": [
+    20,
+    13,
+    34,
+    9
+  ],
+  ".model.diffusion_model_refNet.input_blocks.5.1.transformer_blocks.0.ff": [
+    [
+      26,
+      27,
+      52,
+      14
+    ],
+    [
+      23,
+      23,
+      53,
+      14
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.5.1.proj_out": [
+    17,
+    14,
+    36,
+    10
+  ],
+  ".model.diffusion_model_refNet.input_blocks.6.0.op": [
+    46,
+    31,
+    82,
+    21
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.0.in_layers.2": [
+    75,
+    41,
+    116,
+    32
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.0.out_layers.3": [
+    67,
+    50,
+    108,
+    29
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.0.skip_connection": [
+    31,
+    19,
+    59,
+    15
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.1.proj_in": [
+    36,
+    29,
+    73,
+    19
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.1.transformer_blocks.0.ff": [
+    [
+      74,
+      61,
+      106,
+      26
+    ],
+    [
+      63,
+      49,
+      90,
+      24
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.7.1.proj_out": [
+    34,
+    29,
+    68,
+    18
+  ],
+  ".model.diffusion_model_refNet.input_blocks.8.0.in_layers.2": [
+    92,
+    56,
+    128,
+    36
+  ],
+  ".model.diffusion_model_refNet.input_blocks.8.0.out_layers.3": [
+    43,
+    51,
+    66,
+    16
+  ],
+  ".model.diffusion_model_refNet.input_blocks.8.1.proj_in": [
+    26,
+    28,
+    59,
+    15
+  ],
+  ".model.diffusion_model_refNet.input_blocks.8.1.transformer_blocks.0.ff": [
+    [
+      188,
+      69,
+      232,
+      69
+    ],
+    [
+      140,
+      51,
+      173,
+      51
+    ]
+  ],
+  ".model.diffusion_model_refNet.input_blocks.8.1.proj_out": [
+    91,
+    33,
+    113,
+    33
+  ]
+}

download_checkpoints.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from pathlib import Path
+import os
+from imports import *
+def _download(repo_id, filename, local_path: Path) -> Path:
+    local_path = Path(local_path)
+    from huggingface_hub import hf_hub_download
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+    print(f"downloading to {local_path}")
+    downloaded = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir=str(local_path.parent),
+        local_dir_use_symlinks=False,
+        token=token,
+    )
+_download("CompVis/stable-diffusion-v-1-4-original",SD14_filename, SD14_localpath)
+_download("scy639/UniBioTransfer",PRETRAIN_CKPT_PATH, ".")
+_download("scy639/UniBioTransfer",PRETRAIN_JSON_PATH, ".")
+_download("scy639/UniBioTransfer","Other_dependencies/arcface/model_ir_se50.pth", ".")
+_download("scy639/UniBioTransfer","Other_dependencies/face_parsing/79999_iter.pth", ".")

eval_tool/lpips/__init__.py ADDED Viewed

File without changes

eval_tool/lpips/lpips.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import torch
+import torch.nn as nn
+from eval_tool.lpips.networks import get_network, LinLayers
+from eval_tool.lpips.utils import get_state_dict
+class LPIPS(nn.Module):
+    r"""Creates a criterion that measures
+    Learned Perceptual Image Patch Similarity (LPIPS).
+    Arguments:
+        net_type (str): the network type to compare the features:
+                        'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
+        version (str): the version of LPIPS. Default: 0.1.
+    """
+    def __init__(self, net_type: str = 'alex', version: str = '0.1'):
+        assert version in ['0.1'], 'v0.1 is only supported now'
+        super(LPIPS, self).__init__()
+        # pretrained network
+        self.net = get_network(net_type)
+        # linear layers
+        self.lin = LinLayers(self.net.n_channels_list)
+        self.lin.load_state_dict(get_state_dict(net_type, version))
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        feat_x, feat_y = self.net(x), self.net(y)
+        diff = [(fx - fy) ** 2 for fx, fy in zip(feat_x, feat_y)]
+        res = [l(d).mean((2, 3), True) for d, l in zip(diff, self.lin)]
+        return torch.sum(torch.cat(res, 0)) / x.shape[0]

eval_tool/lpips/networks.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from typing import Sequence
+from itertools import chain
+import torch
+import torch.nn as nn
+from torchvision import models
+from eval_tool.lpips.utils import normalize_activation
+def get_network(net_type: str):
+    if net_type == 'alex':
+        return AlexNet()
+    elif net_type == 'squeeze':
+        return SqueezeNet()
+    elif net_type == 'vgg':
+        return VGG16()
+    else:
+        raise NotImplementedError('choose net_type from [alex, squeeze, vgg].')
+class LinLayers(nn.ModuleList):
+    def __init__(self, n_channels_list: Sequence[int]):
+        super(LinLayers, self).__init__([
+            nn.Sequential(
+                nn.Identity(),
+                nn.Conv2d(nc, 1, 1, 1, 0, bias=False)
+            ) for nc in n_channels_list
+        ])
+        for param in self.parameters():
+            param.requires_grad = False
+class BaseNet(nn.Module):
+    def __init__(self):
+        super(BaseNet, self).__init__()
+        # register buffer
+        self.register_buffer(
+            'mean', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
+        self.register_buffer(
+            'std', torch.Tensor([.458, .448, .450])[None, :, None, None])
+    def set_requires_grad(self, state: bool):
+        for param in chain(self.parameters(), self.buffers()):
+            param.requires_grad = state
+    def z_score(self, x: torch.Tensor):
+        return (x - self.mean) / self.std
+    def forward(self, x: torch.Tensor):
+        x = self.z_score(x)
+        output = []
+        for i, (_, layer) in enumerate(self.layers._modules.items(), 1):
+            x = layer(x)
+            if i in self.target_layers:
+                output.append(normalize_activation(x))
+            if len(output) == len(self.target_layers):
+                break
+        return output
+class SqueezeNet(BaseNet):
+    def __init__(self):
+        super(SqueezeNet, self).__init__()
+        self.layers = models.squeezenet1_1(True).features
+        self.target_layers = [2, 5, 8, 10, 11, 12, 13]
+        self.n_channels_list = [64, 128, 256, 384, 384, 512, 512]
+        self.set_requires_grad(False)
+class AlexNet(BaseNet):
+    def __init__(self):
+        super(AlexNet, self).__init__()
+        self.layers = models.alexnet(True).features
+        self.target_layers = [2, 5, 8, 10, 12]
+        self.n_channels_list = [64, 192, 384, 256, 256]
+        self.set_requires_grad(False)
+class VGG16(BaseNet):
+    def __init__(self):
+        super(VGG16, self).__init__()
+        self.layers = models.vgg16(True).features
+        self.target_layers = [4, 9, 16, 23, 30]
+        self.n_channels_list = [64, 128, 256, 512, 512]
+        self.set_requires_grad(False)

eval_tool/lpips/utils.py ADDED Viewed

	@@ -0,0 +1,30 @@

+from collections import OrderedDict
+import torch
+def normalize_activation(x, eps=1e-10):
+    norm_factor = torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True)+1e-16)  #
+    return x / (norm_factor + eps)
+def get_state_dict(net_type: str = 'alex', version: str = '0.1'):
+    # build url
+    url = 'https://raw.githubusercontent.com/richzhang/PerceptualSimilarity/' \
+        + f'master/lpips/weights/v{version}/{net_type}.pth'
+    # download
+    old_state_dict = torch.hub.load_state_dict_from_url(
+        url, progress=True,
+        map_location=None if torch.cuda.is_available() else torch.device('cpu')
+    )
+    # rename keys
+    new_state_dict = OrderedDict()
+    for key, val in old_state_dict.items():
+        new_key = key
+        new_key = new_key.replace('lin', '')
+        new_key = new_key.replace('model.', '')
+        new_state_dict[new_key] = val
+    return new_state_dict

examples/face/ref-semantic_mask.png ADDED Viewed

examples/face/ref.png ADDED Viewed

Git LFS Details

SHA256: a477d2f5928b4ab40046fdcd7a0b9d4f35d619822eccd4137396fc06dbb82b48
Pointer size: 131 Bytes
Size of remote file: 399 kB

examples/face/tgt-semantic_mask.png ADDED Viewed

examples/face/tgt.png ADDED Viewed

Git LFS Details

SHA256: dea3592ab41c766b8d1ba041eda3b545871f1684528bff5c40321a9fbd7c8546
Pointer size: 131 Bytes
Size of remote file: 410 kB

examples/hair/ref-semantic_mask.png ADDED Viewed

examples/hair/ref.png ADDED Viewed

Git LFS Details

SHA256: 946981b5a077df22a393d6e1ebb1bdef73c020f25e99339f732345777ae6565c
Pointer size: 131 Bytes
Size of remote file: 435 kB

examples/hair/tgt-semantic_mask.png ADDED Viewed

examples/hair/tgt.png ADDED Viewed

Git LFS Details

SHA256: daa1c69651861183fe113995abb20192fafe829a7b1a349c2ccc2713d7b057b4
Pointer size: 131 Bytes
Size of remote file: 399 kB

examples/head/ref-semantic_mask.png ADDED Viewed

examples/head/ref.png ADDED Viewed

Git LFS Details

SHA256: ff89b38ec94ee110a8760c6bb6b316c8ad2f4502a14aec1d217305e0ca2dfa47
Pointer size: 131 Bytes
Size of remote file: 440 kB

examples/head/tgt-semantic_mask.png ADDED Viewed

examples/head/tgt.png ADDED Viewed

Git LFS Details

SHA256: 9467c48978020761d76df2e133808f490f2eacb359f5fda61d08017a77b20151
Pointer size: 131 Bytes
Size of remote file: 336 kB

examples/inputs.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+target_path_1 reference_path_1
+target_path_2 reference_path_2
+target_path_3 reference_path_3
+target_path_4 reference_path_4
+target_path_5 reference_path_5

examples/motion/ref-semantic_mask.png ADDED Viewed

examples/motion/ref.png ADDED Viewed

Git LFS Details

SHA256: 8b58a80c13e5741072b6c603f7edd61ba3e3c9456536064b0d4746f4bab9c786
Pointer size: 131 Bytes
Size of remote file: 424 kB

examples/motion/tgt-semantic_mask.png ADDED Viewed

examples/motion/tgt.png ADDED Viewed

Git LFS Details

SHA256: 6e527760e591e97ab36892ee683f91673a678a8b23b7603779d430cfbcc0e5f3
Pointer size: 131 Bytes
Size of remote file: 427 kB

gen_lmk_and_mask.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# When False, gen_lmk_and_mask re-extracts landmarks even if a cache file exists.
+ENABLE_lmk_cache = False
+# Passed as ``reuse_if_exists`` to path_img_2_path_mask in gen_lmk_and_mask.
+ENABLE_mask_cache = False
+import cv2
+from imports import *
+from util_cv2 import cv2_resize_auto_interpolation
+from Mediapipe_Result_Cache import Mediapipe_Result_Cache
+from lmk_util.lmk_extractor import LandmarkExtractor
+def gen_lmk_and_mask(img_paths, size=512, write_cache=True):
+    extractor = LandmarkExtractor()
+    cache = Mediapipe_Result_Cache()
+    seen = set()
+    for p in img_paths:
+        if not p:
+            continue
+        p = str(p)
+        if p in seen:
+            continue
+        seen.add(p)
+        cache_path = cache.get_path(p)
+        if not  ( cache_path.exists() and ENABLE_lmk_cache ):
+            img = cv2.imread(p)
+            if img is None:
+                print(f"cv2.imread failed: {p}")
+                raise
+                continue
+            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+            img = cv2_resize_auto_interpolation(img, (size, size))
+            lmks = extractor.extract_single(img)
+            if lmks is None:
+                print(f"no lmks: {p}")
+                raise
+                continue
+            if write_cache:
+                cache.set(p, lmks)
+        path_img_2_path_mask(p, reuse_if_exists=ENABLE_mask_cache, label_mode="RF12_")

gen_semantic_mask.py ADDED Viewed

	@@ -0,0 +1,90 @@

+"""
+def:
+    tgt: Target image to be edited (face swapped)
+    ref: Face ID source image (also called src in REFace)
+    swap: Swapped output image, using face ID from ref to replace face in tgt
+"""
+import os
+from pathlib import Path
+from tqdm import tqdm
+from my_py_lib.image_util import print_image_statistics
+import torch
+import torchvision
+from PIL import Image
+import numpy as np
+from einops import rearrange
+from torchvision.transforms import Resize
+from torchvision.utils import make_grid
+from contextlib import nullcontext
+from torch.cuda.amp import autocast
+from omegaconf import OmegaConf
+import cv2
+# Module-level configuration.
+# NOTE(review): none of these constants is referenced by the code visible in
+# this file excerpt; confirm whether other modules import them before removing.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Sampling configs
+DDIM_STEPS = 50
+GUIDANCE_SCALE = 3.0
+IMG_SIZE = 512
+LATENT_CHANNELS = 4
+DOWNSAMPLE_FACTOR = 8
+START_NOISE_T = 1000
+DDIM_ETA = 0.0
+PRECISION = "full"  # or "autocast"
+FIXED_CODE = False  # whether to use fixed starting code
+SAVE_INTERMEDIATES = False  # whether to save intermediate results
+LOG_EVERY_T = 100  # log frequency during sampling
+class MaskModel_LazyLoader:
+    model = None
+    @classmethod
+    def get(cls):
+        faceParsing_ckpt = "Other_dependencies/face_parsing/79999_iter.pth"
+        if cls.model is None:
+            from pretrained.face_parsing.face_parsing_demo import init_faceParsing_pretrained_model
+            cls.model = init_faceParsing_pretrained_model(
+                'default',
+                faceParsing_ckpt,
+                ''
+            )
+            print(f"Initialized face parsing model from {faceParsing_ckpt}")
+        return cls.model
+def gen_semantic_mask(path_img: Path, path_mask_to_save: Path, label_mode:str, path_vis: Path = None):
+    """Generate semantic mask for an image using face parsing model"""
+    pil_im = Image.open(path_img).convert("RGB")
+    w, h = pil_im.size
+    # print(f"{pil_im.size=}") # 512,512
+    TMP_size = 1024
+    if w != TMP_size or h != TMP_size:
+        pil_im = pil_im.resize((TMP_size, TMP_size), Image.BILINEAR)
+    model = MaskModel_LazyLoader.get()
+    from pretrained.face_parsing.face_parsing_demo import faceParsing_demo, vis_parsing_maps
+    # print(f"{pil_im.size=}") # 1024,1024
+    # Generate mask with conversion to seg12 format
+    mask = faceParsing_demo(
+        model,
+        pil_im,
+        label_mode,
+        model_name='default'
+    )
+    try:
+        Image.fromarray(mask).save(path_mask_to_save)
+    except Exception as e:
+        print(f"{e=}")
+        print(f"{path_mask_to_save=}")
+        if path_mask_to_save.exists():
+            path_mask_to_save.unlink()
+            print(f'path_mask_to_save.unlink()')
+    # print(f"Saved mask: {path_mask_to_save}")
+    # print(f"{mask.shape=}") # 512,512
+    if path_vis:
+        mask_vis = vis_parsing_maps(pil_im, mask)
+        Image.fromarray(mask_vis).save(path_vis)
+        print(f"Saved mask vis: {path_vis}")

get_mask.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from util_and_constant import *
+from pathlib import Path
+from PIL import Image
+import cv2
+import numpy as np
+def path_img_2_mask(
+    path_img,
+    preserve=(1, 2, 3, 5, 6, 7, 9, 10, 11, ), # int | list-like. Default value represents the face
+):
+    """
+    Load the semantic mask belonging to `path_img` and return a boolean array that is
+    True wherever the mask label is one of `preserve`.
+    Label ids:
+    0 bg, 1 mouth, 2 eyebrow, 3 eyes, 4 hair, 5 nose, 6 face (excluding facial parts), 7: ear, 8: neck, 9: tooth
+    10: eye_glass, 11: ear_rings
+    """
+    labels = (preserve,) if isinstance(preserve, int) else preserve
+    assert isinstance(labels, (tuple, list))
+    assert all(isinstance(label, int) and 0 <= label <= 11 for label in labels)
+    # The mask location is derived from the image path by path_img_2_path_mask.
+    semantic = Image.open(path_img_2_path_mask(path_img)).convert('L')
+    return np.isin(np.array(semantic), labels)
+def get_forehead_mask(sm_mask):
+    """
+    Return a boolean array that is True on the forehead, i.e. on face-skin pixels
+    (label 6) lying above a reference row.
+    The reference row is chosen from the first label that is present:
+      * 2 (eyebrow): top-most eyebrow row
+      * 3 (eyes): top-most eye row minus a 20 pixel margin
+      * 10 (eye_glass): top-most glasses row
+      * otherwise: the top 15% of the face region's vertical extent
+    When none of those labels and no face pixel exists, an all-False array is returned.
+    """
+    labels = np.array(sm_mask)
+    is_face = labels == 6
+    def _top_row(label_id):
+        # Smallest row index at which `label_id` occurs.
+        return np.min(np.where(labels == label_id)[0])
+    if 2 in labels:
+        cutoff = _top_row(2)
+    elif 3 in labels:
+        cutoff = _top_row(3) - 20  # 20 pixels above eyes as forehead
+    elif 10 in labels:
+        cutoff = _top_row(10)
+    else:
+        face_rows = np.where(is_face)[0]
+        if len(face_rows) == 0:
+            return np.zeros_like(is_face, dtype=bool)
+        top = np.min(face_rows)
+        cutoff = top + (np.max(face_rows) - top) * 0.15  # top 15% as forehead
+    # Column vector of row indices, broadcast against the mask.
+    row_index = np.arange(labels.shape[0])[:, None]
+    return is_face & (row_index < cutoff)

global_.py ADDED Viewed

	@@ -0,0 +1,9 @@

+"""
+Process-wide mutable state shared between modules.
+Other modules read and assign these names as attributes of this module
+(e.g. `global_.task = ...`).
+"""
+task :int = None # current batch task id; None until a caller assigns it
+TP_enable:bool = None # None means not set yet. should be set in imports.py
+rank_:int = None # NOTE(review): meaning not visible here (name suggests a process rank) - confirm
+moduleName_2_adaRank:dict = {} # adaptive rank for each shared+LoRA module

hf_model.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""
+Hugging Face Hub compatible model wrapper for UniBioTransfer.
+Provides from_pretrained() and push_to_hub() functionality via PyTorchModelHubMixin.
+"""
+from pathlib import Path
+import torch
+import json
+import copy
+import os
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+import global_
+from ldm.models.diffusion.ddpm import LatentDiffusion, LandmarkExtractor
+from ldm.util import instantiate_from_config
+from omegaconf import OmegaConf
+from pytorch_lightning import seed_everything
+from MoE import offload_unused_tasks__LD
+from multiTask_model import TaskSpecific_MoE, replace_modules_lossless
+from my_py_lib.torch_util import cleanup_gpu_memory
+# Numeric ids of the supported tasks.
+TASKS = (0, 1, 2, 3)
+# Task name <-> numeric id lookup tables (the second is the inverse of the first).
+TASK_NAME2ID = {"face": 0, "hair": 1, "motion": 2, "head": 3}
+TASK_ID2NAME = {v: k for k, v in TASK_NAME2ID.items()}
+# Stable Diffusion v1.4 checkpoint file and the Hub repo it is downloaded from.
+SD14_FILENAME = "sd-v1-4.ckpt"
+SD14_REPO = "CompVis/stable-diffusion-v-1-4-original"
+# Default Hub repo used when from_pretrained() is given no repo id.
+PRETRAIN_REPO = "scy639/UniBioTransfer"
+def _load_first_stage_from_sd14(model, sd14_path):
+    """
+    Copy the first_stage_model weights out of an SD v1.4 checkpoint into `model`.
+    The checkpoint may be a raw state dict or a dict wrapping one under "state_dict".
+    Keys are matched against the prefixes "first_stage_model." and
+    "model.first_stage_model." (first prefix with any match wins) and loaded strictly.
+    Raises:
+        RuntimeError: if neither prefix matches any key.
+    """
+    print(f"Loading first_stage_model from {sd14_path}")
+    checkpoint = torch.load(str(sd14_path), map_location="cpu")
+    wrapped = isinstance(checkpoint, dict) and "state_dict" in checkpoint
+    weights = checkpoint["state_dict"] if wrapped else checkpoint
+    vae_weights = {}
+    for prefix in ("first_stage_model.", "model.first_stage_model."):
+        # Strip the prefix so the keys line up with first_stage_model's own state dict.
+        vae_weights = {
+            key[len(prefix):]: value
+            for key, value in weights.items()
+            if key.startswith(prefix)
+        }
+        if vae_weights:
+            break
+    if not vae_weights:
+        raise RuntimeError("Could not find first_stage_model weights in SD v1-4 checkpoint.")
+    model.first_stage_model.load_state_dict(vae_weights, strict=True)
+class UniBioTransferModel(LatentDiffusion, PyTorchModelHubMixin):
+    """
+    Hugging Face Hub compatible wrapper for UniBioTransfer.
+    Inherits from LatentDiffusion and adds HF Hub integration via PyTorchModelHubMixin.
+    Usage:
+        # Load model from HF Hub
+        model = UniBioTransferModel.from_pretrained("scy639/UniBioTransfer", task="face")
+        # Push to HF Hub
+        model.push_to_hub("your-repo/UniBioTransfer")
+    Args:
+        config: Dict of keyword arguments forwarded to LatentDiffusion.__init__
+        task: Task name or ID (face/hair/motion/head)
+        **kwargs: Accepted for compatibility; currently not forwarded anywhere
+    """
+    def __init__(self, config=None, task="face", **kwargs):
+        # Resolve both the name and the numeric id; unknown values fall back to "face" / 0.
+        self._task_name = task if isinstance(task, str) else TASK_ID2NAME.get(task, "face")
+        self._task_id = TASK_NAME2ID.get(self._task_name, 0) if isinstance(task, str) else task
+        # The active task is published through the shared global_ module.
+        global_.task = self._task_id
+        if config is None:
+            config = {}
+        super().__init__(**config)
+        self._hf_config = {
+            "task": self._task_name,
+            "task_id": self._task_id,
+        }
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path=None,
+        task="face",
+        device="cuda",
+        download_sd14=True,
+        download_deps=True,
+        cache_dir=None,
+        **kwargs,
+    ):
+        """
+        Build the model, fetching any missing files from the Hugging Face Hub.
+        Args:
+            pretrained_model_name_or_path: HF repo ID used for downloads.
+                Default: "scy639/UniBioTransfer"
+            task: Task name (face/hair/motion/head) or task ID (0/1/2/3)
+            device: Device to load model to ("cuda" or "cpu"). Anything other than
+                "cpu" is honoured only when CUDA is available; otherwise the model
+                stays on the CPU.
+            download_sd14: Whether to download the SD v1.4 checkpoint if it is missing
+            download_deps: Whether to download the ArcFace and face_parsing weights
+                if they are missing
+            cache_dir: Root directory for downloaded files (default: current directory).
+                Files are expected at <cache_dir>/checkpoints/... and
+                <cache_dir>/Other_dependencies/...
+            **kwargs: Accepted for compatibility; currently unused
+        Returns:
+            UniBioTransferModel: Loaded model in eval mode
+        """
+        task_id = TASK_NAME2ID.get(task, task) if isinstance(task, str) else task
+        task_name = TASK_ID2NAME.get(task_id, "face")
+        global_.task = task_id
+        if pretrained_model_name_or_path is None:
+            pretrained_model_name_or_path = PRETRAIN_REPO
+        repo_id = pretrained_model_name_or_path
+        cache_dir = Path(cache_dir) if cache_dir else Path(".")
+        ckpt_path = cache_dir / "checkpoints" / "pretrained.ckpt"
+        json_path = cache_dir / "checkpoints" / "pretrained.json"
+        sd14_path = cache_dir / "checkpoints" / SD14_FILENAME
+        arcface_path = cache_dir / "Other_dependencies" / "arcface" / "model_ir_se50.pth"
+        face_parsing_path = cache_dir / "Other_dependencies" / "face_parsing" / "79999_iter.pth"
+        def _download_file(repo, filename, local_path):
+            """Download `filename` from `repo` so that it ends up exactly at `local_path`."""
+            import shutil
+            local_path = Path(local_path)
+            local_path.parent.mkdir(parents=True, exist_ok=True)
+            # hf_hub_download() writes to <local_dir>/<filename> and keeps the
+            # sub-directories contained in `filename`. Using local_path.parent as
+            # local_dir therefore nested the file one level too deep (for example
+            # checkpoints/checkpoints/pretrained.ckpt). Strip the sub-path of
+            # `filename` from local_path to find the directory that makes both agree.
+            rel_parts = Path(filename).parts
+            if local_path.parts[-len(rel_parts):] == rel_parts:
+                local_dir = local_path.parents[len(rel_parts) - 1]
+            else:
+                local_dir = local_path.parent
+            print(f"Downloading {filename} from {repo}...")
+            token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
+            downloaded = Path(hf_hub_download(
+                repo_id=repo,
+                filename=filename,
+                local_dir=str(local_dir),
+                local_dir_use_symlinks=False,
+                token=token,
+            ))
+            # Fallback for layouts where the two paths still differ.
+            if not local_path.exists():
+                shutil.copyfile(downloaded, local_path)
+        if not ckpt_path.exists():
+            _download_file(repo_id, "checkpoints/pretrained.ckpt", ckpt_path)
+        if not json_path.exists():
+            _download_file(repo_id, "checkpoints/pretrained.json", json_path)
+        if download_sd14 and not sd14_path.exists():
+            _download_file(SD14_REPO, SD14_FILENAME, sd14_path)
+        if download_deps:
+            if not arcface_path.exists():
+                _download_file(repo_id, "Other_dependencies/arcface/model_ir_se50.pth", arcface_path)
+            if not face_parsing_path.exists():
+                _download_file(repo_id, "Other_dependencies/face_parsing/79999_iter.pth", face_parsing_path)
+        seed_everything(42)
+        # Prefer the YAML next to this file; fall back to the working directory.
+        cur_dir = Path(__file__).parent
+        yaml_path = cur_dir / "LatentDiffusion.yaml"
+        if not yaml_path.exists():
+            yaml_path = Path("LatentDiffusion.yaml")
+        model_config = OmegaConf.load(yaml_path).model
+        model = instantiate_from_config(model_config)
+        # The adaptive-rank table must be in global_ before the modules are replaced below.
+        with open(json_path, 'r') as f:
+            global_.moduleName_2_adaRank = json.load(f)
+        print(f"Loaded adaptive rank config from {json_path}")
+        # One copy of the diffusion model per task id is handed to replace_modules_lossless.
+        _src0 = copy.deepcopy(model.model.diffusion_model)
+        _src1 = copy.deepcopy(model.model.diffusion_model)
+        _src2 = copy.deepcopy(model.model.diffusion_model)
+        _src3 = copy.deepcopy(model.model.diffusion_model)
+        replace_modules_lossless(
+            model.model.diffusion_model,
+            [_src0, _src1, _src2, _src3],
+            [0, 1, 2, 3],
+            parent_name=".model.diffusion_model",
+        )
+        # Projection heads become task-specific; the second list gives the task ids
+        # that own each copy.
+        model.ID_proj_out = TaskSpecific_MoE([
+            copy.deepcopy(model.ID_proj_out),
+            copy.deepcopy(model.ID_proj_out),
+            copy.deepcopy(model.ID_proj_out),
+        ], [0, 2, 3])
+        model.landmark_proj_out = TaskSpecific_MoE([
+            copy.deepcopy(model.landmark_proj_out),
+            copy.deepcopy(model.landmark_proj_out),
+            copy.deepcopy(model.landmark_proj_out),
+        ], [0, 2, 3])
+        model.proj_out_source__head = TaskSpecific_MoE([
+            copy.deepcopy(model.proj_out_source__head),
+            copy.deepcopy(model.proj_out_source__head),
+        ], [2, 3])
+        from util_and_constant import REFNET
+        if REFNET.ENABLE:
+            shared_ref = model.model.diffusion_model_refNet
+            src0 = shared_ref
+            src1 = copy.deepcopy(shared_ref)
+            src2 = copy.deepcopy(shared_ref)
+            src3 = copy.deepcopy(shared_ref)
+            replace_modules_lossless(shared_ref, [src0, src1, src2, src3], [0, 1, 2, 3], parent_name=".model.diffusion_model_refNet", for_refnet=True)
+            from ldm.models.diffusion.bank import Bank
+            model.model.bank = Bank(
+                reader=model.model.diffusion_model,
+                writer=model.model.diffusion_model_refNet
+            )
+        print(f"Loading model weights from {ckpt_path}")
+        # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
+        pl_sd = torch.load(str(ckpt_path), map_location="cpu")
+        if isinstance(pl_sd, dict) and "state_dict" in pl_sd:
+            sd = pl_sd["state_dict"]
+        else:
+            sd = pl_sd
+        # Non-strict load: mismatches are only counted and reported.
+        m, u = model.load_state_dict(sd, strict=False)
+        if len(m) > 0:
+            print(f"Missing keys: {len(m)}")
+        if len(u) > 0:
+            print(f"Unexpected keys: {len(u)}")
+        _load_first_stage_from_sd14(model, sd14_path)
+        # offload_unused_tasks__LD(model, task_id, method="cpu")
+        model.ptsM_Generator = LandmarkExtractor(include_visualizer=True, img_256_mode=False)
+        cleanup_gpu_memory()
+        # ZeroGPU compatibility: move to the GPU only when device is not "cpu" and CUDA is available.
+        # With device="cpu" the model stays on the CPU (ZeroGPU initialisation must not touch the GPU).
+        if device != "cpu" and torch.cuda.is_available():
+            model = model.to(torch.device(device))
+        else:
+            model = model.to(torch.device("cpu"))
+        model.eval()
+        model._task_id = task_id
+        model._task_name = task_name
+        model._hf_config = {"task": task_name, "task_id": task_id}
+        return model

imports.py ADDED Viewed

	@@ -0,0 +1,8 @@

+#---------------------------------------------------------------------------------------------------------------------
+from util_and_constant import *
+from get_mask import *
+from util_cv2 import *

infer.py ADDED Viewed

	@@ -0,0 +1,366 @@

+# ---------------------------------------------------------     Config  -------------------------------------------------
+# DataLoader and sampler settings used by the __main__ section of this script.
+num_workers :int = 1
+DDIM_STEPS = 50
+BATCH_SIZE = 1
+# When True one start-noise tensor is drawn up front and passed to the sampler as x_T.
+FIXED_CODE = False
+# for vis
+# When True every logged intermediate latent is decoded and written to disk.
+SAVE_INTERMEDIATES = True
+# Number of per-pair grids stacked into one column image.
+NUM_grid_in_a_column = 5
+# ------------------------------------------------------------------------------------------------------------------------
+# NOTE: arguments are parsed at module level, before the remaining imports below are executed.
+import argparse
+parser = argparse.ArgumentParser(description="Custom inference for tgt/ref image pairs.")
+parser.add_argument("--task-name", type=str,
+    default='face',
+    help="face|hair|motion|head")
+parser.add_argument("--out-dir", type=str, default='examples/outputs', help="Output directory")
+# option 1: pass 2 paths
+parser.add_argument("--tgt", type=str, default=None, help="Path to target image. if None, will use paths read from --pair-list")
+parser.add_argument("--ref", type=str, default=None, help="Path to reference image")
+# option 2: pass a txt containing paths
+parser.add_argument("--pair-list", type=str, default='examples/inputs.txt', help="white-space-separated list file: tgt_path ref_path")
+args = parser.parse_args()
+#-----------------------------------------set TASK--------------------------------------------------------------------------
+# Map the task name to its numeric id; an unknown name raises KeyError here.
+task_name :str = args.task_name
+TASK :int = {
+    'face': 0,
+    'hair': 1,
+    'motion': 2,
+    'head': 3,
+}[task_name]
+print(f'task: {task_name} transfer (ID: {TASK})')
+# ------------------------------------------------------------------------------------------------------------------------
+import sys
+import os
+from pathlib import Path
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+from imports import *
+import torch
+import numpy as np
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm
+from einops import rearrange
+from torchvision.utils import make_grid
+from my_py_lib.image_util import imgs_2_grid_A,img_paths_2_grid_A
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import nullcontext
+import torchvision
+from ldm.models.diffusion.ddpm import LatentDiffusion
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from Dataset_custom import Dataset_custom
+from MoE import offload_unused_tasks__LD
+from ldm.models.diffusion.ddpm import LandmarkExtractor
+from my_py_lib.torch_util import cleanup_gpu_memory
+from gen_lmk_and_mask import gen_lmk_and_mask
+# ------------------------------------------------------------------------------------------------------------------------
+# Passed to the sampler as `eta`.
+DDIM_ETA = 0.0
+# Passed as unconditional_guidance_scale; an unconditional conditioning is built only when it is not 1.0.
+SCALE = 3.0
+# "autocast" wraps sampling in torch.autocast, anything else uses a null context.
+PRECISION = "full"  # "full" or "autocast"
+# Sampling shape is [C, H // F, W // F].
+H = 512
+W = 512
+C = 4
+F = 8
+# ------------------------------------------------------------------------------------------------------------------------
+def load_first_stage_from_sd14(model: LatentDiffusion, sd14_path: Path) -> None:
+    """
+    Load model.first_stage_model from the SD v1.4 checkpoint at `sd14_path`.
+    Accepts a raw state dict or one stored under "state_dict". Keys carrying the prefix
+    "first_stage_model." are used; if there are none, "model.first_stage_model." is tried.
+    Raises:
+        RuntimeError: when no key matches either prefix.
+    """
+    print(f"Loading first_stage_model from {sd14_path}")
+    raw = torch.load(str(sd14_path), map_location="cpu")
+    state = raw["state_dict"] if (isinstance(raw, dict) and "state_dict" in raw) else raw
+    first_stage = {}
+    for prefix in ("first_stage_model.", "model.first_stage_model."):
+        cut = len(prefix)
+        # Keep matching entries with the prefix removed.
+        first_stage = {name[cut:]: tensor for name, tensor in state.items() if name.startswith(prefix)}
+        if first_stage:
+            break
+    if not first_stage:
+        raise RuntimeError("Could not find first_stage_model weights in SD v1-4 checkpoint.")
+    model.first_stage_model.load_state_dict(first_stage, strict=True)
+def save_sample_by_decode(x, model, base_path, segment_id, intermediate_num):
+    """
+    Decode latent `x` with model.decode_first_stage and save the result as
+    <base_path>/<segment_id>/<intermediate_num>.png.
+    Every batch element is written to the same file name, so a later element
+    overwrites an earlier one.
+    """
+    decoded = model.decode_first_stage(x)
+    # Map [-1, 1] to [0, 1] and move channels last for PIL.
+    decoded = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
+    frames = decoded.cpu().permute(0, 2, 3, 1).numpy()
+    for frame in frames:
+        picture = Image.fromarray((frame * 255).astype(np.uint8))
+        target_dir = Path(base_path) / segment_id
+        target_dir.mkdir(parents=True, exist_ok=True)
+        picture.save(target_dir / f"{intermediate_num}.png")
+def get_tensor_clip(normalize=True, toTensor=True):
+    """
+    Build a torchvision transform pipeline.
+    Args:
+        normalize: append a Normalize step with the fixed per-channel mean/std below.
+        toTensor: prepend a ToTensor step.
+    Returns:
+        A torchvision.transforms.Compose of the selected steps (possibly empty).
+    """
+    mean = (0.48145466, 0.4578275, 0.40821073)
+    std = (0.26862954, 0.26130258, 0.27577711)
+    steps = []
+    if toTensor:
+        steps.append(torchvision.transforms.ToTensor())
+    if normalize:
+        steps.append(torchvision.transforms.Normalize(mean, std))
+    return torchvision.transforms.Compose(steps)
+def load_model_from_config(ckpt, verbose=1):
+    """
+    Build the MoE model, load its weights from `ckpt` and prepare it for inference.
+    Args:
+        ckpt: path to the checkpoint; either a raw state dict or a dict holding one
+            under "state_dict".
+        verbose: when truthy, missing / unexpected keys of the non-strict load are printed.
+    Returns:
+        The model in eval mode, moved to CUDA when CUDA is available.
+    """
+    ckpt = Path(ckpt)
+    print(f"Loading model from {ckpt}")
+    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
+    pl_sd = torch.load(str(ckpt), map_location="cpu")
+    if isinstance(pl_sd, dict) and "state_dict" in pl_sd:
+        sd = pl_sd["state_dict"]
+    else:
+        sd = pl_sd
+    from init_model import get_moe
+    model: LatentDiffusion = get_moe()
+    model.ptsM_Generator = LandmarkExtractor(include_visualizer=True, img_256_mode=False)
+    cleanup_gpu_memory()
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        pretty_print_torch_module_keys(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        pretty_print_torch_module_keys(u)
+    load_first_stage_from_sd14(model, SD14_localpath)
+    offload_unused_tasks__LD(model, TASK, method="del") # for save cuda mem
+    # Only move to the GPU when one exists, so CPU-only hosts do not crash here.
+    if torch.cuda.is_available():
+        model.cuda()
+    model.eval()
+    return model
+def load_pairs(pair_list, tgt, ref):
+    """
+    Collect the (tgt_path, ref_path) pairs to process.
+    Args:
+        pair_list: path to a text file with one "tgt_path ref_path" pair per line,
+            separated by any amount of whitespace. Blank lines and lines starting
+            with "#" are skipped.
+        tgt: target image path; takes precedence over `pair_list` when `ref` is also given.
+        ref: reference image path.
+    Returns:
+        List of (tgt, ref) tuples.
+    Raises:
+        ValueError: if a line does not contain exactly two fields, or if no input is given.
+    """
+    if tgt and ref:
+        pairs = [(tgt, ref), ]
+    elif pair_list:
+        pairs = []
+        with open(pair_list, "r") as f:
+            for line_num, line in enumerate(f, start=1):
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                # split() without arguments accepts tabs and runs of spaces, matching
+                # the "white-space-separated" contract; split(" ") did not.
+                parts = line.split()
+                if len(parts) != 2:
+                    raise ValueError(f"Invalid pair list line {line_num}: expected white-space-separated tgt/ref. got {parts=}")
+                pairs.append((parts[0], parts[1]))
+    else:
+        raise ValueError("No input pairs provided. Use --tgt/--ref or --pair-list.")
+    print(f"{pairs=}")
+    return pairs
+def un_norm(x):
+    """Map values from the [-1, 1] range to [0, 1] via (x + 1) / 2."""
+    shifted = x + 1.0
+    return shifted / 2.0
+def un_norm_clip(x1):
+    """
+    Undo the per-channel normalisation (x * std + mean) on the first three channels.
+    Accepts a 3-D tensor (a batch axis is added temporarily) or a batched tensor.
+    The input is not modified; a new tensor of the same shape is returned.
+    """
+    stds = (0.26862954, 0.26130258, 0.27577711)
+    means = (0.48145466, 0.4578275, 0.40821073)
+    out = x1 * 1.0  # work on a copy
+    was_single = len(out.shape) == 3
+    if was_single:
+        out = out.unsqueeze(0)
+    for channel, (std, mean) in enumerate(zip(stds, means)):
+        out[:, channel, :, :] = out[:, channel, :, :] * std + mean
+    return out.squeeze(0) if was_single else out
+if __name__ == "__main__":
+    # Script entry point: build the (tgt, ref) pair list, prepare output folders, load the
+    # model, then sample one result per pair and write results, comparison grids and
+    # (optionally) decoded intermediates.
+    pairs = load_pairs(args.pair_list, args.tgt, args.ref)
+    out_dir = Path(args.out_dir)
+    result_path = out_dir / "results"
+    grid_path = out_dir / "grid"
+    inter_path = out_dir / "intermediates"
+    inter_pred_path = inter_path / "pred_x0"
+    inter_noised_path = inter_path / "noised"
+    # parents=False: the parent directory of out_dir must already exist.
+    out_dir.mkdir(parents=False, exist_ok=True)
+    result_path.mkdir(parents=False, exist_ok=True)
+    grid_path.mkdir(parents=False, exist_ok=True)
+    inter_path.mkdir(parents=False, exist_ok=True)
+    if SAVE_INTERMEDIATES:
+        inter_pred_path.mkdir(parents=False, exist_ok=True)
+        inter_noised_path.mkdir(parents=False, exist_ok=True)
+    paths_tgt = [p[0] for p in pairs]
+    paths_ref = [p[1] for p in pairs]
+    # Pre-processing for every input image (presumably landmarks and masks, going by the name - confirm).
+    gen_lmk_and_mask(paths_tgt + paths_ref)
+    seed_everything(42)
+    model: LatentDiffusion = load_model_from_config(PRETRAIN_CKPT_PATH, )
+    # Final device placement: CUDA when available, otherwise CPU.
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    dataset = Dataset_custom(
+        "test",
+        task=TASK,
+        paths_tgt=paths_tgt,
+        paths_ref=paths_ref,
+    )
+    dataloader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=BATCH_SIZE,
+        num_workers=num_workers,
+        pin_memory=True,
+        shuffle=False,
+        drop_last=False,
+    )
+    # Optional fixed start noise shared by all batches of size BATCH_SIZE.
+    start_code = None
+    if FIXED_CODE:
+        start_code = torch.randn([BATCH_SIZE, C, H // F, W // F], device=device)
+    precision_scope = autocast if PRECISION == "autocast" else nullcontext
+    # Per-pair grid images (and their stems) waiting to be combined into a column image.
+    grids = []
+    grid_stems = []
+    with torch.no_grad():
+        with precision_scope("cuda"):
+            with model.ema_scope():
+                for test_batch, prior, test_model_kwargs, out_stem_batch in tqdm(dataloader):
+                    model.set_task(test_model_kwargs)
+                    bs = test_batch.shape[0]
+                    # The model input expects a "GT" entry; a zero tensor shaped like inpaint_image is supplied.
+                    batch_ = {
+                        **test_model_kwargs,
+                        "GT": torch.zeros_like(test_model_kwargs["inpaint_image"]),
+                    }
+                    batch_, c = model.get_input_and_conditioning(batch_, device=device)
+                    z_inpaint = batch_["z4_inpaint"]
+                    z_inpaint_mask = batch_["tgt_mask_64"]
+                    z_ref = batch_["z_ref"]
+                    z9 = batch_["z9"]
+                    # Unconditional conditioning is only needed when guidance is active.
+                    uc = None
+                    if SCALE != 1.0:
+                        uc = model.learnable_vector[TASK].repeat(bs, 1, 1)
+                    shape = [C, H // F, W // F]
+                    # Redraw the start noise when the batch size differs from the pre-drawn one.
+                    local_start_code = start_code
+                    if FIXED_CODE and (local_start_code is None or local_start_code.shape[0] != bs):
+                        local_start_code = torch.randn([bs, C, H // F, W // F], device=device)
+                    samples_ddim, intermediates = sampler.sample(
+                        S=DDIM_STEPS,
+                        conditioning=c,
+                        batch_size=bs,
+                        shape=shape,
+                        verbose=False,
+                        unconditional_guidance_scale=SCALE,
+                        unconditional_conditioning=uc,
+                        eta=DDIM_ETA,
+                        x_T=local_start_code,
+                        log_every_t=100,
+                        z_inpaint=z_inpaint,
+                        z_inpaint_mask=z_inpaint_mask,
+                        z_ref=z_ref,
+                        z9=z9,
+                    )
+                    if SAVE_INTERMEDIATES:
+                        # Decode and save every logged intermediate, one sample at a time,
+                        # into <pred_x0|noised>/<stem>/<i>.png.
+                        intermediate_pred_x0 = intermediates["pred_x0"]
+                        intermediate_noised = intermediates["x_inter"]
+                        for i in range(len(intermediate_pred_x0)):
+                            for j in range(bs):
+                                stem = f"{out_stem_batch[j]}"
+                                save_sample_by_decode(
+                                    intermediate_pred_x0[i][j : j + 1],
+                                    model,
+                                    inter_pred_path,
+                                    stem,
+                                    i,
+                                )
+                                save_sample_by_decode(
+                                    intermediate_noised[i][j : j + 1],
+                                    model,
+                                    inter_noised_path,
+                                    stem,
+                                    i,
+                                )
+                    # Decode the final latents and map them from [-1, 1] to [0, 1].
+                    x_samples_ddim = model.decode_first_stage(samples_ddim)
+                    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                    x_samples_ddim = x_samples_ddim.cpu().permute(0, 2, 3, 1).numpy()
+                    x_checked_image_torch = torch.from_numpy(x_samples_ddim).permute(0, 3, 1, 2)
+                    # Save each result as results/<stem>.png.
+                    for i, x_sample in enumerate(x_checked_image_torch):
+                        stem = f"{out_stem_batch[i]}"
+                        out_path = result_path / f"{stem}.png"
+                        img = Image.fromarray((x_sample.permute(1, 2, 0).numpy() * 255).astype(np.uint8))
+                        img.save(out_path)
+                        print(f"{out_path=}")
+                    # Build a comparison grid per sample: target, reference, result.
+                    for i, x_sample in enumerate(x_checked_image_torch):
+                        all_img = []
+                        all_img.append(un_norm(test_batch[i]).cpu())
+                        # Task 2 takes the reference from "ref512"; other tasks use "ref_imgs",
+                        # resized to 512x512 and de-normalised with un_norm_clip.
+                        if TASK != 2:
+                            ref_img = test_model_kwargs["ref_imgs"].squeeze(1)
+                            ref_img = torchvision.transforms.Resize([512, 512])(ref_img)
+                            ref_img = un_norm_clip(ref_img[i]).cpu()
+                        else:
+                            ref_img = un_norm(test_model_kwargs["ref512"].squeeze(1)[i]).cpu()
+                        all_img.append(ref_img)
+                        all_img.append(x_sample)
+                        grid = torch.stack(all_img, 0)
+                        grid = make_grid(grid)
+                        grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy()
+                        img = Image.fromarray(grid.astype(np.uint8))
+                        stem = f"{out_stem_batch[i]}"
+                        path_save_img = grid_path / f"grid-{stem}.jpg"
+                        img.save(path_save_img)
+                        print(f"{path_save_img=}")
+                        grids.append(img)
+                        grid_stems.append(stem)
+                        # Once enough grids are collected, hand them to imgs_2_grid_A with a
+                        # column layout and start over.
+                        # NOTE(review): grids left over when the loop ends (fewer than
+                        # NUM_grid_in_a_column) are never combined - confirm this is intended.
+                        if len(grids) >= NUM_grid_in_a_column:
+                                stem_start = grid_stems[0]
+                                stem_end = grid_stems[-1]
+                                grid_column = imgs_2_grid_A(
+                                    grids,
+                                    grid_layout='column',
+                                    grid_path=os.path.join(grid_path, f"{stem_start}--{stem_end}.jpg"),
+                                )
+                                grids = []
+                                grid_stems = []
+                    model.unset_task()
+    print(f"Your samples are ready and waiting for you here: {out_dir}")

infer_hf.py ADDED Viewed

	@@ -0,0 +1,279 @@

+"""
+High-level inference pipeline for UniBioTransfer.
+Designed for easy use in Hugging Face Spaces and other applications.
+ZeroGPU Compatible:
+- Supports CPU initialization (device="cpu")
+- Dynamically switches to CUDA during inference when called from @spaces.GPU
+"""
+from pathlib import Path
+import torch
+import numpy as np
+from PIL import Image
+import cv2
+import global_
+from hf_model import UniBioTransferModel, TASK_NAME2ID, TASK_ID2NAME
+from ldm.models.diffusion.ddim import DDIMSampler
+from pytorch_lightning import seed_everything
+# Default number of DDIM steps and default guidance scale for UniBioTransferPipeline.__call__.
+DDIM_STEPS_DEFAULT = 50
+SCALE_DEFAULT = 3.0
+# Image height/width, plus the values used for the sampling shape [C, H // F, W // F].
+H, W, C, F = 512, 512, 4, 8
+class UniBioTransferPipeline:
+    """
+    High-level pipeline for UniBioTransfer inference.
+    """
+    def __init__(self, model, task="face", device="cpu"):
+        """
+        Wrap an already loaded model.
+        Args:
+            model: loaded model; it is also handed to DDIMSampler.
+            task: task name or numeric id. A name is translated with TASK_NAME2ID
+                (an unknown name is kept unchanged).
+            device: stored as the device the pipeline was initialised with.
+        """
+        resolved_id = TASK_NAME2ID.get(task, task) if isinstance(task, str) else task
+        self.model = model
+        self.task = task
+        self.task_id = resolved_id
+        self._init_device = device
+        # Publish the active task both globally and on the model.
+        global_.task = resolved_id
+        self.model.task = resolved_id
+        self.sampler = DDIMSampler(model)
+    @classmethod
+    def from_pretrained(
+        cls,
+        repo_id="scy639/UniBioTransfer",
+        task="face",
+        device="cpu",
+        cache_dir=None,
+        **kwargs,
+    ):
+        """
+        Create a pipeline by loading the model with UniBioTransferModel.from_pretrained.
+        Args:
+            repo_id: forwarded as pretrained_model_name_or_path.
+            task: task name or id, used for both the model and the pipeline.
+            device: device passed to the model loader and stored on the pipeline.
+            cache_dir: forwarded to the model loader.
+            **kwargs: forwarded unchanged to the model loader.
+        """
+        loaded_model = UniBioTransferModel.from_pretrained(
+            pretrained_model_name_or_path=repo_id,
+            task=task,
+            device=device,
+            cache_dir=cache_dir,
+            **kwargs,
+        )
+        pipeline = cls(loaded_model, task=task, device=device)
+        return pipeline
+    def set_task(self, task):
+        """
+        Make `task` (name or numeric id) the active task.
+        Updates the pipeline, the shared global_.task and model.task. A name is
+        translated with TASK_NAME2ID; an unknown name is kept unchanged.
+        """
+        if isinstance(task, str):
+            new_id = TASK_NAME2ID.get(task, task)
+        else:
+            new_id = task
+        self.task = task
+        self.task_id = new_id
+        global_.task = new_id
+        self.model.task = new_id
+    def __call__(
+        self,
+        tgt_image,
+        ref_image,
+        ddim_steps=DDIM_STEPS_DEFAULT,
+        scale=SCALE_DEFAULT,
+        seed=42,
+        num_images=1,
+    ):
+        """
+        Run inference on a pair of images.
+        Args:
+            tgt_image: target image (PIL image, numpy array, or path).
+            ref_image: reference image (same accepted types).
+            ddim_steps: number of sampling steps forwarded to _run_inference.
+            scale: guidance scale forwarded to _run_inference.
+            seed: value passed to seed_everything before sampling.
+            num_images: number of results requested from _run_inference.
+        Returns:
+            List with one post-processed result per generated sample.
+        """
+        seed_everything(seed)
+        tgt_img = self._load_image(tgt_image)
+        ref_img = self._load_image(ref_image)
+        # PIL sizes are (width, height); passing (H, W) would transpose the
+        # target size as soon as H and W differ.
+        tgt_img = self._resize_image(tgt_img, (W, H))
+        ref_img = self._resize_image(ref_img, (W, H))
+        result_tensors = self._run_inference(tgt_img, ref_img, ddim_steps, scale, num_images)
+        result_imgs = [self._postprocess(result_tensors[i]) for i in range(result_tensors.shape[0])]
+        return result_imgs
+    def _load_image(self, img):
+        """
+        Return `img` as an RGB PIL image.
+        Accepts a PIL image, a numpy array, or a path (str / Path).
+        Raises:
+            ValueError: for any other input type.
+        """
+        if isinstance(img, Image.Image):
+            pil = img
+        elif isinstance(img, np.ndarray):
+            pil = Image.fromarray(img)
+        elif isinstance(img, (str, Path)):
+            pil = Image.open(img)
+        else:
+            raise ValueError(f"Unsupported image type: {type(img)}")
+        return pil.convert("RGB")
+    def _resize_image(self, img, size):
+        """Return `img` unchanged when it already has `size`, else a LANCZOS-resized copy."""
+        if img.size == size:
+            return img
+        return img.resize(size, Image.LANCZOS)
+    def _run_inference(self, tgt_img, ref_img, ddim_steps, scale, num_images):
+        """
+        Run diffusion sampling.
+        完全复用 infer.py 的逻辑，使用 dataloader。
+        """
+        from Dataset_custom import Dataset_custom
+        from gen_lmk_and_mask import gen_lmk_and_mask
+        import tempfile
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tgt_path = Path(tmpdir) / "tgt.png"
+            ref_path = Path(tmpdir) / "ref.png"
+            tgt_img.save(tgt_path)
+            ref_img.save(ref_path)
+            gen_lmk_and_mask([str(tgt_path), str(ref_path)], write_cache=True)
+            dataset = Dataset_custom(
+                "test",
+                task=self.task_id,
+                paths_tgt=[str(tgt_path)],
+                paths_ref=[str(ref_path)],
+            )
+            dataloader = torch.utils.data.DataLoader(
+                dataset,
+                batch_size=1,
+                num_workers=1,
+                pin_memory=True,
+                shuffle=False,
+                drop_last=False,
+            )
+            run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.model = self.model.to(run_device)
+            with torch.no_grad():
+                for test_batch, prior, test_model_kwargs, out_stem_batch in dataloader:
+                    test_batch = test_batch.to(run_device)
+                    if test_batch.shape[0] == 1:
+                        test_batch = test_batch.repeat(num_images, 1, 1, 1)
+                    if isinstance(prior, torch.Tensor):
+                        prior = prior.to(run_device)
+                        if prior.shape[0] == 1:
+                            prior = prior.repeat(num_images, 1, 1, 1)
+                    for k, v in test_model_kwargs.items():
+                        if isinstance(v, torch.Tensor):
+                            v = v.to(run_device)
+                            if v.shape[0] == 1:
+                                repeats = [num_images] + [1] * (v.ndim - 1)
+                                v = v.repeat(*repeats)
+                            test_model_kwargs[k] = v
+                        elif isinstance(v, dict):
+                            new_v = {}
+                            for kk, vv in v.items():
+                                if isinstance(vv, torch.Tensor):
+                                    vv = vv.to(run_device)
+                                    if vv.shape[0] == 1:
+                                        repeats = [num_images] + [1] * (vv.ndim - 1)
+                                        vv = vv.repeat(*repeats)
+                                    new_v[kk] = vv
+                                else:
+                                    new_v[kk] = vv
+                            test_model_kwargs[k] = new_v
+                        elif isinstance(v, list):
+                            test_model_kwargs[k] = v * num_images
+                    self.model.set_task(test_model_kwargs)
+                    bs = num_images
+                    batch_ = {
+                        **test_model_kwargs,
+                        "GT": torch.zeros(num_images, *test_model_kwargs["inpaint_image"].shape[1:], device=run_device),
+                    }
+                    batch_, c = self.model.get_input_and_conditioning(batch_, device=run_device)
+                    z_inpaint = batch_["z4_inpaint"]
+                    z_inpaint_mask = batch_["tgt_mask_64"]
+                    z_ref = batch_["z_ref"]
+                    z9 = batch_["z9"]
+                    uc = None
+                    if scale != 1.0:
+                        uc = self.model.learnable_vector[self.task_id].repeat(bs, 1, 1)
+                    shape = [C, H // F, W // F]
+                    start_code = None
+                    samples_ddim, _ = self.sampler.sample(
+                        S=ddim_steps,
+                        conditioning=c,
+                        batch_size=bs,
+                        shape=shape,
+                        verbose=False,
+                        unconditional_guidance_scale=scale,
+                        unconditional_conditioning=uc,
+                        eta=0.0,
+                        x_T=start_code,
+                        log_every_t=100,
+                        z_inpaint=z_inpaint,
+                        z_inpaint_mask=z_inpaint_mask,
+                        z_ref=z_ref,
+                        z9=z9,
+                    )
+                    x_samples_ddim = self.model.decode_first_stage(samples_ddim)
+                    x_samples_ddim = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+                    self.model.unset_task()
+                    return x_samples_ddim
+    def _postprocess(self, tensor):
+        """Convert model output tensor to PIL Image."""
+        img_array = tensor.cpu().permute(1, 2, 0).numpy()
+        img_array = (img_array * 255).astype(np.uint8)
+        return Image.fromarray(img_array)
+def infer_single(
+    tgt_path,
+    ref_path,
+    task="face",
+    output_path=None,
+    ddim_steps=DDIM_STEPS_DEFAULT,
+    scale=SCALE_DEFAULT,
+    device="cuda",
+):
+    """
+    Convenience function for single inference.
+    """
+    pipeline = UniBioTransferPipeline.from_pretrained(task=task, device=device)
+    result = pipeline(tgt_path, ref_path, ddim_steps=ddim_steps, scale=scale)
+    if output_path is not None:
+        result.save(output_path)
+        print(f"Saved result to {output_path}")
+    return result
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="UniBioTransfer inference")
+    parser.add_argument("--task", type=str, default="face", choices=["face", "hair", "motion", "head"])
+    parser.add_argument("--tgt", type=str, required=True, help="Path to target image")
+    parser.add_argument("--ref", type=str, required=True, help="Path to reference image")
+    parser.add_argument("--out", type=str, default="result.png", help="Output path")
+    parser.add_argument("--ddim-steps", type=int, default=50)
+    parser.add_argument("--scale", type=float, default=3.0)
+    parser.add_argument("--device", type=str, default="cuda")
+    args = parser.parse_args()
+    result = infer_single(
+        args.tgt,
+        args.ref,
+        task=args.task,
+        output_path=args.out,
+        ddim_steps=args.ddim_steps,
+        scale=args.scale,
+        device=args.device,
+    )
+    print(f"Inference complete. Result shape: {result.size}")

init_model.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import sys,os
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+if __name__=='__main__': sys.path.append(os.path.abspath(os.path.join(cur_dir, '..')))
+from imports import *
+import json
+import argparse, os, sys, glob
+import cv2
+import torch
+import numpy as np
+from MoE import *
+from multiTask_model import *
+from lora_layers import *
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+from torchvision.utils import make_grid
+from my_py_lib.image_util import imgs_2_grid_A,img_paths_2_grid_A
+import time
+import copy
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import contextmanager, nullcontext
+import torchvision
+from ldm.models.diffusion.ddpm import LatentDiffusion
+from ldm.models.diffusion.bank import Bank
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from transformers import AutoFeatureExtractor
+# import clip
+from torchvision.transforms import Resize
+from fnmatch import fnmatch
+from PIL import Image
+from torchvision.transforms import PILToTensor
+#----------------------------------------------------------------------------
+def get_moe():
+    """
+    Build the LatentDiffusion model described by LatentDiffusion.yaml and wrap
+    parts of it into task-specific / MoE modules.
+
+    Steps visible here: seed with 42, instantiate the model on CPU, then (when
+    FOR_upcycle_ckpt_GEN_or_USE is falsy) load the per-module rank table from
+    PRETRAIN_JSON_PATH, run replace_modules_lossless on the diffusion UNet
+    (and on the refNet when REFNET.ENABLE), wrap three projection heads in
+    TaskSpecific_MoE, and finally attach a Bank linking UNet and refNet.
+
+    Returns:
+        The restructured model (``modelMOE``).
+
+    NOTE(review): ``modelMOE`` is only bound inside the
+    ``if not FOR_upcycle_ckpt_GEN_or_USE`` branch; with that flag truthy the
+    later references to it would fail as an unbound local -- confirm that
+    path is never used.
+    """
+    if 1:
+        seed_everything(42)
+        # torch.cuda.set_device(opt.device_ID)
+        model :LatentDiffusion = instantiate_from_config(OmegaConf.load(f"LatentDiffusion.yaml").model,)
+        if REFNET.ENABLE:
+            assert model.model.diffusion_model_refNet.is_refNet
+        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+        # NOTE(review): the next line overrides the choice above, so the model
+        # is always placed on CPU here.
+        device = torch.device("cpu")
+        model = model.to(device)
+    if FOR_upcycle_ckpt_GEN_or_USE:
+        del model.ptsM_Generator
+    # NOTE(review): the three nested helpers below only call each other /
+    # themselves; nothing in the rest of this function invokes them.
+    def average_module_weight(
+        src_modules: list,
+    ):
+        """
+        Average the weights of multiple modules.
+
+        Returns a state dict whose entries are the element-wise mean of the
+        corresponding entries of every module in ``src_modules``, or None when
+        the list is empty. Keys are taken from the first module.
+        """
+        if not src_modules:
+            return None
+        # Get the state dict of the first module as template
+        avg_state_dict = {}
+        first_state_dict = src_modules[0].state_dict()
+        # Initialize with zeros
+        for key in first_state_dict:
+            avg_state_dict[key] = torch.zeros_like(first_state_dict[key])
+        # Sum
+        for module in src_modules:
+            module_state_dict = module.state_dict()
+            for key in avg_state_dict:
+                avg_state_dict[key] += module_state_dict[key]
+        # Average (in place, so each entry keeps the dtype of the template tensor)
+        for key in avg_state_dict:
+            avg_state_dict[key] /= len(src_modules)
+        return avg_state_dict
+    def recursive_average_module_weight(
+        tgt_module: nn.Module,
+        src_modules: list,
+        cb,
+    ):
+        """
+        Recursively find modules and replace with averaged weights based on callback.
+
+        For each direct child of ``tgt_module`` the same-named child is looked
+        up on every source module. If ``cb(child, name, tgt_module)`` is truthy
+        the child loads the averaged state dict of those source children;
+        otherwise the walk descends into the child. Returns ``tgt_module``.
+        """
+        for name, child in tgt_module.named_children():
+            if 1:    # Get corresponding modules from source models
+                src_child_modules = []
+                for src_module in src_modules:
+                    src_child = getattr(src_module, name)
+                    assert src_child is not None,name
+                    src_child_modules.append(src_child)
+            # assert not isinstance(child, TaskSpecific_MoE)
+            if cb(child, name, tgt_module):
+                print(f"[recursive_average_module_weight] {name=} child: {repr(child)[:50]} tgt_module: {repr(tgt_module)[:50]}")
+                # Average & load
+                avg_weights = average_module_weight(src_child_modules)
+                child.load_state_dict(avg_weights)
+            else:
+                recursive_average_module_weight(child, src_child_modules, cb)
+        return tgt_module
+    def replace_module_with_TaskSpecific(
+        tgt_module: nn.Module,# tgt module
+        src_modules: list,
+        cb,
+        parent_name: str = "",
+        depth :int = 0,
+    ):
+        """
+        Replace selected children of ``tgt_module`` with TaskSpecific_MoE wrappers.
+
+        A child is replaced (built from the same-named children of the source
+        modules and TASKS) when ``cb(child, name, full_name, tgt_module)`` is
+        truthy, where ``full_name`` is ``parent_name + "." + name``. Otherwise
+        the walk descends into the child only while ``depth <= 0``, i.e. at
+        most one level below the module passed by the initial caller.
+        Returns ``tgt_module``.
+        """
+        for name, child in tgt_module.named_children():
+            if 1:   # Get corresponding modules from source models
+                src_child_modules = []
+                for src_module in src_modules:
+                    src_child = getattr(src_module, name)
+                    assert src_child is not None,name
+                    src_child_modules.append(src_child)
+            assert not isinstance(child, TaskSpecific_MoE)
+            full_name = f"{parent_name}.{name}"
+            if cb(child, name, full_name, tgt_module):
+                print(f"[replace_module_with_TaskSpecific] {name=} child: {repr(child)[:50]} tgt_module: {repr(tgt_module)[:50]}")
+                setattr(tgt_module, name, TaskSpecific_MoE(src_child_modules,TASKS))
+            else:
+                if depth<=0:
+                    replace_module_with_TaskSpecific(child, src_child_modules,cb,parent_name=full_name,depth=depth+1)
+        return tgt_module
+    if not FOR_upcycle_ckpt_GEN_or_USE:
+        modelMOE :LatentDiffusion = model
+        del model
+        if 1:  # ensure distinct module instances per task (avoid shared identities)
+            # Rank table read from JSON and published through the global_ module.
+            with open(PRETRAIN_JSON_PATH, 'r') as f: global_.moduleName_2_adaRank = json.load(f)
+            print(f"loaded from {PRETRAIN_JSON_PATH=}")
+            # Four independent copies of the UNet, taken before the in-place
+            # replacement below modifies the original.
+            _src0 = copy.deepcopy(modelMOE.model.diffusion_model)
+            _src1 = copy.deepcopy(modelMOE.model.diffusion_model)
+            _src2 = copy.deepcopy(modelMOE.model.diffusion_model)
+            _src3 = copy.deepcopy(modelMOE.model.diffusion_model)
+            replace_modules_lossless(
+                modelMOE.model.diffusion_model,
+                [ _src0, _src1, _src2, _src3 ],
+                [0,1,2,3],
+                parent_name=".model.diffusion_model",
+            )
+            # Build-time dummy wrapping for task-specific heads so that ckpt keys match
+            # (ID_proj_out and landmark_proj_out get ids [0,2,3]; proj_out_source__head gets [2,3]).
+            modelMOE.ID_proj_out = TaskSpecific_MoE([
+                copy.deepcopy(modelMOE.ID_proj_out),
+                copy.deepcopy(modelMOE.ID_proj_out),
+                copy.deepcopy(modelMOE.ID_proj_out),
+            ], [0,2,3])
+            modelMOE.landmark_proj_out = TaskSpecific_MoE([
+                copy.deepcopy(modelMOE.landmark_proj_out),
+                copy.deepcopy(modelMOE.landmark_proj_out),
+                copy.deepcopy(modelMOE.landmark_proj_out),
+            ], [0,2,3])
+            modelMOE.proj_out_source__head = TaskSpecific_MoE([
+                copy.deepcopy(modelMOE.proj_out_source__head),
+                copy.deepcopy(modelMOE.proj_out_source__head),
+            ], [2,3])
+            # Upcycle the single refNet from source refNets, and keep only one.
+            # NOTE(review): four sources are passed below, and src0 is the
+            # target module itself rather than a copy -- confirm this is intended.
+            if REFNET.ENABLE:
+                shared_ref = modelMOE.model.diffusion_model_refNet
+                src0 = shared_ref
+                src1 = copy.deepcopy(shared_ref)
+                src2 = copy.deepcopy(shared_ref)
+                src3 = copy.deepcopy(shared_ref)
+                replace_modules_lossless(shared_ref, [src0, src1, src2, src3],[0,1,2,3], parent_name=".model.diffusion_model_refNet", for_refnet=True)
+        # load from ./modelMOE.ckpt
+        # NOTE(review): no checkpoint is actually loaded in the code shown here;
+        # the sleep only staggers by rank_ (defined outside this function).
+        time.sleep(20*rank_)
+        print(f"ckpt load over. m,u:")
+    # Initialize bank here (after model structure is finalized)
+    if REFNET.ENABLE :
+        modelMOE.model.bank = Bank(reader=modelMOE.model.diffusion_model,writer=modelMOE.model.diffusion_model_refNet)
+    if __name__=='__main__':
+        # When run as a script, list the representative module names of the final model.
+        for key in sorted( get_representative_moduleNames(modelMOE.state_dict().keys()) ):
+            print(f"  - {key}")
+    return modelMOE

ldm/lr_scheduler.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import numpy as np
+class LambdaWarmUpCosineScheduler:
+    """
+    note: use with a base_lr of 1.0
+    """
+    def __init__(self, warm_up_steps, lr_min, lr_max, lr_start, max_decay_steps, verbosity_interval=0):
+        self.lr_warm_up_steps = warm_up_steps
+        self.lr_start = lr_start
+        self.lr_min = lr_min
+        self.lr_max = lr_max
+        self.lr_max_decay_steps = max_decay_steps
+        self.last_lr = 0.
+        self.verbosity_interval = verbosity_interval
+    def schedule(self, n, **kwargs):
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_lr}")
+        if n < self.lr_warm_up_steps:
+            lr = (self.lr_max - self.lr_start) / self.lr_warm_up_steps * n + self.lr_start
+            self.last_lr = lr
+            return lr
+        else:
+            t = (n - self.lr_warm_up_steps) / (self.lr_max_decay_steps - self.lr_warm_up_steps)
+            t = min(t, 1.0)
+            lr = self.lr_min + 0.5 * (self.lr_max - self.lr_min) * (
+                    1 + np.cos(t * np.pi))
+            self.last_lr = lr
+            return lr
+    def __call__(self, n, **kwargs):
+        return self.schedule(n,**kwargs)
+class LambdaWarmUpCosineScheduler2:
+    """
+    supports repeated iterations, configurable via lists
+    note: use with a base_lr of 1.0.
+    """
+    def __init__(self, warm_up_steps, f_min, f_max, f_start, cycle_lengths, verbosity_interval=0):
+        assert len(warm_up_steps) == len(f_min) == len(f_max) == len(f_start) == len(cycle_lengths)
+        self.lr_warm_up_steps = warm_up_steps
+        self.f_start = f_start
+        self.f_min = f_min
+        self.f_max = f_max
+        self.cycle_lengths = cycle_lengths
+        self.cum_cycles = np.cumsum([0] + list(self.cycle_lengths))
+        self.last_f = 0.
+        self.verbosity_interval = verbosity_interval
+    def find_in_interval(self, n):
+        interval = 0
+        for cl in self.cum_cycles[1:]:
+            if n <= cl:
+                return interval
+            interval += 1
+    def schedule(self, n, **kwargs):
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                                                       f"current cycle {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            return f
+        else:
+            t = (n - self.lr_warm_up_steps[cycle]) / (self.cycle_lengths[cycle] - self.lr_warm_up_steps[cycle])
+            t = min(t, 1.0)
+            f = self.f_min[cycle] + 0.5 * (self.f_max[cycle] - self.f_min[cycle]) * (
+                    1 + np.cos(t * np.pi))
+            self.last_f = f
+            return f
+    def __call__(self, n, **kwargs):
+        return self.schedule(n, **kwargs)
+class LambdaLinearScheduler(LambdaWarmUpCosineScheduler2):
+    def schedule(self, n, **kwargs):# n is the step index
+        cycle = self.find_in_interval(n)
+        n = n - self.cum_cycles[cycle]
+        if self.verbosity_interval > 0:
+            if n % self.verbosity_interval == 0: print(f"current step: {n}, recent lr-multiplier: {self.last_f}, "
+                                                       f"current cycle {cycle}")
+        if n < self.lr_warm_up_steps[cycle]:
+            f = (self.f_max[cycle] - self.f_start[cycle]) / self.lr_warm_up_steps[cycle] * n + self.f_start[cycle]
+            self.last_f = f
+            # print(f"0 {n=} {f=}")
+            return f
+        else:
+            f = self.f_min[cycle] + (self.f_max[cycle] - self.f_min[cycle]) * (self.cycle_lengths[cycle] - n) / (self.cycle_lengths[cycle])
+            self.last_f = f
+            # print(f"1 {n=} {f=}")
+            return f

ldm/models/autoencoder.py ADDED Viewed

	@@ -0,0 +1,443 @@

+import torch
+import pytorch_lightning as pl
+import torch.nn.functional as F
+from contextlib import contextmanager
+from taming.modules.vqvae.quantize import VectorQuantizer2 as VectorQuantizer
+from ldm.modules.diffusionmodules.model import Encoder, Decoder
+from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
+from ldm.util import instantiate_from_config
+class VQModel(pl.LightningModule):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 n_embed,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 batch_resize_range=None,
+                 scheduler_config=None,
+                 lr_g_factor=1.0,
+                 remap=None,
+                 sane_index_shape=False, # tell vector quantizer to return indices as bhw
+                 use_ema=False
+                 ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.n_embed = n_embed
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25,
+                                        remap=remap,
+                                        sane_index_shape=sane_index_shape)
+        self.quant_conv = torch.nn.Conv2d(ddconfig["z_channels"], embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        self.batch_resize_range = batch_resize_range
+        if self.batch_resize_range is not None:
+            print(f"{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.")
+        self.use_ema = use_ema
+        if self.use_ema:
+            self.model_ema = LitEma(self)
+            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        self.scheduler_config = scheduler_config
+        self.lr_g_factor = lr_g_factor
+    @contextmanager
+    def ema_scope(self, context=None):
+        if self.use_ema:
+            self.model_ema.store(self.parameters())
+            self.model_ema.copy_to(self)
+            if context is not None:
+                print(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.parameters())
+                if context is not None:
+                    print(f"{context}: Restored training weights")
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        missing, unexpected = self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        if len(missing) > 0:
+            print(f"Missing Keys: {missing}")
+            print(f"Unexpected Keys: {unexpected}")
+    def on_train_batch_end(self, *args, **kwargs):
+        if self.use_ema:
+            self.model_ema(self)
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        quant, emb_loss, info = self.quantize(h)
+        return quant, emb_loss, info
+    def encode_to_prequant(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    def decode(self, quant):
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+    def decode_code(self, code_b):
+        quant_b = self.quantize.embed_code(code_b)
+        dec = self.decode(quant_b)
+        return dec
+    def forward(self, input, return_pred_indices=False):
+        quant, diff, (_,_,ind) = self.encode(input)
+        dec = self.decode(quant)
+        if return_pred_indices:
+            return dec, diff, ind
+        return dec, diff
+    def get_input(self, batch, k):
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        if self.batch_resize_range is not None:
+            lower_size = self.batch_resize_range[0]
+            upper_size = self.batch_resize_range[1]
+            if self.global_step <= 4:
+                # do the first few batches with max size to avoid later oom
+                new_resize = upper_size
+            else:
+                new_resize = np.random.choice(np.arange(lower_size, upper_size+16, 16))
+            if new_resize != x.shape[2]:
+                x = F.interpolate(x, size=new_resize, mode="bicubic")
+            x = x.detach()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        # https://github.com/pytorch/pytorch/issues/37142
+        # try not to fool the heuristics
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        if optimizer_idx == 0:
+            # autoencode
+            aeloss, log_dict_ae = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train",
+                                            predicted_indices=ind)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return aeloss
+        if optimizer_idx == 1:
+            # discriminator
+            discloss, log_dict_disc = self.loss(qloss, x, xrec, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=True)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        log_dict = self._validation_step(batch, batch_idx)
+        with self.ema_scope():
+            log_dict_ema = self._validation_step(batch, batch_idx, suffix="_ema")
+        return log_dict
+    def _validation_step(self, batch, batch_idx, suffix=""):
+        x = self.get_input(batch, self.image_key)
+        xrec, qloss, ind = self(x, return_pred_indices=True)
+        aeloss, log_dict_ae = self.loss(qloss, x, xrec, 0,
+                                        self.global_step,
+                                        last_layer=self.get_last_layer(),
+                                        split="val"+suffix,
+                                        predicted_indices=ind
+                                        )
+        discloss, log_dict_disc = self.loss(qloss, x, xrec, 1,
+                                            self.global_step,
+                                            last_layer=self.get_last_layer(),
+                                            split="val"+suffix,
+                                            predicted_indices=ind
+                                            )
+        rec_loss = log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log(f"val{suffix}/rec_loss", rec_loss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        self.log(f"val{suffix}/aeloss", aeloss,
+                   prog_bar=True, logger=True, on_step=False, on_epoch=True, sync_dist=True)
+        if version.parse(pl.__version__) >= version.parse('1.4.0'):
+            del log_dict_ae[f"val{suffix}/rec_loss"]
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        return self.log_dict
+    def configure_optimizers(self):
+        lr_d = self.learning_rate
+        lr_g = self.lr_g_factor*self.learning_rate
+        print("lr_d", lr_d)
+        print("lr_g", lr_g)
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quantize.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr_g, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr_d, betas=(0.5, 0.9))
+        if self.scheduler_config is not None:
+            scheduler = instantiate_from_config(self.scheduler_config)
+            print("Setting up LambdaLR scheduler...")
+            scheduler = [
+                {
+                    'scheduler': LambdaLR(opt_ae, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+                {
+                    'scheduler': LambdaLR(opt_disc, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                },
+            ]
+            return [opt_ae, opt_disc], scheduler
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight
+    def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if only_inputs:
+            log["inputs"] = x
+            return log
+        xrec, _ = self(x)
+        if x.shape[1] > 3:
+            # colorize with random projection
+            assert xrec.shape[1] > 3
+            x = self.to_rgb(x)
+            xrec = self.to_rgb(xrec)
+        log["inputs"] = x
+        log["reconstructions"] = xrec
+        if plot_ema:
+            with self.ema_scope():
+                xrec_ema, _ = self(x)
+                if x.shape[1] > 3: xrec_ema = self.to_rgb(xrec_ema)
+                log["reconstructions_ema"] = xrec_ema
+        return log
+    def to_rgb(self, x):
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class VQModelInterface(VQModel):
+    def __init__(self, embed_dim, *args, **kwargs):
+        super().__init__(embed_dim=embed_dim, *args, **kwargs)
+        self.embed_dim = embed_dim
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        return h
+    def decode(self, h, force_not_quantize=False):
+        # also go through quantization layer
+        if not force_not_quantize:
+            quant, emb_loss, info = self.quantize(h)
+        else:
+            quant = h
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+class AutoencoderKL(pl.LightningModule):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 ):
+        super().__init__()
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        self.load_state_dict(sd, strict=False)
+        print(f"Restored from {path}")
+    def encode(self, x):
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+    def decode(self, z):
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+    def forward(self, input, sample_posterior=True):
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior
+    def get_input(self, batch, k):
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        return x
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        if optimizer_idx == 0:
+            # train encoder+decoder+logvar
+            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return aeloss
+        if optimizer_idx == 1:
+            # train the discriminator
+            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                last_layer=self.get_last_layer(), split="train")
+            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return discloss
+    def validation_step(self, batch, batch_idx):
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val")
+        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        return self.log_dict
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr, betas=(0.5, 0.9))
+        return [opt_ae, opt_disc], []
+    def get_last_layer(self):
+        """Return the weight of the decoder's output convolution (passed to the loss as ``last_layer``)."""
+        return self.decoder.conv_out.weight
+    @torch.no_grad()
+    def log_images(self, batch, only_inputs=False, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if not only_inputs:
+            xrec, posterior = self(x)
+            if x.shape[1] > 3:
+                # colorize with random projection
+                assert xrec.shape[1] > 3
+                x = self.to_rgb(x)
+                xrec = self.to_rgb(xrec)
+            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+            log["reconstructions"] = xrec
+        log["inputs"] = x
+        return log
+    def to_rgb(self, x):
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+class IdentityFirstStage(torch.nn.Module):
+    def __init__(self, *args, vq_interface=False, **kwargs):
+        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
+        super().__init__()
+    def encode(self, x, *args, **kwargs):
+        return x
+    def decode(self, x, *args, **kwargs):
+        return x
+    def quantize(self, x, *args, **kwargs):
+        if self.vq_interface:
+            return x, None, [None, None, None]
+        return x
+    def forward(self, x, *args, **kwargs):
+        return x

ldm/models/diffusion/__init__.py ADDED Viewed

File without changes

ldm/models/diffusion/bank.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from .misc_4ddpm import *
+from ldm.modules.attention import BasicTransformerBlock
+class Bank:
+    def __init__(self,reader:nn.Module, writer:nn.Module) -> None:
+        """
+        For the DFS model, mark every BasicTransformerBlock with name_4bank and isReader_4bank flags.
+        Similar logic applies for the writer while checking for BasicTransformerBlock instances.
+        """
+        self.name2data = {}
+        self.name2count = {}  # track how many times each name has been retrieved
+        self.WHEN_clear_a_field = 2  # clear the entry after this many gets
+        skip_names = [
+            'input_blocks.1.1.transformer_blocks.0',
+            'input_blocks.2.1.transformer_blocks.0',
+            # 'input_blocks.4.1.transformer_blocks.0',
+            # 'input_blocks.5.1.transformer_blocks.0',
+            # 'input_blocks.7.1.transformer_blocks.0',
+            # 'input_blocks.8.1.transformer_blocks.0',
+            ##-----------all middle and output_blocks (everything outside input_blocks)----
+            'middle_block.1.transformer_blocks.0',
+            'output_blocks.3.1.transformer_blocks.0',
+            'output_blocks.4.1.transformer_blocks.0',
+            'output_blocks.5.1.transformer_blocks.0',
+            'output_blocks.6.1.transformer_blocks.0',
+            'output_blocks.7.1.transformer_blocks.0',
+            'output_blocks.8.1.transformer_blocks.0',
+            'output_blocks.9.1.transformer_blocks.0',
+            'output_blocks.10.1.transformer_blocks.0',
+            'output_blocks.11.1.transformer_blocks.0',
+        ]
+        # print(f"{skip_names=}")
+        l_name = []
+        for name, _module in writer.named_modules():
+            if isinstance(_module, BasicTransformerBlock):
+                if DEBUG:
+                    print(f"{name=}")
+                if name in skip_names:
+                    # print(f"skip {name=}")
+                    continue
+                _module.bank = self
+                _module.name4bank = name
+                _module.isReader_4bank = False
+                l_name.append(name)
+        # print(f"{l_name=}")
+        for name, _module in reader.named_modules():
+            if isinstance(_module, BasicTransformerBlock):
+                if name not in l_name:
+                    continue
+                _module.bank = self
+                _module.name4bank = name
+                _module.isReader_4bank = True
+    def set(self,name,data):
+        self.name2data[name] = data
+        # self.name2count[name] = 0
+    def get(self,name):
+        printC('bank get', name)
+        if name in self.name2data:
+            if name not in self.name2count:
+                self.name2count[name] = 0
+            self.name2count[name] += 1
+            data = self.name2data[name]
+            if self.name2count[name] >= self.WHEN_clear_a_field: # once the max get count is reached, remove the entry
+                del self.name2data[name]
+                del self.name2count[name]
+            return data
+        raise Exception(f"{name}\n{list(self.name2data.keys())}")
+        return None
+    def clear(self,):
+        printC('clear')
+        printC('mean ct:', sum( self.name2count.values() ) / len( self.name2count.values() ) if len( self.name2count.values() )>0 else 'null'   )
+        self.name2data.clear()
+        self.name2count.clear()

ldm/models/diffusion/classifier.py ADDED Viewed

	@@ -0,0 +1,267 @@

+import os
+import torch
+import pytorch_lightning as pl
+from omegaconf import OmegaConf
+from torch.nn import functional as F
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import LambdaLR
+from copy import deepcopy
+from einops import rearrange
+from glob import glob
+from natsort import natsorted
+from ldm.modules.diffusionmodules.openaimodel import EncoderUNetModel, UNetModel
+from ldm.util import log_txt_as_img, default, ismap, instantiate_from_config
+# Classifier architecture to instantiate for each supported label key (looked up in load_classifier).
+__models__ = {
+    'class_label': EncoderUNetModel,
+    'segmentation': UNetModel
+}
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode
+    does not change anymore. ``mode`` is ignored and ``self`` is returned unchanged."""
+    return self
+class NoisyLatentImageClassifier(pl.LightningModule):
+    def __init__(self,
+                 diffusion_path,
+                 num_classes,
+                 ckpt_path=None,
+                 pool='attention',
+                 label_key=None,
+                 diffusion_ckpt_path=None,
+                 scheduler_config=None,
+                 weight_decay=1.e-2,
+                 log_steps=10,
+                 monitor='val/loss',
+                 *args,
+                 **kwargs):
+        """Load a frozen diffusion model and build the classifier that is trained on its noisy inputs.
+        Args:
+            diffusion_path: directory whose ``configs/*-project.yaml`` files are searched; the last
+                one in natural sort order supplies the diffusion model config.
+            num_classes: number of output channels of the classifier.
+            ckpt_path: optional classifier checkpoint, forwarded to ``load_classifier``.
+            pool: pooling setting, applied by ``load_classifier`` for the 'class_label' key.
+            label_key: batch key of the labels; only used when the diffusion model has no
+                ``cond_stage_key`` attribute.
+            diffusion_ckpt_path: written into the diffusion config as ``params.ckpt_path``.
+            scheduler_config: optional LR scheduler config; a scheduler is used when it is not None.
+            weight_decay: stored for the optimizer.
+            log_steps: number of timesteps visited by ``log_images``.
+            monitor: stored as ``self.monitor``.
+        Raises:
+            NotImplementedError: if the resolved label key is not a key of ``__models__``.
+        """
+        super().__init__(*args, **kwargs)
+        self.num_classes = num_classes
+        # get latest config of diffusion model
+        # NOTE(review): raises IndexError when no '*-project.yaml' exists under diffusion_path/configs.
+        diffusion_config = natsorted(glob(os.path.join(diffusion_path, 'configs', '*-project.yaml')))[-1]
+        self.diffusion_config = OmegaConf.load(diffusion_config).model
+        self.diffusion_config.params.ckpt_path = diffusion_ckpt_path
+        self.load_diffusion()
+        self.monitor = monitor
+        # Number of 2x downsampling rounds applied to segmentation targets in get_conditioning.
+        self.numd = self.diffusion_model.first_stage_model.encoder.num_resolutions - 1
+        self.log_time_interval = self.diffusion_model.num_timesteps // log_steps
+        self.log_steps = log_steps
+        # The diffusion model's cond_stage_key takes precedence over the label_key argument.
+        self.label_key = label_key if not hasattr(self.diffusion_model, 'cond_stage_key') \
+            else self.diffusion_model.cond_stage_key
+        assert self.label_key is not None, 'label_key neither in diffusion model nor in model.params'
+        if self.label_key not in __models__:
+            raise NotImplementedError()
+        self.load_classifier(ckpt_path, pool)
+        self.scheduler_config = scheduler_config
+        self.use_scheduler = self.scheduler_config is not None
+        self.weight_decay = weight_decay
+    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+        sd = torch.load(path, map_location="cpu")
+        if "state_dict" in list(sd.keys()):
+            sd = sd["state_dict"]
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print("Deleting key {} from state_dict.".format(k))
+                    del sd[k]
+        missing, unexpected = self.load_state_dict(sd, strict=False) if not only_model else self.model.load_state_dict(
+            sd, strict=False)
+        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        if len(missing) > 0:
+            print(f"Missing Keys: {missing}")
+        if len(unexpected) > 0:
+            print(f"Unexpected Keys: {unexpected}")
+    def load_diffusion(self):
+        model = instantiate_from_config(self.diffusion_config)
+        self.diffusion_model = model.eval()
+        self.diffusion_model.train = disabled_train
+        for param in self.diffusion_model.parameters():
+            param.requires_grad = False
+    def load_classifier(self, ckpt_path, pool):
+        model_config = deepcopy(self.diffusion_config.params.unet_config.params)
+        model_config.in_channels = self.diffusion_config.params.unet_config.params.out_channels
+        model_config.out_channels = self.num_classes
+        if self.label_key == 'class_label':
+            model_config.pool = pool
+        self.model = __models__[self.label_key](**model_config)
+        if ckpt_path is not None:
+            print('#####################################################################')
+            print(f'load from ckpt "{ckpt_path}"')
+            print('#####################################################################')
+            self.init_from_ckpt(ckpt_path)
+    @torch.no_grad()
+    def get_x_noisy(self, x, t, noise=None):
+        noise = default(noise, lambda: torch.randn_like(x))
+        continuous_sqrt_alpha_cumprod = None
+        if self.diffusion_model.use_continuous_noise:
+            continuous_sqrt_alpha_cumprod = self.diffusion_model.sample_continuous_noise_level(x.shape[0], t + 1)
+            # todo: make sure t+1 is correct here
+        return self.diffusion_model.q_sample(x_start=x, t=t, noise=noise,
+                                             continuous_sqrt_alpha_cumprod=continuous_sqrt_alpha_cumprod)
+    def forward(self, x_noisy, t, *args, **kwargs):
+        """Return the classifier output for ``x_noisy`` at timestep ``t``; extra arguments are ignored."""
+        return self.model(x_noisy, t)
+    @torch.no_grad()
+    def get_input(self, batch, k):
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = rearrange(x, 'b h w c -> b c h w')
+        x = x.to(memory_format=torch.contiguous_format).float()
+        return x
+    @torch.no_grad()
+    def get_conditioning(self, batch, k=None):
+        if k is None:
+            k = self.label_key
+        assert k is not None, 'Needs to provide label key'
+        targets = batch[k].to(self.device)
+        if self.label_key == 'segmentation':
+            targets = rearrange(targets, 'b h w c -> b c h w')
+            for down in range(self.numd):
+                h, w = targets.shape[-2:]
+                targets = F.interpolate(targets, size=(h // 2, w // 2), mode='nearest')
+            # targets = rearrange(targets,'b c h w -> b h w c')
+        return targets
+    def compute_top_k(self, logits, labels, k, reduction="mean"):
+        _, top_ks = torch.topk(logits, k, dim=1)
+        if reduction == "mean":
+            return (top_ks == labels[:, None]).float().sum(dim=-1).mean().item()
+        elif reduction == "none":
+            return (top_ks == labels[:, None]).float().sum(dim=-1)
+    def on_train_epoch_start(self):
+        """Move the diffusion model's inner ``model`` to the CPU at the start of each training epoch."""
+        # save some memory
+        self.diffusion_model.model.to('cpu')
+    @torch.no_grad()
+    def write_logs(self, loss, logits, targets):
+        log_prefix = 'train' if self.training else 'val'
+        log = {}
+        log[f"{log_prefix}/loss"] = loss.mean()
+        log[f"{log_prefix}/acc@1"] = self.compute_top_k(
+            logits, targets, k=1, reduction="mean"
+        )
+        log[f"{log_prefix}/acc@5"] = self.compute_top_k(
+            logits, targets, k=5, reduction="mean"
+        )
+        self.log_dict(log, prog_bar=False, logger=True, on_step=self.training, on_epoch=True)
+        self.log('loss', log[f"{log_prefix}/loss"], prog_bar=True, logger=False)
+        self.log('global_step', self.global_step, logger=False, on_epoch=False, prog_bar=True)
+        lr = self.optimizers().param_groups[0]['lr']
+        self.log('lr_abs', lr, on_step=True, logger=True, on_epoch=False, prog_bar=True)
+    def shared_step(self, batch, t=None):
+        x, *_ = self.diffusion_model.get_input(batch, k=self.diffusion_model.first_stage_key)
+        targets = self.get_conditioning(batch)
+        if targets.dim() == 4:
+            targets = targets.argmax(dim=1)
+        if t is None:
+            t = torch.randint(0, self.diffusion_model.num_timesteps, (x.shape[0],), device=self.device).long()
+        else:
+            t = torch.full(size=(x.shape[0],), fill_value=t, device=self.device).long()
+        x_noisy = self.get_x_noisy(x, t)
+        logits = self(x_noisy, t)
+        loss = F.cross_entropy(logits, targets, reduction='none')
+        self.write_logs(loss.detach(), logits.detach(), targets.detach())
+        loss = loss.mean()
+        return loss, logits, x_noisy, targets
+    def training_step(self, batch, batch_idx):
+        """Return the mean classification loss of ``shared_step`` at randomly drawn timesteps."""
+        loss, *_ = self.shared_step(batch)
+        return loss
+    def reset_noise_accs(self):
+        self.noisy_acc = {t: {'acc@1': [], 'acc@5': []} for t in
+                          range(0, self.diffusion_model.num_timesteps, self.diffusion_model.log_every_t)}
+    def on_validation_start(self):
+        """Start each validation run with empty per-timestep accuracy lists."""
+        self.reset_noise_accs()
+    @torch.no_grad()
+    def validation_step(self, batch, batch_idx):
+        loss, *_ = self.shared_step(batch)
+        for t in self.noisy_acc:
+            _, logits, _, targets = self.shared_step(batch, t)
+            self.noisy_acc[t]['acc@1'].append(self.compute_top_k(logits, targets, k=1, reduction='mean'))
+            self.noisy_acc[t]['acc@5'].append(self.compute_top_k(logits, targets, k=5, reduction='mean'))
+        return loss
+    def configure_optimizers(self):
+        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay)
+        if self.use_scheduler:
+            scheduler = instantiate_from_config(self.scheduler_config)
+            print("Setting up LambdaLR scheduler...")
+            scheduler = [
+                {
+                    'scheduler': LambdaLR(optimizer, lr_lambda=scheduler.schedule),
+                    'interval': 'step',
+                    'frequency': 1
+                }]
+            return [optimizer], scheduler
+        return optimizer
+    @torch.no_grad()
+    def log_images(self, batch, N=8, *args, **kwargs):
+        """Build a dict of items for image logging, each truncated to its first ``N`` elements.
+        'inputs' is always present. 'labels' is added for the 'class_label' key and whenever
+        ``ismap`` accepts the conditioning; in the latter case noisy inputs and predictions are
+        added for ``log_steps`` timesteps spaced ``log_time_interval`` apart.
+        """
+        log = dict()
+        x = self.get_input(batch, self.diffusion_model.first_stage_key)
+        log['inputs'] = x
+        y = self.get_conditioning(batch)
+        if self.label_key == 'class_label':
+            # presumably renders the "human_label" strings as images at the inputs' spatial size - confirm
+            y = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"])
+            log['labels'] = y
+        if ismap(y):
+            log['labels'] = self.diffusion_model.to_rgb(y)
+            for step in range(self.log_steps):
+                current_time = step * self.log_time_interval
+                # shared_step also writes metrics through write_logs on every call.
+                _, logits, x_noisy, _ = self.shared_step(batch, t=current_time)
+                log[f'inputs@t{current_time}'] = x_noisy
+                # One-hot puts the class axis last; move it to the channel position before colorizing.
+                pred = F.one_hot(logits.argmax(dim=1), num_classes=self.num_classes)
+                pred = rearrange(pred, 'b h w c -> b c h w')
+                log[f'pred@t{current_time}'] = self.diffusion_model.to_rgb(pred)
+        for key in log:
+            log[key] = log[key][:N]
+        return log