Upload 9 files

- anime-seg/README.md +7 -0
- anime-seg/isnetis.ckpt +3 -0
- anime-seg/isnetis.onnx +3 -0
- app.py +100 -0
- model/__init__.py +7 -0
- model/isnet.py +611 -0
- model/modnet.py +667 -0
- model/u2net.py +228 -0
- requirements.txt +3 -0
anime-seg/README.md
ADDED
@@ -0,0 +1,7 @@
+---
+license: apache-2.0
+---
+
+## Anime Segmentation Models
+
+Models from [https://github.com/SkyTNT/anime-segmentation](https://github.com/SkyTNT/anime-segmentation)
anime-seg/isnetis.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c8f6b9a77386c54dcdbf55b6c917108c4bdf4328abca9152c7bce5727b74d18
+size 204275908
anime-seg/isnetis.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f15622d853e8260172812b657053460e20806f04b9e05147d49af7bed31a6e99
+size 176069933
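Both weight files are Git LFS pointers, so the actual binaries arrive only after `git lfs pull`. Below is a minimal sketch, not included in this upload, of running the ONNX export with onnxruntime; it assumes the same preprocessing as `get_mask` in `app.py` (RGB scaled to [0, 1], CHW, float32) and a single sigmoid mask output, and `input.jpg`/`mask.png` are placeholder paths:

# Sketch: ONNX inference for anime-seg/isnetis.onnx (preprocessing assumed to mirror app.py).
import cv2
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("anime-seg/isnetis.onnx")
inp = session.get_inputs()[0]
# Assumption: static NCHW input; fall back to 1024 if the spatial dimension is dynamic.
size = inp.shape[2] if isinstance(inp.shape[2], int) else 1024

img = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB)  # placeholder path
h, w = img.shape[:2]
x = cv2.resize(img, (size, size)).astype(np.float32) / 255.0
x = x.transpose(2, 0, 1)[np.newaxis]  # (1, 3, size, size)

mask = session.run(None, {inp.name: x})[0][0]  # assumed (1, size, size) mask in [0, 1]
mask = cv2.resize(mask.transpose(1, 2, 0), (w, h))
cv2.imwrite("mask.png", (mask * 255).astype(np.uint8))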
app.py
ADDED
@@ -0,0 +1,100 @@
+import gradio as gr
+import os
+import torch
+import numpy as np
+from PIL import Image
+import cv2
+import pytorch_lightning as pl
+from model import ISNetDIS, ISNetGTEncoder, U2NET, U2NET_full2, U2NET_lite2, MODNet
+
+def get_mask(model, input_img):
+    h, w = input_img.shape[0], input_img.shape[1]
+    ph, pw = 0, 0
+    tmpImg = np.zeros([h, w, 3], dtype=np.float16)
+    tmpImg[ph // 2:ph // 2 + h, pw // 2:pw // 2 + w] = cv2.resize(input_img, (w, h)) / 255
+    tmpImg = tmpImg.transpose((2, 0, 1))
+    tmpImg = torch.from_numpy(tmpImg).unsqueeze(0).type(torch.FloatTensor).to(model.device)
+    with torch.no_grad():
+        pred = model(tmpImg)
+        pred = pred[0, :, ph // 2:ph // 2 + h, pw // 2:pw // 2 + w]
+        pred = cv2.resize(pred.cpu().numpy().transpose((1, 2, 0)), (w, h))[:, :, np.newaxis]
+        return pred
+
+def get_net(net_name):
+    if net_name == "isnet":
+        return ISNetDIS()
+    elif net_name == "isnet_is":
+        return ISNetDIS()
+    elif net_name == "isnet_gt":
+        return ISNetGTEncoder()
+    elif net_name == "u2net":
+        return U2NET_full2()
+    elif net_name == "u2netl":
+        return U2NET_lite2()
+    elif net_name == "modnet":
+        return MODNet()
+    raise NotImplementedError(net_name)
+
+# from anime-segmentation.train
+class AnimeSegmentation(pl.LightningModule):
+    def __init__(self, net_name):
+        super().__init__()
+        assert net_name in ["isnet_is", "isnet", "isnet_gt", "u2net", "u2netl", "modnet"]
+        self.net = get_net(net_name)
+        if net_name == "isnet_is":
+            self.gt_encoder = get_net("isnet_gt")
+            for param in self.gt_encoder.parameters():
+                param.requires_grad = False
+        else:
+            self.gt_encoder = None
+
+    @classmethod
+    def try_load(cls, net_name, ckpt_path, map_location=None):
+        state_dict = torch.load(ckpt_path, map_location=map_location)
+        if "epoch" in state_dict:
+            return cls.load_from_checkpoint(ckpt_path, net_name=net_name, map_location=map_location)
+        else:
+            model = cls(net_name)
+            if any(k.startswith("net.") for k in state_dict):
+                model.load_state_dict(state_dict)
+            else:
+                model.net.load_state_dict(state_dict)
+            return model
+
+    def forward(self, x):
+        if isinstance(self.net, ISNetDIS):
+            return self.net(x)[0][0].sigmoid()
+        if isinstance(self.net, ISNetGTEncoder):
+            return self.net(x)[0][0].sigmoid()
+        elif isinstance(self.net, U2NET):
+            return self.net(x)[0].sigmoid()
+        elif isinstance(self.net, MODNet):
+            return self.net(x, True)[2]
+        raise NotImplementedError(type(self.net))
+
+def animeseg(image):
+    if image is None:
+        return None
+
+    model = AnimeSegmentation.try_load('isnet_is', 'anime-seg/isnetis.ckpt', 'cuda')
+    model.eval()
+    model.to('cuda')
+
+    img = np.array(image, dtype=np.uint8)
+    mask = get_mask(model, img)
+    img = np.concatenate((mask * img + 1 - mask, mask * 255), axis=2).astype(np.uint8)
+    return img
+
+with gr.Blocks() as demo:
+    title = gr.Markdown('# katanuki')
+    with gr.Row():
+        src_image = gr.Image(label="Source", sources="upload", interactive=True, type="pil")
+        dst_image = gr.Image(label="Result", interactive=False, type="numpy")
+
+    src_image.change(
+        fn=animeseg,
+        inputs=[src_image],
+        outputs=[dst_image],
+    )
+
+demo.launch()
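A minimal sketch of using these pieces without the Gradio UI, assuming `AnimeSegmentation` and `get_mask` have been copied into an importable module (importing `app.py` as-is would call `demo.launch()`), the checkpoint has been pulled via Git LFS, and a CUDA device is available; `input.jpg` and `cutout.png` are placeholder paths:

import cv2
import numpy as np

# Load the model once; animeseg() in app.py reloads it on every request.
model = AnimeSegmentation.try_load("isnet_is", "anime-seg/isnetis.ckpt", "cuda")
model.eval()
model.to("cuda")

img = cv2.cvtColor(cv2.imread("input.jpg"), cv2.COLOR_BGR2RGB)  # placeholder path
mask = get_mask(model, img)  # (h, w, 1) float mask in [0, 1]
# Same compositing as animeseg(): mask-weighted RGB plus the mask as alpha.
rgba = np.concatenate((mask * img + 1 - mask, mask * 255), axis=2).astype(np.uint8)
cv2.imwrite("cutout.png", cv2.cvtColor(rgba, cv2.COLOR_RGBA2BGRA))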
model/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .u2net import U2NET_full
+from .u2net import U2NET_full2
+from .u2net import U2NET_lite
+from .u2net import U2NET_lite2
+from .u2net import U2NET
+from .isnet import ISNetDIS, ISNetGTEncoder
+from .modnet import MODNet
model/isnet.py
ADDED
@@ -0,0 +1,611 @@
+# Codes are borrowed from
+# https://github.com/xuebinqin/DIS/blob/main/IS-Net/models/isnet.py
+
+import torch
+import torch.nn as nn
+from torchvision import models
+import torch.nn.functional as F
+
+bce_loss = nn.BCEWithLogitsLoss(reduction="mean")
+
+
+def muti_loss_fusion(preds, target):
+    loss0 = 0.0
+    loss = 0.0
+
+    for i in range(0, len(preds)):
+        if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
+            tmp_target = F.interpolate(target, size=preds[i].size()[2:], mode='bilinear', align_corners=True)
+            loss = loss + bce_loss(preds[i], tmp_target)
+        else:
+            loss = loss + bce_loss(preds[i], target)
+        if i == 0:
+            loss0 = loss
+    return loss0, loss
+
+
+fea_loss = nn.MSELoss(reduction="mean")
+kl_loss = nn.KLDivLoss(reduction="mean")
+l1_loss = nn.L1Loss(reduction="mean")
+smooth_l1_loss = nn.SmoothL1Loss(reduction="mean")
+
+
+def muti_loss_fusion_kl(preds, target, dfs, fs, mode='MSE'):
+    loss0 = 0.0
+    loss = 0.0
+
+    for i in range(0, len(preds)):
+        if preds[i].shape[2] != target.shape[2] or preds[i].shape[3] != target.shape[3]:
+            tmp_target = F.interpolate(target, size=preds[i].size()[2:], mode='bilinear', align_corners=True)
+            loss = loss + bce_loss(preds[i], tmp_target)
+        else:
+            loss = loss + bce_loss(preds[i], target)
+        if i == 0:
+            loss0 = loss
+
+    for i in range(0, len(dfs)):
+        df = dfs[i]
+        fs_i = fs[i]
+        if mode == 'MSE':
+            loss = loss + fea_loss(df, fs_i)  ### add the mse loss of features as additional constraints
+        elif mode == 'KL':
+            loss = loss + kl_loss(F.log_softmax(df, dim=1), F.softmax(fs_i, dim=1))
+        elif mode == 'MAE':
+            loss = loss + l1_loss(df, fs_i)
+        elif mode == 'SmoothL1':
+            loss = loss + smooth_l1_loss(df, fs_i)
+
+    return loss0, loss
+
+
+class REBNCONV(nn.Module):
+    def __init__(self, in_ch=3, out_ch=3, dirate=1, stride=1):
+        super(REBNCONV, self).__init__()
+
+        self.conv_s1 = nn.Conv2d(in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate, stride=stride)
+        self.bn_s1 = nn.BatchNorm2d(out_ch)
+        self.relu_s1 = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        hx = x
+        xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
+
+        return xout
+
+
+## upsample tensor 'src' to have the same spatial size with tensor 'tar'
+def _upsample_like(src, tar):
+    src = F.interpolate(src, size=tar.shape[2:], mode='bilinear', align_corners=False)
+
+    return src
+
+
+### RSU-7 ###
+class RSU7(nn.Module):
+
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3, img_size=512):
+        super(RSU7, self).__init__()
+
+        self.in_ch = in_ch
+        self.mid_ch = mid_ch
+        self.out_ch = out_ch
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)  ## 1 -> 1/2
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+        self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+        self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+    def forward(self, x):
+        b, c, h, w = x.shape
+
+        hx = x
+        hxin = self.rebnconvin(hx)
+
+        hx1 = self.rebnconv1(hxin)
+        hx = self.pool1(hx1)
+
+        hx2 = self.rebnconv2(hx)
+        hx = self.pool2(hx2)
+
+        hx3 = self.rebnconv3(hx)
+        hx = self.pool3(hx3)
+
+        hx4 = self.rebnconv4(hx)
+        hx = self.pool4(hx4)
+
+        hx5 = self.rebnconv5(hx)
+        hx = self.pool5(hx5)
+
+        hx6 = self.rebnconv6(hx)
+
+        hx7 = self.rebnconv7(hx6)
+
+        hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1))
+        hx6dup = _upsample_like(hx6d, hx5)
+
+        hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1))
+        hx5dup = _upsample_like(hx5d, hx4)
+
+        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
+        hx4dup = _upsample_like(hx4d, hx3)
+
+        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+        hx3dup = _upsample_like(hx3d, hx2)
+
+        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+        hx2dup = _upsample_like(hx2d, hx1)
+
+        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+        return hx1d + hxin
+
+
+### RSU-6 ###
+class RSU6(nn.Module):
+
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+        super(RSU6, self).__init__()
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+        self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+        self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.rebnconvin(hx)
+
+        hx1 = self.rebnconv1(hxin)
+        hx = self.pool1(hx1)
+
+        hx2 = self.rebnconv2(hx)
+        hx = self.pool2(hx2)
+
+        hx3 = self.rebnconv3(hx)
+        hx = self.pool3(hx3)
+
+        hx4 = self.rebnconv4(hx)
+        hx = self.pool4(hx4)
+
+        hx5 = self.rebnconv5(hx)
+
+        hx6 = self.rebnconv6(hx5)
+
+        hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1))
+        hx5dup = _upsample_like(hx5d, hx4)
+
+        hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1))
+        hx4dup = _upsample_like(hx4d, hx3)
+
+        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+        hx3dup = _upsample_like(hx3d, hx2)
+
+        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+        hx2dup = _upsample_like(hx2d, hx1)
+
+        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+        return hx1d + hxin
+
+
+### RSU-5 ###
+class RSU5(nn.Module):
+
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+        super(RSU5, self).__init__()
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+        self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.rebnconvin(hx)
+
+        hx1 = self.rebnconv1(hxin)
+        hx = self.pool1(hx1)
+
+        hx2 = self.rebnconv2(hx)
+        hx = self.pool2(hx2)
+
+        hx3 = self.rebnconv3(hx)
+        hx = self.pool3(hx3)
+
+        hx4 = self.rebnconv4(hx)
+
+        hx5 = self.rebnconv5(hx4)
+
+        hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1))
+        hx4dup = _upsample_like(hx4d, hx3)
+
+        hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1))
+        hx3dup = _upsample_like(hx3d, hx2)
+
+        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+        hx2dup = _upsample_like(hx2d, hx1)
+
+        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+        return hx1d + hxin
+
+
+### RSU-4 ###
+class RSU4(nn.Module):
+
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+        super(RSU4, self).__init__()
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2)
+
+        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1)
+        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.rebnconvin(hx)
+
+        hx1 = self.rebnconv1(hxin)
+        hx = self.pool1(hx1)
+
+        hx2 = self.rebnconv2(hx)
+        hx = self.pool2(hx2)
+
+        hx3 = self.rebnconv3(hx)
+
+        hx4 = self.rebnconv4(hx3)
+
+        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
+        hx3dup = _upsample_like(hx3d, hx2)
+
+        hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1))
+        hx2dup = _upsample_like(hx2d, hx1)
+
+        hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1))
+
+        return hx1d + hxin
+
+
+### RSU-4F ###
+class RSU4F(nn.Module):
+
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3):
+        super(RSU4F, self).__init__()
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2)
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8)
+
+        self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4)
+        self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2)
+        self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1)
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.rebnconvin(hx)
+
+        hx1 = self.rebnconv1(hxin)
+        hx2 = self.rebnconv2(hx1)
+        hx3 = self.rebnconv3(hx2)
+
+        hx4 = self.rebnconv4(hx3)
+
+        hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1))
+        hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1))
+        hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1))
+
+        return hx1d + hxin
+
+
+class myrebnconv(nn.Module):
+    def __init__(self, in_ch=3,
+                 out_ch=1,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 dilation=1,
+                 groups=1):
+        super(myrebnconv, self).__init__()
+
+        self.conv = nn.Conv2d(in_ch,
+                              out_ch,
+                              kernel_size=kernel_size,
+                              stride=stride,
+                              padding=padding,
+                              dilation=dilation,
+                              groups=groups)
+        self.bn = nn.BatchNorm2d(out_ch)
+        self.rl = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        return self.rl(self.bn(self.conv(x)))
+
+
+class ISNetGTEncoder(nn.Module):
+
+    def __init__(self, in_ch=1, out_ch=1):
+        super(ISNetGTEncoder, self).__init__()
+
+        self.conv_in = myrebnconv(in_ch, 16, 3, stride=2, padding=1)  # nn.Conv2d(in_ch,64,3,stride=2,padding=1)
+
+        self.stage1 = RSU7(16, 16, 64)
+        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage2 = RSU6(64, 16, 64)
+        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage3 = RSU5(64, 32, 128)
+        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage4 = RSU4(128, 32, 256)
+        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage5 = RSU4F(256, 64, 512)
+        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage6 = RSU4F(512, 64, 512)
+
+        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
+        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
+        self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
+        self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
+        self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
+        self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
+
+    @staticmethod
+    def compute_loss(args):
+        preds, targets = args
+        return muti_loss_fusion(preds, targets)
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.conv_in(hx)
+        # hx = self.pool_in(hxin)
+
+        # stage 1
+        hx1 = self.stage1(hxin)
+        hx = self.pool12(hx1)
+
+        # stage 2
+        hx2 = self.stage2(hx)
+        hx = self.pool23(hx2)
+
+        # stage 3
+        hx3 = self.stage3(hx)
+        hx = self.pool34(hx3)
+
+        # stage 4
+        hx4 = self.stage4(hx)
+        hx = self.pool45(hx4)
+
+        # stage 5
+        hx5 = self.stage5(hx)
+        hx = self.pool56(hx5)
+
+        # stage 6
+        hx6 = self.stage6(hx)
+
+        # side output
+        d1 = self.side1(hx1)
+        d1 = _upsample_like(d1, x)
+
+        d2 = self.side2(hx2)
+        d2 = _upsample_like(d2, x)
+
+        d3 = self.side3(hx3)
+        d3 = _upsample_like(d3, x)
+
+        d4 = self.side4(hx4)
+        d4 = _upsample_like(d4, x)
+
+        d5 = self.side5(hx5)
+        d5 = _upsample_like(d5, x)
+
+        d6 = self.side6(hx6)
+        d6 = _upsample_like(d6, x)
+
+        # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
+
+        # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1, hx2, hx3, hx4, hx5, hx6]
+        return [d1, d2, d3, d4, d5, d6], [hx1, hx2, hx3, hx4, hx5, hx6]
+
+
+class ISNetDIS(nn.Module):
+
+    def __init__(self, in_ch=3, out_ch=1):
+        super(ISNetDIS, self).__init__()
+
+        self.conv_in = nn.Conv2d(in_ch, 64, 3, stride=2, padding=1)
+        self.pool_in = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage1 = RSU7(64, 32, 64)
+        self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage2 = RSU6(64, 32, 128)
+        self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage3 = RSU5(128, 64, 256)
+        self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage4 = RSU4(256, 128, 512)
+        self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage5 = RSU4F(512, 256, 512)
+        self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.stage6 = RSU4F(512, 256, 512)
+
+        # decoder
+        self.stage5d = RSU4F(1024, 256, 512)
+        self.stage4d = RSU4(1024, 128, 256)
+        self.stage3d = RSU5(512, 64, 128)
+        self.stage2d = RSU6(256, 32, 64)
+        self.stage1d = RSU7(128, 16, 64)
+
+        self.side1 = nn.Conv2d(64, out_ch, 3, padding=1)
+        self.side2 = nn.Conv2d(64, out_ch, 3, padding=1)
+        self.side3 = nn.Conv2d(128, out_ch, 3, padding=1)
+        self.side4 = nn.Conv2d(256, out_ch, 3, padding=1)
+        self.side5 = nn.Conv2d(512, out_ch, 3, padding=1)
+        self.side6 = nn.Conv2d(512, out_ch, 3, padding=1)
+
+        # self.outconv = nn.Conv2d(6*out_ch,out_ch,1)
+
+    @staticmethod
+    def compute_loss_kl(preds, targets, dfs, fs, mode='MSE'):
+        return muti_loss_fusion_kl(preds, targets, dfs, fs, mode=mode)
+
+    @staticmethod
+    def compute_loss(args):
+        if len(args) == 3:
+            ds, dfs, labels = args
+            return muti_loss_fusion(ds, labels)
+        else:
+            ds, dfs, labels, fs = args
+            return muti_loss_fusion_kl(ds, labels, dfs, fs, mode="MSE")
+
+    def forward(self, x):
+        hx = x
+
+        hxin = self.conv_in(hx)
+        hx = self.pool_in(hxin)
+
+        # stage 1
+        hx1 = self.stage1(hxin)
+        hx = self.pool12(hx1)
+
+        # stage 2
+        hx2 = self.stage2(hx)
+        hx = self.pool23(hx2)
+
+        # stage 3
+        hx3 = self.stage3(hx)
+        hx = self.pool34(hx3)
+
+        # stage 4
+        hx4 = self.stage4(hx)
+        hx = self.pool45(hx4)
+
+        # stage 5
+        hx5 = self.stage5(hx)
+        hx = self.pool56(hx5)
+
+        # stage 6
+        hx6 = self.stage6(hx)
+        hx6up = _upsample_like(hx6, hx5)
+
+        # -------------------- decoder --------------------
+        hx5d = self.stage5d(torch.cat((hx6up, hx5), 1))
+        hx5dup = _upsample_like(hx5d, hx4)
+
+        hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1))
+        hx4dup = _upsample_like(hx4d, hx3)
+
+        hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1))
+        hx3dup = _upsample_like(hx3d, hx2)
+
+        hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1))
+        hx2dup = _upsample_like(hx2d, hx1)
+
+        hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1))
+
+        # side output
+        d1 = self.side1(hx1d)
+        d1 = _upsample_like(d1, x)
+
+        d2 = self.side2(hx2d)
+        d2 = _upsample_like(d2, x)
+
+        d3 = self.side3(hx3d)
+        d3 = _upsample_like(d3, x)
+
+        d4 = self.side4(hx4d)
+        d4 = _upsample_like(d4, x)
+
+        d5 = self.side5(hx5d)
+        d5 = _upsample_like(d5, x)
+
+        d6 = self.side6(hx6)
+        d6 = _upsample_like(d6, x)
+
+        # d0 = self.outconv(torch.cat((d1,d2,d3,d4,d5,d6),1))
+
+        # return [torch.sigmoid(d1), torch.sigmoid(d2), torch.sigmoid(d3), torch.sigmoid(d4), torch.sigmoid(d5), torch.sigmoid(d6)], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
+        return [d1, d2, d3, d4, d5, d6], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6]
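A short smoke test, not included in this upload, showing the output contract that `AnimeSegmentation.forward` in `app.py` relies on: `ISNetDIS` returns six side-output logit maps at input resolution (finest first) plus six intermediate feature maps, and the mask is the sigmoid of the first side output:

import torch
from model.isnet import ISNetDIS

net = ISNetDIS().eval()
x = torch.randn(1, 3, 256, 256)  # arbitrary size; RSU blocks re-align shapes via interpolation
with torch.no_grad():
    side_outputs, features = net(x)

print(len(side_outputs), len(features))  # 6 6
print(side_outputs[0].shape)             # torch.Size([1, 1, 256, 256])
mask = side_outputs[0].sigmoid()         # matches app.py: self.net(x)[0][0].sigmoid()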
model/modnet.py
ADDED
|
@@ -0,0 +1,667 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Codes are borrowed from
|
| 2 |
+
# https://github.com/ZHKKKe/MODNet/blob/master/src/trainer.py
|
| 3 |
+
# https://github.com/ZHKKKe/MODNet/blob/master/src/models/backbones/mobilenetv2.py
|
| 4 |
+
# https://github.com/ZHKKKe/MODNet/blob/master/src/models/modnet.py
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import scipy
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
import os
|
| 12 |
+
import math
|
| 13 |
+
import torch
|
| 14 |
+
from scipy.ndimage import gaussian_filter
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ----------------------------------------------------------------------------------
|
| 18 |
+
# Loss Functions
|
| 19 |
+
# ----------------------------------------------------------------------------------
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class GaussianBlurLayer(nn.Module):
|
| 23 |
+
""" Add Gaussian Blur to a 4D tensors
|
| 24 |
+
This layer takes a 4D tensor of {N, C, H, W} as input.
|
| 25 |
+
The Gaussian blur will be performed in given channel number (C) splitly.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(self, channels, kernel_size):
|
| 29 |
+
"""
|
| 30 |
+
Arguments:
|
| 31 |
+
channels (int): Channel for input tensor
|
| 32 |
+
kernel_size (int): Size of the kernel used in blurring
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
super(GaussianBlurLayer, self).__init__()
|
| 36 |
+
self.channels = channels
|
| 37 |
+
self.kernel_size = kernel_size
|
| 38 |
+
assert self.kernel_size % 2 != 0
|
| 39 |
+
|
| 40 |
+
self.op = nn.Sequential(
|
| 41 |
+
nn.ReflectionPad2d(math.floor(self.kernel_size / 2)),
|
| 42 |
+
nn.Conv2d(channels, channels, self.kernel_size,
|
| 43 |
+
stride=1, padding=0, bias=None, groups=channels)
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
self._init_kernel()
|
| 47 |
+
|
| 48 |
+
def forward(self, x):
|
| 49 |
+
"""
|
| 50 |
+
Arguments:
|
| 51 |
+
x (torch.Tensor): input 4D tensor
|
| 52 |
+
Returns:
|
| 53 |
+
torch.Tensor: Blurred version of the input
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
if not len(list(x.shape)) == 4:
|
| 57 |
+
print('\'GaussianBlurLayer\' requires a 4D tensor as input\n')
|
| 58 |
+
exit()
|
| 59 |
+
elif not x.shape[1] == self.channels:
|
| 60 |
+
print('In \'GaussianBlurLayer\', the required channel ({0}) is'
|
| 61 |
+
'not the same as input ({1})\n'.format(self.channels, x.shape[1]))
|
| 62 |
+
exit()
|
| 63 |
+
|
| 64 |
+
return self.op(x)
|
| 65 |
+
|
| 66 |
+
def _init_kernel(self):
|
| 67 |
+
sigma = 0.3 * ((self.kernel_size - 1) * 0.5 - 1) + 0.8
|
| 68 |
+
|
| 69 |
+
n = np.zeros((self.kernel_size, self.kernel_size))
|
| 70 |
+
i = math.floor(self.kernel_size / 2)
|
| 71 |
+
n[i, i] = 1
|
| 72 |
+
kernel = gaussian_filter(n, sigma)
|
| 73 |
+
|
| 74 |
+
for name, param in self.named_parameters():
|
| 75 |
+
param.data.copy_(torch.from_numpy(kernel))
|
| 76 |
+
param.requires_grad = False
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
blurer = GaussianBlurLayer(1, 3)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def loss_func(pred_semantic, pred_detail, pred_matte, image, trimap, gt_matte,
|
| 83 |
+
semantic_scale=10.0, detail_scale=10.0, matte_scale=1.0):
|
| 84 |
+
""" loss of MODNet
|
| 85 |
+
Arguments:
|
| 86 |
+
blurer: GaussianBlurLayer
|
| 87 |
+
pred_semantic: model output
|
| 88 |
+
pred_detail: model output
|
| 89 |
+
pred_matte: model output
|
| 90 |
+
image : input RGB image ts pixel values should be normalized
|
| 91 |
+
trimap : trimap used to calculate the losses
|
| 92 |
+
its pixel values can be 0, 0.5, or 1
|
| 93 |
+
(foreground=1, background=0, unknown=0.5)
|
| 94 |
+
gt_matte: ground truth alpha matte its pixel values are between [0, 1]
|
| 95 |
+
semantic_scale (float): scale of the semantic loss
|
| 96 |
+
NOTE: please adjust according to your dataset
|
| 97 |
+
detail_scale (float): scale of the detail loss
|
| 98 |
+
NOTE: please adjust according to your dataset
|
| 99 |
+
matte_scale (float): scale of the matte loss
|
| 100 |
+
NOTE: please adjust according to your dataset
|
| 101 |
+
|
| 102 |
+
Returns:
|
| 103 |
+
semantic_loss (torch.Tensor): loss of the semantic estimation [Low-Resolution (LR) Branch]
|
| 104 |
+
detail_loss (torch.Tensor): loss of the detail prediction [High-Resolution (HR) Branch]
|
| 105 |
+
matte_loss (torch.Tensor): loss of the semantic-detail fusion [Fusion Branch]
|
| 106 |
+
"""
|
| 107 |
+
|
| 108 |
+
trimap = trimap.float()
|
| 109 |
+
# calculate the boundary mask from the trimap
|
| 110 |
+
boundaries = (trimap < 0.5) + (trimap > 0.5)
|
| 111 |
+
|
| 112 |
+
# calculate the semantic loss
|
| 113 |
+
gt_semantic = F.interpolate(gt_matte, scale_factor=1 / 16, mode='bilinear')
|
| 114 |
+
gt_semantic = blurer(gt_semantic)
|
| 115 |
+
semantic_loss = torch.mean(F.mse_loss(pred_semantic, gt_semantic))
|
| 116 |
+
semantic_loss = semantic_scale * semantic_loss
|
| 117 |
+
|
| 118 |
+
# calculate the detail loss
|
| 119 |
+
pred_boundary_detail = torch.where(boundaries, trimap, pred_detail.float())
|
| 120 |
+
gt_detail = torch.where(boundaries, trimap, gt_matte.float())
|
| 121 |
+
detail_loss = torch.mean(F.l1_loss(pred_boundary_detail, gt_detail.float()))
|
| 122 |
+
detail_loss = detail_scale * detail_loss
|
| 123 |
+
|
| 124 |
+
# calculate the matte loss
|
| 125 |
+
pred_boundary_matte = torch.where(boundaries, trimap, pred_matte.float())
|
| 126 |
+
matte_l1_loss = F.l1_loss(pred_matte, gt_matte) + 4.0 * F.l1_loss(pred_boundary_matte, gt_matte)
|
| 127 |
+
matte_compositional_loss = F.l1_loss(image * pred_matte, image * gt_matte) \
|
| 128 |
+
+ 4.0 * F.l1_loss(image * pred_boundary_matte, image * gt_matte)
|
| 129 |
+
matte_loss = torch.mean(matte_l1_loss + matte_compositional_loss)
|
| 130 |
+
matte_loss = matte_scale * matte_loss
|
| 131 |
+
|
| 132 |
+
return semantic_loss, detail_loss, matte_loss
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ------------------------------------------------------------------------------
|
| 136 |
+
# Useful functions
|
| 137 |
+
# ------------------------------------------------------------------------------
|
| 138 |
+
|
| 139 |
+
def _make_divisible(v, divisor, min_value=None):
|
| 140 |
+
if min_value is None:
|
| 141 |
+
min_value = divisor
|
| 142 |
+
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
|
| 143 |
+
# Make sure that round down does not go down by more than 10%.
|
| 144 |
+
if new_v < 0.9 * v:
|
| 145 |
+
new_v += divisor
|
| 146 |
+
return new_v
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def conv_bn(inp, oup, stride):
|
| 150 |
+
return nn.Sequential(
|
| 151 |
+
nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
|
| 152 |
+
nn.BatchNorm2d(oup),
|
| 153 |
+
nn.ReLU6(inplace=True)
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def conv_1x1_bn(inp, oup):
|
| 158 |
+
return nn.Sequential(
|
| 159 |
+
nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
|
| 160 |
+
nn.BatchNorm2d(oup),
|
| 161 |
+
nn.ReLU6(inplace=True)
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# ------------------------------------------------------------------------------
|
| 166 |
+
# Class of Inverted Residual block
|
| 167 |
+
# ------------------------------------------------------------------------------
|
| 168 |
+
|
| 169 |
+
class InvertedResidual(nn.Module):
|
| 170 |
+
def __init__(self, inp, oup, stride, expansion, dilation=1):
|
| 171 |
+
super(InvertedResidual, self).__init__()
|
| 172 |
+
self.stride = stride
|
| 173 |
+
assert stride in [1, 2]
|
| 174 |
+
|
| 175 |
+
hidden_dim = round(inp * expansion)
|
| 176 |
+
self.use_res_connect = self.stride == 1 and inp == oup
|
| 177 |
+
|
| 178 |
+
if expansion == 1:
|
| 179 |
+
self.conv = nn.Sequential(
|
| 180 |
+
# dw
|
| 181 |
+
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False),
|
| 182 |
+
nn.BatchNorm2d(hidden_dim),
|
| 183 |
+
nn.ReLU6(inplace=True),
|
| 184 |
+
# pw-linear
|
| 185 |
+
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
| 186 |
+
nn.BatchNorm2d(oup),
|
| 187 |
+
)
|
| 188 |
+
else:
|
| 189 |
+
self.conv = nn.Sequential(
|
| 190 |
+
# pw
|
| 191 |
+
nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
|
| 192 |
+
nn.BatchNorm2d(hidden_dim),
|
| 193 |
+
nn.ReLU6(inplace=True),
|
| 194 |
+
# dw
|
| 195 |
+
nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, dilation=dilation, bias=False),
|
| 196 |
+
nn.BatchNorm2d(hidden_dim),
|
| 197 |
+
nn.ReLU6(inplace=True),
|
| 198 |
+
# pw-linear
|
| 199 |
+
nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
|
| 200 |
+
nn.BatchNorm2d(oup),
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
def forward(self, x):
|
| 204 |
+
if self.use_res_connect:
|
| 205 |
+
return x + self.conv(x)
|
| 206 |
+
else:
|
| 207 |
+
return self.conv(x)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
# ------------------------------------------------------------------------------
|
| 211 |
+
# Class of MobileNetV2
|
| 212 |
+
# ------------------------------------------------------------------------------
|
| 213 |
+
|
| 214 |
+
class MobileNetV2(nn.Module):
|
| 215 |
+
def __init__(self, in_channels, alpha=1.0, expansion=6, num_classes=1000):
|
| 216 |
+
super(MobileNetV2, self).__init__()
|
| 217 |
+
self.in_channels = in_channels
|
| 218 |
+
self.num_classes = num_classes
|
| 219 |
+
input_channel = 32
|
| 220 |
+
last_channel = 1280
|
| 221 |
+
interverted_residual_setting = [
|
| 222 |
+
# t, c, n, s
|
| 223 |
+
[1, 16, 1, 1],
|
| 224 |
+
[expansion, 24, 2, 2],
|
| 225 |
+
[expansion, 32, 3, 2],
|
| 226 |
+
[expansion, 64, 4, 2],
|
| 227 |
+
[expansion, 96, 3, 1],
|
| 228 |
+
[expansion, 160, 3, 2],
|
| 229 |
+
[expansion, 320, 1, 1],
|
| 230 |
+
]
|
| 231 |
+
|
| 232 |
+
# building first layer
|
| 233 |
+
input_channel = _make_divisible(input_channel * alpha, 8)
|
| 234 |
+
self.last_channel = _make_divisible(last_channel * alpha, 8) if alpha > 1.0 else last_channel
|
| 235 |
+
self.features = [conv_bn(self.in_channels, input_channel, 2)]
|
| 236 |
+
|
| 237 |
+
# building inverted residual blocks
|
| 238 |
+
for t, c, n, s in interverted_residual_setting:
|
| 239 |
+
output_channel = _make_divisible(int(c * alpha), 8)
|
| 240 |
+
for i in range(n):
|
| 241 |
+
if i == 0:
|
| 242 |
+
self.features.append(InvertedResidual(input_channel, output_channel, s, expansion=t))
|
| 243 |
+
else:
|
| 244 |
+
self.features.append(InvertedResidual(input_channel, output_channel, 1, expansion=t))
|
| 245 |
+
input_channel = output_channel
|
| 246 |
+
|
| 247 |
+
# building last several layers
|
| 248 |
+
self.features.append(conv_1x1_bn(input_channel, self.last_channel))
|
| 249 |
+
|
| 250 |
+
# make it nn.Sequential
|
| 251 |
+
self.features = nn.Sequential(*self.features)
|
| 252 |
+
|
| 253 |
+
# building classifier
|
| 254 |
+
if self.num_classes is not None:
|
| 255 |
+
self.classifier = nn.Sequential(
|
| 256 |
+
nn.Dropout(0.2),
|
| 257 |
+
nn.Linear(self.last_channel, num_classes),
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
# Initialize weights
|
| 261 |
+
self._init_weights()
|
| 262 |
+
|
| 263 |
+
def forward(self, x):
|
| 264 |
+
# Stage1
|
| 265 |
+
x = self.features[0](x)
|
| 266 |
+
x = self.features[1](x)
|
| 267 |
+
# Stage2
|
| 268 |
+
x = self.features[2](x)
|
| 269 |
+
x = self.features[3](x)
|
| 270 |
+
# Stage3
|
| 271 |
+
x = self.features[4](x)
|
| 272 |
+
x = self.features[5](x)
|
| 273 |
+
x = self.features[6](x)
|
| 274 |
+
# Stage4
|
| 275 |
+
x = self.features[7](x)
|
| 276 |
+
x = self.features[8](x)
|
| 277 |
+
x = self.features[9](x)
|
| 278 |
+
x = self.features[10](x)
|
| 279 |
+
x = self.features[11](x)
|
| 280 |
+
x = self.features[12](x)
|
| 281 |
+
x = self.features[13](x)
|
| 282 |
+
# Stage5
|
| 283 |
+
x = self.features[14](x)
|
| 284 |
+
x = self.features[15](x)
|
| 285 |
+
x = self.features[16](x)
|
| 286 |
+
x = self.features[17](x)
|
| 287 |
+
x = self.features[18](x)
|
| 288 |
+
|
| 289 |
+
# Classification
|
| 290 |
+
if self.num_classes is not None:
|
| 291 |
+
x = x.mean(dim=(2, 3))
|
| 292 |
+
x = self.classifier(x)
|
| 293 |
+
|
| 294 |
+
# Output
|
| 295 |
+
return x
|
| 296 |
+
|
| 297 |
+
def _load_pretrained_model(self, pretrained_file):
|
| 298 |
+
pretrain_dict = torch.load(pretrained_file, map_location='cpu')
|
| 299 |
+
model_dict = {}
|
| 300 |
+
state_dict = self.state_dict()
|
| 301 |
+
print("[MobileNetV2] Loading pretrained model...")
|
| 302 |
+
for k, v in pretrain_dict.items():
|
| 303 |
+
if k in state_dict:
|
| 304 |
+
model_dict[k] = v
|
| 305 |
+
else:
|
| 306 |
+
print(k, "is ignored")
|
| 307 |
+
state_dict.update(model_dict)
|
| 308 |
+
self.load_state_dict(state_dict)
|
| 309 |
+
|
| 310 |
+
def _init_weights(self):
|
| 311 |
+
for m in self.modules():
|
| 312 |
+
if isinstance(m, nn.Conv2d):
|
| 313 |
+
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
| 314 |
+
m.weight.data.normal_(0, math.sqrt(2. / n))
|
| 315 |
+
if m.bias is not None:
|
| 316 |
+
m.bias.data.zero_()
|
| 317 |
+
elif isinstance(m, nn.BatchNorm2d):
|
| 318 |
+
m.weight.data.fill_(1)
|
| 319 |
+
m.bias.data.zero_()
|
| 320 |
+
elif isinstance(m, nn.Linear):
|
| 321 |
+
n = m.weight.size(1)
|
| 322 |
+
m.weight.data.normal_(0, 0.01)
|
| 323 |
+
m.bias.data.zero_()
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
class BaseBackbone(nn.Module):
|
| 327 |
+
""" Superclass of Replaceable Backbone Model for Semantic Estimation
|
| 328 |
+
"""
|
| 329 |
+
|
| 330 |
+
def __init__(self, in_channels):
|
| 331 |
+
super(BaseBackbone, self).__init__()
|
| 332 |
+
self.in_channels = in_channels
|
| 333 |
+
|
| 334 |
+
self.model = None
|
| 335 |
+
self.enc_channels = []
|
| 336 |
+
|
| 337 |
+
def forward(self, x):
|
| 338 |
+
raise NotImplementedError
|
| 339 |
+
|
| 340 |
+
def load_pretrained_ckpt(self):
|
| 341 |
+
raise NotImplementedError
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
class MobileNetV2Backbone(BaseBackbone):
|
| 345 |
+
""" MobileNetV2 Backbone
|
| 346 |
+
"""
|
| 347 |
+
|
| 348 |
+
def __init__(self, in_channels):
|
| 349 |
+
super(MobileNetV2Backbone, self).__init__(in_channels)
|
| 350 |
+
|
| 351 |
+
self.model = MobileNetV2(self.in_channels, alpha=1.0, expansion=6, num_classes=None)
|
| 352 |
+
self.enc_channels = [16, 24, 32, 96, 1280]
|
| 353 |
+
|
| 354 |
+
def forward(self, x):
|
| 355 |
+
# x = reduce(lambda x, n: self.model.features[n](x), list(range(0, 2)), x)
|
| 356 |
+
x = self.model.features[0](x)
|
| 357 |
+
x = self.model.features[1](x)
|
| 358 |
+
enc2x = x
|
| 359 |
+
|
| 360 |
+
# x = reduce(lambda x, n: self.model.features[n](x), list(range(2, 4)), x)
|
| 361 |
+
x = self.model.features[2](x)
|
| 362 |
+
x = self.model.features[3](x)
|
| 363 |
+
enc4x = x
|
| 364 |
+
|
| 365 |
+
# x = reduce(lambda x, n: self.model.features[n](x), list(range(4, 7)), x)
|
| 366 |
+
x = self.model.features[4](x)
|
| 367 |
+
x = self.model.features[5](x)
|
| 368 |
+
x = self.model.features[6](x)
|
| 369 |
+
enc8x = x
|
| 370 |
+
|
| 371 |
+
# x = reduce(lambda x, n: self.model.features[n](x), list(range(7, 14)), x)
|
| 372 |
+
x = self.model.features[7](x)
|
| 373 |
+
x = self.model.features[8](x)
|
| 374 |
+
x = self.model.features[9](x)
|
| 375 |
+
x = self.model.features[10](x)
|
| 376 |
+
x = self.model.features[11](x)
|
| 377 |
+
x = self.model.features[12](x)
|
| 378 |
+
x = self.model.features[13](x)
|
| 379 |
+
enc16x = x
|
| 380 |
+
|
| 381 |
+
# x = reduce(lambda x, n: self.model.features[n](x), list(range(14, 19)), x)
|
| 382 |
+
x = self.model.features[14](x)
|
| 383 |
+
x = self.model.features[15](x)
|
| 384 |
+
x = self.model.features[16](x)
|
| 385 |
+
x = self.model.features[17](x)
|
| 386 |
+
x = self.model.features[18](x)
|
| 387 |
+
enc32x = x
|
| 388 |
+
return [enc2x, enc4x, enc8x, enc16x, enc32x]
|
| 389 |
+
|
| 390 |
+
def load_pretrained_ckpt(self):
|
| 391 |
+
# the pre-trained model is provided by https://github.com/thuyngch/Human-Segmentation-PyTorch
|
| 392 |
+
ckpt_path = './pretrained/mobilenetv2_human_seg.ckpt'
|
| 393 |
+
if not os.path.exists(ckpt_path):
|
| 394 |
+
print('cannot find the pretrained mobilenetv2 backbone')
|
| 395 |
+
exit()
|
| 396 |
+
|
| 397 |
+
ckpt = torch.load(ckpt_path)
|
| 398 |
+
self.model.load_state_dict(ckpt)
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
SUPPORTED_BACKBONES = {
|
| 402 |
+
'mobilenetv2': MobileNetV2Backbone,
|
| 403 |
+
}
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
# ------------------------------------------------------------------------------
|
| 407 |
+
# MODNet Basic Modules
|
| 408 |
+
# ------------------------------------------------------------------------------
|
| 409 |
+
|
| 410 |
+
class IBNorm(nn.Module):
|
| 411 |
+
""" Combine Instance Norm and Batch Norm into One Layer
|
| 412 |
+
"""
|
| 413 |
+
|
| 414 |
+
def __init__(self, in_channels):
|
| 415 |
+
super(IBNorm, self).__init__()
|
| 416 |
+
in_channels = in_channels
|
| 417 |
+
self.bnorm_channels = int(in_channels / 2)
|
| 418 |
+
self.inorm_channels = in_channels - self.bnorm_channels
|
| 419 |
+
|
| 420 |
+
self.bnorm = nn.BatchNorm2d(self.bnorm_channels, affine=True)
|
| 421 |
+
self.inorm = nn.InstanceNorm2d(self.inorm_channels, affine=False)
|
| 422 |
+
|
| 423 |
+
def forward(self, x):
|
| 424 |
+
bn_x = self.bnorm(x[:, :self.bnorm_channels, ...].contiguous())
|
| 425 |
+
in_x = self.inorm(x[:, self.bnorm_channels:, ...].contiguous())
|
| 426 |
+
|
| 427 |
+
return torch.cat((bn_x, in_x), 1)
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
class Conv2dIBNormRelu(nn.Module):
|
| 431 |
+
""" Convolution + IBNorm + ReLu
|
| 432 |
+
"""
|
| 433 |
+
|
| 434 |
+
def __init__(self, in_channels, out_channels, kernel_size,
|
| 435 |
+
stride=1, padding=0, dilation=1, groups=1, bias=True,
|
| 436 |
+
with_ibn=True, with_relu=True):
|
| 437 |
+
super(Conv2dIBNormRelu, self).__init__()
|
| 438 |
+
|
| 439 |
+
layers = [
|
| 440 |
+
nn.Conv2d(in_channels, out_channels, kernel_size,
|
| 441 |
+
stride=stride, padding=padding, dilation=dilation,
|
| 442 |
+
groups=groups, bias=bias)
|
| 443 |
+
]
|
| 444 |
+
|
| 445 |
+
if with_ibn:
|
| 446 |
+
layers.append(IBNorm(out_channels))
|
| 447 |
+
if with_relu:
|
| 448 |
+
layers.append(nn.ReLU(inplace=True))
|
| 449 |
+
|
| 450 |
+
self.layers = nn.Sequential(*layers)
|
| 451 |
+
|
| 452 |
+
def forward(self, x):
|
| 453 |
+
return self.layers(x)
|
| 454 |
+
|
| 455 |
+
|
| 456 |
+
class SEBlock(nn.Module):
|
| 457 |
+
""" SE Block Proposed in https://arxiv.org/pdf/1709.01507.pdf
|
| 458 |
+
"""
|
| 459 |
+
|
| 460 |
+
def __init__(self, in_channels, out_channels, reduction=1):
|
| 461 |
+
super(SEBlock, self).__init__()
|
| 462 |
+
self.pool = nn.AdaptiveAvgPool2d(1)
|
| 463 |
+
self.fc = nn.Sequential(
|
| 464 |
+
nn.Linear(in_channels, int(in_channels // reduction), bias=False),
|
| 465 |
+
nn.ReLU(inplace=True),
|
| 466 |
+
nn.Linear(int(in_channels // reduction), out_channels, bias=False),
|
| 467 |
+
nn.Sigmoid()
|
| 468 |
+
)
|
| 469 |
+
|
| 470 |
+
def forward(self, x):
|
| 471 |
+
b, c, _, _ = x.size()
|
| 472 |
+
w = self.pool(x).view(b, c)
|
| 473 |
+
w = self.fc(w).view(b, c, 1, 1)
|
| 474 |
+
|
| 475 |
+
return x * w.expand_as(x)
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# ------------------------------------------------------------------------------
# MODNet Branches
# ------------------------------------------------------------------------------

class LRBranch(nn.Module):
    """ Low Resolution Branch of MODNet
    """

    def __init__(self, backbone):
        super(LRBranch, self).__init__()

        enc_channels = backbone.enc_channels

        self.backbone = backbone
        self.se_block = SEBlock(enc_channels[4], enc_channels[4], reduction=4)
        self.conv_lr16x = Conv2dIBNormRelu(enc_channels[4], enc_channels[3], 5, stride=1, padding=2)
        self.conv_lr8x = Conv2dIBNormRelu(enc_channels[3], enc_channels[2], 5, stride=1, padding=2)
        self.conv_lr = Conv2dIBNormRelu(enc_channels[2], 1, kernel_size=3, stride=2, padding=1, with_ibn=False,
                                        with_relu=False)

    def forward(self, img, inference):
        enc_features = self.backbone.forward(img)
        enc2x, enc4x, enc32x = enc_features[0], enc_features[1], enc_features[4]

        enc32x = self.se_block(enc32x)
        lr16x = F.interpolate(enc32x, scale_factor=2, mode='bilinear', align_corners=False)
        lr16x = self.conv_lr16x(lr16x)
        lr8x = F.interpolate(lr16x, scale_factor=2, mode='bilinear', align_corners=False)
        lr8x = self.conv_lr8x(lr8x)

        pred_semantic = None
        if not inference:
            lr = self.conv_lr(lr8x)
            pred_semantic = torch.sigmoid(lr)

        return pred_semantic, lr8x, [enc2x, enc4x]


class HRBranch(nn.Module):
    """ High Resolution Branch of MODNet
    """

    def __init__(self, hr_channels, enc_channels):
        super(HRBranch, self).__init__()

        self.tohr_enc2x = Conv2dIBNormRelu(enc_channels[0], hr_channels, 1, stride=1, padding=0)
        self.conv_enc2x = Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=2, padding=1)

        self.tohr_enc4x = Conv2dIBNormRelu(enc_channels[1], hr_channels, 1, stride=1, padding=0)
        self.conv_enc4x = Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1)

        self.conv_hr4x = nn.Sequential(
            Conv2dIBNormRelu(3 * hr_channels + 3, 2 * hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1),
        )

        self.conv_hr2x = nn.Sequential(
            Conv2dIBNormRelu(2 * hr_channels, 2 * hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(hr_channels, hr_channels, 3, stride=1, padding=1),
        )

        self.conv_hr = nn.Sequential(
            Conv2dIBNormRelu(hr_channels + 3, hr_channels, 3, stride=1, padding=1),
            Conv2dIBNormRelu(hr_channels, 1, kernel_size=1, stride=1, padding=0, with_ibn=False, with_relu=False),
        )

    def forward(self, img, enc2x, enc4x, lr8x, inference):
        img2x = F.interpolate(img, scale_factor=1 / 2, mode='bilinear', align_corners=False)
        img4x = F.interpolate(img, scale_factor=1 / 4, mode='bilinear', align_corners=False)

        enc2x = self.tohr_enc2x(enc2x)
        hr4x = self.conv_enc2x(torch.cat((img2x, enc2x), dim=1))

        enc4x = self.tohr_enc4x(enc4x)
        hr4x = self.conv_enc4x(torch.cat((hr4x, enc4x), dim=1))

        lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False)
        hr4x = self.conv_hr4x(torch.cat((hr4x, lr4x, img4x), dim=1))

        hr2x = F.interpolate(hr4x, scale_factor=2, mode='bilinear', align_corners=False)
        hr2x = self.conv_hr2x(torch.cat((hr2x, enc2x), dim=1))

        pred_detail = None
        if not inference:
            hr = F.interpolate(hr2x, scale_factor=2, mode='bilinear', align_corners=False)
            hr = self.conv_hr(torch.cat((hr, img), dim=1))
            pred_detail = torch.sigmoid(hr)

        return pred_detail, hr2x


class FusionBranch(nn.Module):
    """ Fusion Branch of MODNet
    """

    def __init__(self, hr_channels, enc_channels):
        super(FusionBranch, self).__init__()
        self.conv_lr4x = Conv2dIBNormRelu(enc_channels[2], hr_channels, 5, stride=1, padding=2)

        self.conv_f2x = Conv2dIBNormRelu(2 * hr_channels, hr_channels, 3, stride=1, padding=1)
        self.conv_f = nn.Sequential(
            Conv2dIBNormRelu(hr_channels + 3, int(hr_channels / 2), 3, stride=1, padding=1),
            Conv2dIBNormRelu(int(hr_channels / 2), 1, 1, stride=1, padding=0, with_ibn=False, with_relu=False),
        )

    def forward(self, img, lr8x, hr2x):
        lr4x = F.interpolate(lr8x, scale_factor=2, mode='bilinear', align_corners=False)
        lr4x = self.conv_lr4x(lr4x)
        lr2x = F.interpolate(lr4x, scale_factor=2, mode='bilinear', align_corners=False)

        f2x = self.conv_f2x(torch.cat((lr2x, hr2x), dim=1))
        f = F.interpolate(f2x, scale_factor=2, mode='bilinear', align_corners=False)
        f = self.conv_f(torch.cat((f, img), dim=1))
        pred_matte = torch.sigmoid(f)

        return pred_matte
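Taken together, the three branches implement MODNet's decomposition of matting: `LRBranch` squeezes the backbone's 1/32-scale features through the SE block and upsamples them to 1/8 scale (emitting a coarse semantic map during training), `HRBranch` refines boundary detail around 1/4 and 1/2 scale using encoder skip connections, and `FusionBranch` upsamples and merges the two streams into the final alpha matte. The `inference` flag only disables the two auxiliary heads (`pred_semantic`, `pred_detail`), which exist solely as training supervision.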
# ------------------------------------------------------------------------------
# MODNet
# ------------------------------------------------------------------------------

class MODNet(nn.Module):
    """ Architecture of MODNet
    """

    def __init__(self, in_channels=3, hr_channels=32, backbone_arch='mobilenetv2', backbone_pretrained=False):
        super(MODNet, self).__init__()

        self.in_channels = in_channels
        self.hr_channels = hr_channels
        self.backbone_arch = backbone_arch
        self.backbone_pretrained = backbone_pretrained

        self.backbone = SUPPORTED_BACKBONES[self.backbone_arch](self.in_channels)

        self.lr_branch = LRBranch(self.backbone)
        self.hr_branch = HRBranch(self.hr_channels, self.backbone.enc_channels)
        self.f_branch = FusionBranch(self.hr_channels, self.backbone.enc_channels)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                self._init_conv(m)
            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.InstanceNorm2d):
                self._init_norm(m)

        if self.backbone_pretrained:
            self.backbone.load_pretrained_ckpt()

    def forward(self, img, inference):
        pred_semantic, lr8x, [enc2x, enc4x] = self.lr_branch(img, inference)
        pred_detail, hr2x = self.hr_branch(img, enc2x, enc4x, lr8x, inference)
        pred_matte = self.f_branch(img, lr8x, hr2x)

        return pred_semantic, pred_detail, pred_matte

    @staticmethod
    def compute_loss(args):
        pred_semantic, pred_detail, pred_matte, image, trimap, gt_matte = args
        semantic_loss, detail_loss, matte_loss = loss_func(pred_semantic, pred_detail, pred_matte,
                                                           image, trimap, gt_matte)
        loss = semantic_loss + detail_loss + matte_loss
        return matte_loss, loss

    def freeze_norm(self):
        norm_types = [nn.BatchNorm2d, nn.InstanceNorm2d]
        for m in self.modules():
            for n in norm_types:
                if isinstance(m, n):
                    m.eval()
                    continue

    def _init_conv(self, conv):
        nn.init.kaiming_uniform_(
            conv.weight, a=0, mode='fan_in', nonlinearity='relu')
        if conv.bias is not None:
            nn.init.constant_(conv.bias, 0)

    def _init_norm(self, norm):
        if norm.weight is not None:
            nn.init.constant_(norm.weight, 1)
            nn.init.constant_(norm.bias, 0)

    def _apply(self, fn):
        super(MODNet, self)._apply(fn)
        blurer._apply(fn)  # keep blurer on the same device as MODNet
        return self
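A minimal forward-pass sketch (illustrative, not part of the upload; it assumes `SUPPORTED_BACKBONES`, defined earlier in this file, provides the default `'mobilenetv2'` entry):

```python
import torch
from model.modnet import MODNet  # import path assumed from this upload

net = MODNet(backbone_arch='mobilenetv2', backbone_pretrained=False).eval()
img = torch.randn(1, 3, 512, 512)  # H and W should be multiples of 32

with torch.no_grad():
    pred_semantic, pred_detail, pred_matte = net(img, inference=True)

# In inference mode the auxiliary heads are skipped:
print(pred_semantic, pred_detail)  # None None
print(pred_matte.shape)            # torch.Size([1, 1, 512, 512])
```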
model/u2net.py
ADDED
@@ -0,0 +1,228 @@
# Codes are borrowed from
# https://github.com/xuebinqin/U-2-Net/blob/master/model/u2net_refactor.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

__all__ = ['U2NET_full', 'U2NET_full2', 'U2NET_lite', 'U2NET_lite2', "U2NET"]

bce_loss = nn.BCEWithLogitsLoss(reduction='mean')


def _upsample_like(x, size):
    return F.interpolate(x, size=size, mode='bilinear', align_corners=False)


def _size_map(x, height):
    # {height: size} for Upsample
    size = list(x.shape[-2:])
    sizes = {}
    for h in range(1, height):
        sizes[h] = size
        size = [math.ceil(w / 2) for w in size]
    return sizes


class REBNCONV(nn.Module):
    def __init__(self, in_ch=3, out_ch=3, dilate=1):
        super(REBNCONV, self).__init__()

        self.conv_s1 = nn.Conv2d(in_ch, out_ch, 3, padding=1 * dilate, dilation=1 * dilate)
        self.bn_s1 = nn.BatchNorm2d(out_ch)
        self.relu_s1 = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu_s1(self.bn_s1(self.conv_s1(x)))


class RSU(nn.Module):
    def __init__(self, name, height, in_ch, mid_ch, out_ch, dilated=False):
        super(RSU, self).__init__()
        self.name = name
        self.height = height
        self.dilated = dilated
        self._make_layers(height, in_ch, mid_ch, out_ch, dilated)

    def forward(self, x):
        sizes = _size_map(x, self.height)
        x = self.rebnconvin(x)

        # U-Net like symmetric encoder-decoder structure
        def unet(x, height=1):
            if height < self.height:
                x1 = getattr(self, f'rebnconv{height}')(x)
                if not self.dilated and height < self.height - 1:
                    x2 = unet(getattr(self, 'downsample')(x1), height + 1)
                else:
                    x2 = unet(x1, height + 1)

                x = getattr(self, f'rebnconv{height}d')(torch.cat((x2, x1), 1))
                return _upsample_like(x, sizes[height - 1]) if not self.dilated and height > 1 else x
            else:
                return getattr(self, f'rebnconv{height}')(x)

        return x + unet(x)

    def _make_layers(self, height, in_ch, mid_ch, out_ch, dilated=False):
        self.add_module('rebnconvin', REBNCONV(in_ch, out_ch))
        self.add_module('downsample', nn.MaxPool2d(2, stride=2, ceil_mode=True))

        self.add_module(f'rebnconv1', REBNCONV(out_ch, mid_ch))
        self.add_module(f'rebnconv1d', REBNCONV(mid_ch * 2, out_ch))

        for i in range(2, height):
            dilate = 1 if not dilated else 2 ** (i - 1)
            self.add_module(f'rebnconv{i}', REBNCONV(mid_ch, mid_ch, dilate=dilate))
            self.add_module(f'rebnconv{i}d', REBNCONV(mid_ch * 2, mid_ch, dilate=dilate))

        dilate = 2 if not dilated else 2 ** (height - 1)
        self.add_module(f'rebnconv{height}', REBNCONV(mid_ch, mid_ch, dilate=dilate))

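The recursive `unet` closure above walks down `height` levels, pooling on the way in and concatenating the skip `x1` with the decoded `x2` on the way out, and the block adds the result back to its input projection, making each RSU a small residual U-Net. A quick illustrative shape check (not part of the upload; import path assumed):

```python
import torch
from model.u2net import RSU  # import path assumed from this upload

rsu = RSU('En_1', height=7, in_ch=3, mid_ch=32, out_ch=64)
x = torch.randn(1, 3, 320, 320)
print(rsu(x).shape)  # torch.Size([1, 64, 320, 320]) -- spatial size preserved
```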
class U2NET(nn.Module):
    def __init__(self, cfgs, out_ch):
        super(U2NET, self).__init__()
        self.out_ch = out_ch
        self._make_layers(cfgs)

    def forward(self, x):
        sizes = _size_map(x, self.height)
        maps = []  # storage for maps

        # side saliency map
        def unet(x, height=1):
            if height < 6:
                x1 = getattr(self, f'stage{height}')(x)
                x2 = unet(getattr(self, 'downsample')(x1), height + 1)
                x = getattr(self, f'stage{height}d')(torch.cat((x2, x1), 1))
                side(x, height)
                return _upsample_like(x, sizes[height - 1]) if height > 1 else x
            else:
                x = getattr(self, f'stage{height}')(x)
                side(x, height)
                return _upsample_like(x, sizes[height - 1])

        def side(x, h):
            # side output saliency map (before sigmoid)
            x = getattr(self, f'side{h}')(x)
            x = _upsample_like(x, sizes[1])
            maps.append(x)

        def fuse():
            # fuse saliency probability maps
            maps.reverse()
            x = torch.cat(maps, 1)
            x = getattr(self, 'outconv')(x)
            maps.insert(0, x)
            # return [torch.sigmoid(x) for x in maps]
            return [x for x in maps]

        unet(x)
        maps = fuse()
        return maps

    @staticmethod
    def compute_loss(args):
        preds, labels_v = args
        d0, d1, d2, d3, d4, d5, d6 = preds
        loss0 = bce_loss(d0, labels_v)
        loss1 = bce_loss(d1, labels_v)
        loss2 = bce_loss(d2, labels_v)
        loss3 = bce_loss(d3, labels_v)
        loss4 = bce_loss(d4, labels_v)
        loss5 = bce_loss(d5, labels_v)
        loss6 = bce_loss(d6, labels_v)

        loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6

        return loss0, loss

    def _make_layers(self, cfgs):
        self.height = int((len(cfgs) + 1) / 2)
        self.add_module('downsample', nn.MaxPool2d(2, stride=2, ceil_mode=True))
        for k, v in cfgs.items():
            # build rsu block
            self.add_module(k, RSU(v[0], *v[1]))
            if v[2] > 0:
                # build side layer
                self.add_module(f'side{v[0][-1]}', nn.Conv2d(v[2], self.out_ch, 3, padding=1))
        # build fuse layer
        self.add_module('outconv', nn.Conv2d(int(self.height * self.out_ch), self.out_ch, 1))


def U2NET_full():
    full = {
        # cfgs for building RSUs and sides
        # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
        'stage1': ['En_1', (7, 3, 32, 64), -1],
        'stage2': ['En_2', (6, 64, 32, 128), -1],
        'stage3': ['En_3', (5, 128, 64, 256), -1],
        'stage4': ['En_4', (4, 256, 128, 512), -1],
        'stage5': ['En_5', (4, 512, 256, 512, True), -1],
        'stage6': ['En_6', (4, 512, 256, 512, True), 512],
        'stage5d': ['De_5', (4, 1024, 256, 512, True), 512],
        'stage4d': ['De_4', (4, 1024, 128, 256), 256],
        'stage3d': ['De_3', (5, 512, 64, 128), 128],
        'stage2d': ['De_2', (6, 256, 32, 64), 64],
        'stage1d': ['De_1', (7, 128, 16, 64), 64],
    }
    return U2NET(cfgs=full, out_ch=1)


def U2NET_full2():
    full = {
        # cfgs for building RSUs and sides
        # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
        'stage1': ['En_1', (8, 3, 32, 64), -1],
        'stage2': ['En_2', (7, 64, 32, 128), -1],
        'stage3': ['En_3', (6, 128, 64, 256), -1],
        'stage4': ['En_4', (5, 256, 128, 512), -1],
        'stage5': ['En_5', (5, 512, 256, 512, True), -1],
        'stage6': ['En_6', (5, 512, 256, 512, True), 512],
        'stage5d': ['De_5', (5, 1024, 256, 512, True), 512],
        'stage4d': ['De_4', (5, 1024, 128, 256), 256],
        'stage3d': ['De_3', (6, 512, 64, 128), 128],
        'stage2d': ['De_2', (7, 256, 32, 64), 64],
        'stage1d': ['De_1', (8, 128, 16, 64), 64],
    }
    return U2NET(cfgs=full, out_ch=1)


def U2NET_lite():
    lite = {
        # cfgs for building RSUs and sides
        # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
        'stage1': ['En_1', (7, 3, 16, 64), -1],
        'stage2': ['En_2', (6, 64, 16, 64), -1],
        'stage3': ['En_3', (5, 64, 16, 64), -1],
        'stage4': ['En_4', (4, 64, 16, 64), -1],
        'stage5': ['En_5', (4, 64, 16, 64, True), -1],
        'stage6': ['En_6', (4, 64, 16, 64, True), 64],
        'stage5d': ['De_5', (4, 128, 16, 64, True), 64],
        'stage4d': ['De_4', (4, 128, 16, 64), 64],
        'stage3d': ['De_3', (5, 128, 16, 64), 64],
        'stage2d': ['De_2', (6, 128, 16, 64), 64],
        'stage1d': ['De_1', (7, 128, 16, 64), 64],
    }
    return U2NET(cfgs=lite, out_ch=1)


def U2NET_lite2():
    lite = {
        # cfgs for building RSUs and sides
        # {stage : [name, (height(L), in_ch, mid_ch, out_ch, dilated), side]}
        'stage1': ['En_1', (8, 3, 16, 64), -1],
        'stage2': ['En_2', (7, 64, 16, 64), -1],
        'stage3': ['En_3', (6, 64, 16, 64), -1],
        'stage4': ['En_4', (5, 64, 16, 64), -1],
        'stage5': ['En_5', (5, 64, 16, 64, True), -1],
        'stage6': ['En_6', (5, 64, 16, 64, True), 64],
        'stage5d': ['De_5', (5, 128, 16, 64, True), 64],
        'stage4d': ['De_4', (5, 128, 16, 64), 64],
        'stage3d': ['De_3', (6, 128, 16, 64), 64],
        'stage2d': ['De_2', (7, 128, 16, 64), 64],
        'stage1d': ['De_1', (8, 128, 16, 64), 64],
    }
    return U2NET(cfgs=lite, out_ch=1)
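As with the MODNet sketch above, a hedged usage example (not part of the upload): the forward pass returns seven logit maps, the fused map `d0` first and the six side outputs after it, all upsampled to input resolution. Note the commented-out sigmoid in `fuse`, so callers must apply it themselves:

```python
import torch
from model.u2net import U2NET_full2  # import path assumed from this upload

net = U2NET_full2().eval()
x = torch.randn(1, 3, 320, 320)
with torch.no_grad():
    maps = net(x)

print(len(maps))                     # 7: fused d0 plus side outputs d1..d6
print(torch.sigmoid(maps[0]).shape)  # torch.Size([1, 1, 320, 320])
```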
requirements.txt
ADDED
@@ -0,0 +1,3 @@
opencv-python
pytorch_lightning
torchvision