NoelShin commited on
Commit
82d5d16
·
1 Parent(s): b8f8de8

Add application file

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="9fb9e207-fc4f-4ff3-9adc-3c4c1e67daa7" name="Changes" comment="">
8
+ <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
9
+ </list>
10
+ <option name="SHOW_DIALOG" value="false" />
11
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
12
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
13
+ <option name="LAST_RESOLUTION" value="IGNORE" />
14
+ </component>
15
+ <component name="Git.Settings">
16
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
17
+ </component>
18
+ <component name="ProjectId" id="2FJJUIvRiY0OO5Dz2zvs0pNxkhb" />
19
+ <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
20
+ <component name="ProjectViewState">
21
+ <option name="hideEmptyMiddlePackages" value="true" />
22
+ <option name="showLibraryContents" value="true" />
23
+ <option name="showMembers" value="true" />
24
+ </component>
25
+ <component name="PropertiesComponent">
26
+ <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
27
+ <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
28
+ <property name="WebServerToolWindowFactoryState" value="false" />
29
+ <property name="last_opened_file_path" value="$PROJECT_DIR$" />
30
+ <property name="node.js.detected.package.eslint" value="true" />
31
+ <property name="node.js.detected.package.tslint" value="true" />
32
+ <property name="node.js.selected.package.eslint" value="(autodetect)" />
33
+ <property name="node.js.selected.package.tslint" value="(autodetect)" />
34
+ <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
35
+ </component>
36
+ <component name="RecentsManager">
37
+ <key name="CopyFile.RECENT_KEYS">
38
+ <recent name="$PROJECT_DIR$" />
39
+ </key>
40
+ </component>
41
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
42
+ <component name="TaskManager">
43
+ <task active="true" id="Default" summary="Default task">
44
+ <changelist id="9fb9e207-fc4f-4ff3-9adc-3c4c1e67daa7" name="Changes" comment="" />
45
+ <created>1664204268713</created>
46
+ <option name="number" value="Default" />
47
+ <option name="presentableId" value="Default" />
48
+ <updated>1664204268713</updated>
49
+ <workItem from="1664204270261" duration="37000" />
50
+ <workItem from="1664204316867" duration="4389000" />
51
+ </task>
52
+ <servers />
53
+ </component>
54
+ <component name="TypeScriptGeneratedFilesManager">
55
+ <option name="version" value="3" />
56
+ </component>
57
+ </project>
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Namedmask
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.4
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: namedmask
3
+ emoji: 😷
4
+ colorFrom: gray
5
+ colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 2.9.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser, Namespace
2
+ from typing import Dict, List, Tuple
3
+ import codecs
4
+ import yaml
5
+ import numpy as np
6
+ import cv2
7
+ from PIL import Image
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torchvision.transforms.functional import to_tensor, normalize, resize
11
+ import gradio as gr
12
+ from utils import get_network, colourise_mask
13
+ import os
14
+
15
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): pretrained NamedMask weights are intentionally not loaded;
# the torch.hub.load_state_dict_from_url call is kept for reference.
# state_dict: dict = torch.hub.load_state_dict_from_url(
#     "https://www.robots.ox.ac.uk/~vgg/research/namedmask/shared_files/voc2012/namedmask_voc2012.pt",
#     map_location=device
# )["model"]

parser = ArgumentParser("NamedMask demo")
parser.add_argument(
    "--config",
    type=str,
    default="voc_val_n500_cp2_ex.yaml"
)

args: Namespace = parser.parse_args()
# Merge CLI options with the YAML config. Fix: close the config file handle
# (the original used a bare open() whose handle was never closed).
with open(args.config, 'r') as f:
    base_args: dict = yaml.safe_load(f)
base_args.pop("dataset_name")
args: dict = vars(args)
args.update(base_args)
args: Namespace = Namespace(**args)

model = get_network().to(device)
# model.load_state_dict(state_dict)  # weights intentionally not loaded (see above)
model.eval()

# Preprocessing constants: shorter-side resize to 384 (capped at 512) and
# ImageNet normalisation statistics.
size: int = 384
max_size: int = 512
mean: Tuple[float, float, float] = (0.485, 0.456, 0.406)
std: Tuple[float, float, float] = (0.229, 0.224, 0.225)
45
+
46
+
47
@torch.no_grad()
def main(image: Image.Image) -> np.ndarray:
    """Segment an input image and return the prediction overlaid on it.

    Args:
        image: input PIL image uploaded through the Gradio UI.

    Returns:
        An RGB numpy array: the colourised predicted mask alpha-blended
        (50/50) with the resized input image.
    """
    pil_image: Image.Image = resize(image, size=size, max_size=max_size)
    image_t: torch.Tensor = normalize(to_tensor(pil_image), mean=list(mean), std=list(std))  # 3 x H x W

    # logits: b (=1) x n_categories x H x W, torch.float32
    logits: torch.Tensor = model(image_t[None].to(device))

    # pred: H x W array of per-pixel category indices.
    # Bug fix: convert to numpy exactly once. The original stored the numpy
    # result in `pred` and then called `pred.cpu().numpy()` a second time,
    # which raises AttributeError (numpy arrays have no .cpu()).
    pred: np.ndarray = logits.squeeze(dim=0).argmax(dim=0).cpu().numpy()
    coloured_pred: np.ndarray = colourise_mask(mask=pred)

    # Blend the coloured mask with the (resized) input image.
    super_imposed_img = cv2.addWeighted(coloured_pred, 0.5, np.array(pil_image), 0.5, 0)
    return super_imposed_img
72
+
73
+
74
# Read the HTML description once at startup. Fix: use a context manager so
# the file handle is closed (the original used codecs.open(...).read()).
with codecs.open("description.html", 'r', "utf-8") as f:
    description: str = f.read()

# Build the Gradio demo. The gr.inputs / gr.outputs namespaces match the
# gradio 2.x API pinned in README.md (sdk_version: 2.9.0).
demo = gr.Interface(
    fn=main,
    inputs=gr.inputs.Image(type="pil", source="upload", tool="editor"),
    outputs=gr.outputs.Image(type="numpy", label="prediction"),
    examples=[f"images/{fname}.jpg" for fname in [
        "2007_002260",
        "2008_002536",
        "2008_003499",
        "2008_007814",
        "2009_004801",
        "2010_001079",
        "2010_005063"
    ]],
    examples_per_page=10,
    description=description,
    title="NamedMask: Distilling Segmenters from Complementary Foundation Models",
    allow_flagging="never",
    analytics_enabled=False
)

demo.launch(
    # share=True  # uncomment to expose a public link
)
description.html ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>NamedMask Demo</title>
6
+ </head>
7
+ <body>
8
+ This is a demo of <a href="https://arxiv.org/pdf/2209.11228.pdf">NamedMask: Distilling Segmenters from Complementary Foundation Models</a>.<br/>
9
+ The goal of this work is to segment and name regions of images without access to pixel-level labels during training.
10
+ To tackle this task, we construct segmenters by distilling the complementary strengths of two foundation models.
11
+ The first, CLIP (Radford et al. 2021), exhibits the ability to assign names to image content but lacks an accessible representation of object structure.
12
+ The second, DINO (Caron et al. 2021), captures the spatial extent of objects but has no knowledge of object names.
13
+ Our method, termed NamedMask, begins by using CLIP to construct category-specific archives of images.
14
+ These images are pseudo-labelled with a category-agnostic salient object detector bootstrapped from DINO, then refined by category-specific segmenters using the CLIP archive labels.
15
+ Thanks to the high quality of the refined masks, we show that a standard segmentation architecture trained on these archives with appropriate data augmentation achieves impressive semantic segmentation abilities for both single-object and multi-object images.
16
+ As a result, our proposed NamedMask performs favourably against a range of prior work on five benchmarks including the VOC2012, COCO and large-scale ImageNet-S datasets.
17
+ Code is publicly available at <a href="https://github.com/NoelShin/namedmask">our repo</a>.
18
+ </body>
19
+ </html>
images/2007_002260.jpg ADDED
images/2008_002536.jpg ADDED
images/2008_003499.jpg ADDED
images/2008_007814.jpg ADDED
images/2009_004801.jpg ADDED
images/2010_001079.jpg ADDED
images/2010_005063.jpg ADDED
networks/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from networks.modeling import *
2
+ from networks._deeplab import convert_to_separable_conv, set_bn_momentum
networks/_deeplab.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from torch.nn import functional as F
4
+
5
+ from networks.deeplab.utils import _SimpleSegmentationModel
6
+
7
+ __all__ = ["DeepLabV3"]
8
+
9
+
10
class DeepLabV3(_SimpleSegmentationModel):
    """DeepLabV3 segmentation model.

    From `"Rethinking Atrous Convolution for Semantic Image Segmentation"
    <https://arxiv.org/abs/1706.05587>`_. All behaviour is inherited from
    _SimpleSegmentationModel; this subclass only names the architecture.

    Arguments:
        backbone (nn.Module): feature extractor returning an
            OrderedDict[Tensor] with key "out" for the last feature map used,
            and "aux" when an auxiliary classifier is attached.
        classifier (nn.Module): maps the "out" feature map to a dense
            prediction.
        aux_classifier (nn.Module, optional): auxiliary classifier used
            during training.
    """
    pass
25
+
26
+
27
class DeepLabHeadV3Plus(nn.Module):
    """DeepLabV3+ decoder head.

    Fuses a 48-channel projection of a low-level feature map with upsampled
    ASPP output, then classifies the concatenation.
    """

    def __init__(self, in_channels, low_level_channels, num_classes, aspp_dilate=[12, 24, 36]):
        super(DeepLabHeadV3Plus, self).__init__()
        # 1x1 projection compressing low-level features to 48 channels.
        self.project = nn.Sequential(
            nn.Conv2d(low_level_channels, 48, 1, bias=False),
            nn.BatchNorm2d(48),
            nn.ReLU(inplace=True),
        )

        self.aspp = ASPP(in_channels, aspp_dilate)

        # 304 = 48 projected low-level channels + 256 ASPP channels.
        self.classifier = nn.Sequential(
            nn.Conv2d(304, 256, 3, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, num_classes, 1),
        )
        self._init_weight()

    def forward(self, feature):
        # Upsample the ASPP context to the low-level resolution and fuse.
        skip = self.project(feature['low_level'])
        context = self.aspp(feature['out'])
        context = F.interpolate(context, size=skip.shape[2:], mode='bilinear', align_corners=False)
        return self.classifier(torch.cat([skip, context], dim=1))

    def _init_weight(self):
        # Kaiming init for convs, unit-gain init for norm layers.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
60
+
61
+
62
class DeepLabHead(nn.Module):
    """Plain DeepLabV3 head: ASPP followed by a 3x3 conv classifier."""

    def __init__(self, in_channels, num_classes, aspp_dilate=[12, 24, 36]):
        super(DeepLabHead, self).__init__()
        self.classifier = nn.Sequential(
            ASPP(in_channels, aspp_dilate),
            nn.Conv2d(256, 256, 3, padding=1, bias=False),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, num_classes, 1),
        )
        self._init_weight()

    def forward(self, feature):
        # Only the deepest backbone feature map is used by this head.
        return self.classifier(feature['out'])

    def _init_weight(self):
        # Kaiming init for convs, unit-gain init for norm layers.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight)
            elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
85
+
86
+
87
+ class AtrousSeparableConvolution(nn.Module):
88
+ """ Atrous Separable Convolution
89
+ """
90
+
91
+ def __init__(self, in_channels, out_channels, kernel_size,
92
+ stride=1, padding=0, dilation=1, bias=True):
93
+ super(AtrousSeparableConvolution, self).__init__()
94
+ self.body = nn.Sequential(
95
+ # Separable Conv
96
+ nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=stride, padding=padding,
97
+ dilation=dilation, bias=bias, groups=in_channels),
98
+ # PointWise Conv
99
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias),
100
+ )
101
+
102
+ self._init_weight()
103
+
104
+ def forward(self, x):
105
+ return self.body(x)
106
+
107
+ def _init_weight(self):
108
+ for m in self.modules():
109
+ if isinstance(m, nn.Conv2d):
110
+ nn.init.kaiming_normal_(m.weight)
111
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
112
+ nn.init.constant_(m.weight, 1)
113
+ nn.init.constant_(m.bias, 0)
114
+
115
+
116
+ class ASPPConv(nn.Sequential):
117
+ def __init__(self, in_channels, out_channels, dilation):
118
+ modules = [
119
+ nn.Conv2d(in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False),
120
+ nn.BatchNorm2d(out_channels),
121
+ nn.ReLU(inplace=True)
122
+ ]
123
+ super(ASPPConv, self).__init__(*modules)
124
+
125
+
126
+ class ASPPPooling(nn.Sequential):
127
+ def __init__(self, in_channels, out_channels):
128
+ super(ASPPPooling, self).__init__(
129
+ nn.AdaptiveAvgPool2d(1),
130
+ nn.Conv2d(in_channels, out_channels, 1, bias=False),
131
+ nn.BatchNorm2d(out_channels),
132
+ nn.ReLU(inplace=True))
133
+
134
+ def forward(self, x):
135
+ size = x.shape[-2:]
136
+ x = super(ASPPPooling, self).forward(x)
137
+ return F.interpolate(x, size=size, mode='bilinear', align_corners=False)
138
+
139
+
140
class ASPP(nn.Module):
    """Atrous Spatial Pyramid Pooling.

    Five parallel branches — a 1x1 conv, three dilated 3x3 convs (one per
    atrous rate), and an image-level pooling branch — whose outputs are
    concatenated and projected back down to 256 channels.
    """

    def __init__(self, in_channels, atrous_rates):
        super(ASPP, self).__init__()
        out_channels = 256

        # Branch 0: plain 1x1 conv.
        branches = [nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True))]

        # Branches 1-3: dilated convs; branch 4: image pooling.
        rate1, rate2, rate3 = tuple(atrous_rates)
        branches.append(ASPPConv(in_channels, out_channels, rate1))
        branches.append(ASPPConv(in_channels, out_channels, rate2))
        branches.append(ASPPConv(in_channels, out_channels, rate3))
        branches.append(ASPPPooling(in_channels, out_channels))

        self.convs = nn.ModuleList(branches)

        # Project the 5-branch concatenation back to out_channels.
        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1), )

    def forward(self, x):
        pyramid = [branch(x) for branch in self.convs]
        return self.project(torch.cat(pyramid, dim=1))
170
+
171
+
172
def convert_to_separable_conv(module):
    """Recursively replace every Conv2d with kernel size > 1 by an
    AtrousSeparableConvolution of the same configuration.

    Note: replacements are freshly initialised — the trained weights of the
    original convolutions are not transferred.

    Returns:
        The converted module (a new module where a conv was replaced,
        otherwise the input module with converted children).
    """
    new_module = module
    if isinstance(module, nn.Conv2d) and module.kernel_size[0] > 1:
        # Bug fix: AtrousSeparableConvolution expects a boolean `bias` flag.
        # The original passed `module.bias` (an nn.Parameter or None);
        # truth-testing a multi-element Parameter raises a RuntimeError
        # inside nn.Conv2d, so convs carrying a bias could not be converted.
        new_module = AtrousSeparableConvolution(module.in_channels,
                                                module.out_channels,
                                                module.kernel_size,
                                                module.stride,
                                                module.padding,
                                                module.dilation,
                                                module.bias is not None)
    for name, child in module.named_children():
        new_module.add_module(name, convert_to_separable_conv(child))
    return new_module
185
+
186
+
187
def set_bn_momentum(model, momentum=0.1):
    """Set the running-stats momentum of every BatchNorm2d inside *model*."""
    for module in model.modules():
        if isinstance(module, nn.BatchNorm2d):
            module.momentum = momentum
networks/backbone/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from networks.deeplab.backbone import resnet
2
+ from networks.deeplab.backbone import mobilenetv2
3
+ from networks.deeplab.backbone import hrnetv2
networks/backbone/hrnetv2.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ import os
5
+
6
+ __all__ = ['HRNet', 'hrnetv2_48', 'hrnetv2_32']
7
+
8
# Path to the pre-trained HRNetV2-32 backbone checkpoint (edit to your path).
# Download from:
# https://drive.google.com/file/d/1NxCK7Zgn5PmeS7W1jYLt5J9E0RRZ2oyF/view?usp=sharing
# and place under ./checkpoints. Only usable for HRNetv2-32
# (HRNetv2-48 weights are not available yet; train from scratch instead).
CKPT_PATH = './checkpoints/hrnetv2_32_model_best_epoch96.pth'
# Bug fix: the original wrapped this assignment in try/except with a bare
# except, but assigning a string literal can never raise, so the
# "checkpoint missing" branch was dead code. An explicit existence check
# restores the intended behaviour.
if os.path.isfile(CKPT_PATH):
    print(f"Backbone HRNet Pretrained weights at: {CKPT_PATH}, only usable for HRNetv2-32")
else:
    print("No backbone checkpoint found for HRNetv2, please set pretrained=False when calling model")
16
+
17
+ # HRNetv2-48 not available yet, but you can train the whole model from scratch.
18
+
19
+ class Bottleneck(nn.Module):
20
+ expansion = 4
21
+
22
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
23
+ super(Bottleneck, self).__init__()
24
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
25
+ self.bn1 = nn.BatchNorm2d(planes)
26
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
27
+ self.bn2 = nn.BatchNorm2d(planes)
28
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, kernel_size=1, bias=False)
29
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
30
+ self.relu = nn.ReLU(inplace=True)
31
+ self.downsample = downsample
32
+
33
+ def forward(self, x):
34
+ identity = x
35
+
36
+ out = self.conv1(x)
37
+ out = self.bn1(out)
38
+ out = self.relu(out)
39
+ out = self.conv2(out)
40
+ out = self.bn2(out)
41
+ out = self.relu(out)
42
+ out = self.conv3(out)
43
+ out = self.bn3(out)
44
+
45
+ if self.downsample is not None:
46
+ identity = self.downsample(x)
47
+
48
+ out += identity
49
+ out = self.relu(out)
50
+
51
+ return out
52
+
53
+
54
class BasicBlock(nn.Module):
    """ResNet basic residual block: two 3x3 convs, no channel expansion."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        # Bug fix: conv2 consumes the output of conv1, which has `planes`
        # channels — the original declared it with `inplanes` input channels,
        # which crashes whenever inplanes != planes. (Latent in this repo:
        # StageModule always builds BasicBlock(channels, channels).)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        # Optional projection matching the identity to the output shape.
        self.downsample = downsample

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
82
+
83
+
84
class StageModule(nn.Module):
    """One HRNet stage: parallel multi-resolution branches followed by full
    cross-resolution fusion.

    Args:
        stage: stage index; also the number of parallel branches.
        output_branches: number of fused outputs to produce.
        c: channel width of the highest-resolution branch; branch i carries
            c * 2**i channels at 1/2**i of the base resolution.
    """

    def __init__(self, stage, output_branches, c):
        super(StageModule, self).__init__()

        self.number_of_branches = stage
        self.output_branches = output_branches

        # Per-branch trunk: four BasicBlocks that preserve both resolution
        # and channel width (the paper's x4 basic-block sequence).
        self.branches = nn.ModuleList()
        for branch_idx in range(self.number_of_branches):
            width = c * (2 ** branch_idx)
            self.branches.append(nn.Sequential(*[BasicBlock(width, width) for _ in range(4)]))

        # fuse_layers[o][j] maps branch j's output onto branch o's
        # resolution/width so all branch outputs can be summed into output o.
        self.fuse_layers = nn.ModuleList()
        for out_idx in range(self.output_branches):
            per_output = nn.ModuleList()
            for in_idx in range(self.number_of_branches):
                if in_idx == out_idx:
                    # Identity; an empty Sequential is used because it is callable.
                    per_output.append(nn.Sequential())
                elif in_idx > out_idx:
                    # Lower-resolution input: 1x1 conv to shrink channels,
                    # then nearest-neighbour upsample.
                    per_output.append(nn.Sequential(
                        nn.Conv2d(c * (2 ** in_idx), c * (2 ** out_idx), kernel_size=1, stride=1,
                                  bias=False),
                        nn.BatchNorm2d(c * (2 ** out_idx), eps=1e-05, momentum=0.1, affine=True,
                                       track_running_stats=True),
                        nn.Upsample(scale_factor=(2.0 ** (in_idx - out_idx)), mode='nearest'),
                    ))
                else:
                    # Higher-resolution input: chain of stride-2 3x3 convs;
                    # only the last step changes the channel count.
                    down_steps = []
                    for _ in range(out_idx - in_idx - 1):
                        down_steps.append(nn.Sequential(
                            nn.Conv2d(c * (2 ** in_idx), c * (2 ** in_idx), kernel_size=3, stride=2,
                                      padding=1,
                                      bias=False),
                            nn.BatchNorm2d(c * (2 ** in_idx), eps=1e-05, momentum=0.1, affine=True,
                                           track_running_stats=True),
                            nn.ReLU(inplace=True),
                        ))
                    down_steps.append(nn.Sequential(
                        nn.Conv2d(c * (2 ** in_idx), c * (2 ** out_idx), kernel_size=3,
                                  stride=2, padding=1,
                                  bias=False),
                        nn.BatchNorm2d(c * (2 ** out_idx), eps=1e-05, momentum=0.1, affine=True,
                                       track_running_stats=True),
                    ))
                    per_output.append(nn.Sequential(*down_steps))
            self.fuse_layers.append(per_output)

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        # x is a list with one tensor per branch.
        x = [branch(branch_input) for branch, branch_input in zip(self.branches, x)]

        # Sum every (resampled) branch into each output branch.
        fused = []
        for out_idx in range(self.output_branches):
            for in_idx in range(self.number_of_branches):
                contribution = self.fuse_layers[out_idx][in_idx](x[in_idx])
                if in_idx == 0:
                    fused.append(contribution)
                else:
                    fused[out_idx] = fused[out_idx] + contribution

        # Final non-linearity on each fused output.
        return [self.relu(t) for t in fused]
162
+
163
+
164
class HRNet(nn.Module):
    """HRNetV2 backbone with a classification head.

    Args:
        c: width of the highest-resolution branch (48 -> HRNetV2-W48,
            32 -> HRNetV2-W32).
        num_blocks: number of StageModules in stages 2, 3 and 4 respectively.
        num_classes: size of the final classification layer.

    The attribute names and module nesting mirror the official pretrained
    checkpoints, so state_dicts remain loadable.
    """

    def __init__(self, c=48, num_blocks=[1, 4, 3], num_classes=1000):
        super(HRNet, self).__init__()

        # Stem: two stride-2 3x3 convs -> 1/4 input resolution, 64 channels.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64, eps=1e-05, affine=True, track_running_stats=True)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(64, eps=1e-05, affine=True, track_running_stats=True)
        self.relu = nn.ReLU(inplace=True)

        # Stage 1: four residual bottlenecks; Bottleneck.expansion grows the
        # width from 64 to 256 channels, hence the 1x1 projection shortcut.
        downsample = nn.Sequential(
            nn.Conv2d(64, 256, kernel_size=1, stride=1, bias=False),
            nn.BatchNorm2d(256, eps=1e-05, affine=True, track_running_stats=True),
        )
        bn_expansion = Bottleneck.expansion
        self.layer1 = nn.Sequential(
            Bottleneck(64, 64, downsample=downsample),
            Bottleneck(bn_expansion * 64, 64),
            Bottleneck(bn_expansion * 64, 64),
            Bottleneck(bn_expansion * 64, 64),
        )

        # Transition 1: fan out into a full-resolution branch (c channels)
        # and a half-resolution branch (2c channels).
        self.transition1 = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(256, c, kernel_size=3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(c, eps=1e-05, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            ),
            nn.Sequential(nn.Sequential(  # double Sequential matches official pretrained weight names
                nn.Conv2d(256, c * 2, kernel_size=3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(c * 2, eps=1e-05, affine=True, track_running_stats=True),
                nn.ReLU(inplace=True),
            )),
        ])

        # Stages 2-4, each preceded by a transition that derives one extra,
        # lower-resolution branch from the current lowest-resolution one.
        self.stage2 = nn.Sequential(
            *[StageModule(stage=2, output_branches=2, c=c) for _ in range(num_blocks[0])])
        self.transition2 = self._make_transition_layers(c, transition_number=2)

        self.stage3 = nn.Sequential(
            *[StageModule(stage=3, output_branches=3, c=c) for _ in range(num_blocks[1])])
        self.transition3 = self._make_transition_layers(c, transition_number=3)

        self.stage4 = nn.Sequential(
            *[StageModule(stage=4, output_branches=4, c=c) for _ in range(num_blocks[2])])

        # Classification head: concatenate all upsampled branches, reduce
        # channels 4x, pool to 8x8, flatten, classify.
        out_channels = sum([c * 2 ** i for i in range(len(num_blocks) + 1)])  # total HRNetV2 width
        pool_feature_map = 8
        self.bn_classifier = nn.Sequential(
            nn.Conv2d(out_channels, out_channels // 4, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels // 4, eps=1e-05, affine=True, track_running_stats=True),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(pool_feature_map),
            nn.Flatten(),
            nn.Linear(pool_feature_map * pool_feature_map * (out_channels // 4), num_classes),
        )

    @staticmethod
    def _make_transition_layers(c, transition_number):
        # Stride-2 conv creating the next (half-resolution, double-width) branch.
        return nn.Sequential(
            nn.Conv2d(c * (2 ** (transition_number - 1)), c * (2 ** transition_number), kernel_size=3, stride=2,
                      padding=1, bias=False),
            nn.BatchNorm2d(c * (2 ** transition_number), eps=1e-05, affine=True,
                           track_running_stats=True),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        # Stem
        x = self.relu(self.bn1(self.conv1(x)))
        x = self.relu(self.bn2(self.conv2(x)))

        # Stage 1, then split into two parallel branches.
        x = self.layer1(x)
        x = [trans(x) for trans in self.transition1]

        # Stages 2-4; each transition appends a new lowest-resolution branch.
        x = self.stage2(x)
        x.append(self.transition2(x[-1]))

        x = self.stage3(x)
        x.append(self.transition3(x[-1]))

        x = self.stage4(x)

        # HRNetV2 head: bilinearly upsample every branch to the highest
        # resolution and concatenate (rather than summing as in HRNetV1).
        target_size = (x[0].size(2), x[0].size(3))
        upsampled = [x[0]] + [
            F.interpolate(t, size=target_size, mode='bilinear', align_corners=False)
            for t in x[1:]
        ]
        return self.bn_classifier(torch.cat(upsampled, dim=1))
282
+
283
+
284
def _hrnet(arch, channels, num_blocks, pretrained, progress, **kwargs):
    """Build an HRNet and optionally load backbone weights from CKPT_PATH.

    NOTE(review): `arch` and `progress` are currently unused; they are kept
    for API symmetry with torchvision-style builder functions.
    """
    model = HRNet(channels, num_blocks, **kwargs)
    if pretrained:
        print("Loading pretrained backbone HRNetV2 model .....")
        checkpoint = torch.load(CKPT_PATH)
        model.load_state_dict(checkpoint['state_dict'])
    return model
291
+
292
+
293
def hrnetv2_48(pretrained=False, progress=True, number_blocks=[1, 4, 3], **kwargs):
    """HRNetV2-W48: highest-resolution branch is 48 channels wide."""
    return _hrnet('hrnetv2_48', 48, number_blocks, pretrained, progress, **kwargs)
297
+
298
+
299
def hrnetv2_32(pretrained=False, progress=True, number_blocks=[1, 4, 3], **kwargs):
    """HRNetV2-W32: highest-resolution branch is 32 channels wide."""
    return _hrnet('hrnetv2_32', 32, number_blocks, pretrained, progress, **kwargs)
303
+
304
+
305
if __name__ == '__main__':
    # When run directly, the checkpoint lives relative to this file rather
    # than the project root, so CKPT_PATH is overridden here.
    # Bug fix: the original wrapped this assignment in try/except with a bare
    # except, but the assignment cannot raise, so the fallback branch was
    # dead code; an explicit existence check restores the intended behaviour.
    CKPT_PATH = os.path.join(os.path.abspath("."), '../../checkpoints/hrnetv2_32_model_best_epoch96.pth')
    print("--- Running file as MAIN ---")
    if os.path.isfile(CKPT_PATH):
        print(f"Backbone HRNET Pretrained weights as __main__ at: {CKPT_PATH}")
    else:
        print("No backbone checkpoint found for HRNetv2, please set pretrained=False when calling model")

    # Smoke test: build the model and run one forward pass.
    model = hrnetv2_32(pretrained=True)
    # model = hrnetv2_48(pretrained=False)

    if torch.cuda.is_available():
        torch.backends.cudnn.deterministic = True
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    model.to(device)

    dummy_input = torch.ones(1, 3, 768, 768).to(device)
    output = model(dummy_input)
    print(output.shape)

    # Total trainable parameters:
    # print(sum(p.numel() for p in model.parameters() if p.requires_grad))
networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ try: # for torchvision<0.4
3
+ from torchvision.models.utils import load_state_dict_from_url
4
+ except: # for torchvision>=0.4
5
+ from torch.hub import load_state_dict_from_url
6
+ import torch.nn.functional as F
7
+
8
+ __all__ = ['MobileNetV2', 'mobilenet_v2']
9
+
10
+
11
+ model_urls = {
12
+ 'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth',
13
+ }
14
+
15
+
16
+ def _make_divisible(v, divisor, min_value=None):
17
+ """
18
+ This function is taken from the original tf repo.
19
+ It ensures that all layers have a channel number that is divisible by 8
20
+ It can be seen here:
21
+ https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
22
+ :param v:
23
+ :param divisor:
24
+ :param min_value:
25
+ :return:
26
+ """
27
+ if min_value is None:
28
+ min_value = divisor
29
+ new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
30
+ # Make sure that round down does not go down by more than 10%.
31
+ if new_v < 0.9 * v:
32
+ new_v += divisor
33
+ return new_v
34
+
35
+
36
class ConvBNReLU(nn.Sequential):
    """Conv2d -> BatchNorm2d -> ReLU6 with *no* internal padding.

    Padding is intentionally 0: callers apply explicit TF-style fixed padding
    (see ``fixed_padding``) before the convolution.
    """

    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, dilation=1, groups=1):
        conv = nn.Conv2d(in_planes, out_planes, kernel_size, stride, 0,
                         dilation=dilation, groups=groups, bias=False)
        super(ConvBNReLU, self).__init__(
            conv,
            nn.BatchNorm2d(out_planes),
            nn.ReLU6(inplace=True),
        )
44
+
45
def fixed_padding(kernel_size, dilation):
    """Compute TF-style 'SAME' padding for a dilated convolution.

    Returns a 4-tuple (left, right, top, bottom) suitable for ``F.pad`` so
    that a stride-1 convolution keeps the spatial size.
    """
    effective = kernel_size + (kernel_size - 1) * (dilation - 1)
    total = effective - 1
    before = total // 2
    after = total - before
    return (before, after, before, after)
51
+
52
class InvertedResidual(nn.Module):
    """MobileNetV2 inverted-residual block (expand -> depthwise -> project).

    The input is padded explicitly (TF 'SAME' style) because ConvBNReLU uses
    padding=0; the skip connection is taken only when stride is 1 and the
    channel count is unchanged.
    """

    def __init__(self, inp, oup, stride, dilation, expand_ratio):
        super(InvertedResidual, self).__init__()
        self.stride = stride
        assert stride in [1, 2]

        # Channels of the expanded (hidden) representation.
        hidden_dim = int(round(inp * expand_ratio))
        # Residual add is only valid when the shape is preserved.
        self.use_res_connect = self.stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            # pw: 1x1 pointwise expansion (skipped when expand_ratio == 1)
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))

        layers.extend([
            # dw: 3x3 depthwise conv (groups == channels), carries stride/dilation
            ConvBNReLU(hidden_dim, hidden_dim, stride=stride, dilation=dilation, groups=hidden_dim),
            # pw-linear: 1x1 projection with no activation (linear bottleneck)
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ])
        self.conv = nn.Sequential(*layers)

        # Padding sized for the 3x3 depthwise conv at this dilation.
        self.input_padding = fixed_padding( 3, dilation )

    def forward(self, x):
        # Pad first (ConvBNReLU itself pads with 0), then run the branch.
        x_pad = F.pad(x, self.input_padding)
        if self.use_res_connect:
            # Note: the skip uses the *unpadded* input, matching shapes at stride 1.
            return x + self.conv(x_pad)
        else:
            return self.conv(x_pad)
83
+
84
class MobileNetV2(nn.Module):
    def __init__(self, num_classes=1000, output_stride=8, width_mult=1.0, inverted_residual_setting=None, round_nearest=8):
        """
        MobileNet V2 main class
        Args:
            num_classes (int): Number of classes
            output_stride (int): total downsampling factor; once reached,
                further stages use dilation instead of stride
            width_mult (float): Width multiplier - adjusts number of channels in each layer by this amount
            inverted_residual_setting: Network structure
            round_nearest (int): Round the number of channels in each layer to be a multiple of this number
            Set to 1 to turn off rounding
        """
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        self.output_stride = output_stride
        current_stride = 1
        if inverted_residual_setting is None:
            inverted_residual_setting = [
                # t, c, n, s  (expansion, out channels, repeats, first stride)
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # only check the first element, assuming user knows t,c,n,s are required
        if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4:
            raise ValueError("inverted_residual_setting should be non-empty "
                             "or a 4-element list, got {}".format(inverted_residual_setting))

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.last_channel = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]
        current_stride *= 2
        dilation=1
        previous_dilation = 1

        # building inverted residual blocks
        for t, c, n, s in inverted_residual_setting:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            previous_dilation = dilation
            if current_stride == output_stride:
                # Target stride reached: keep resolution, grow dilation instead.
                stride = 1
                dilation *= s
            else:
                stride = s
                current_stride *= s
            # NOTE(review): this overrides the _make_divisible result computed
            # just above — presumably intentional to match existing pretrained
            # checkpoints; confirm before "fixing".
            output_channel = int(c * width_mult)

            for i in range(n):
                if i==0:
                    # First block of the stage carries the stride and the
                    # dilation that was in effect *before* this stage.
                    features.append(block(input_channel, output_channel, stride, previous_dilation, expand_ratio=t))
                else:
                    features.append(block(input_channel, output_channel, 1, dilation, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1))
        # make it nn.Sequential
        self.features = nn.Sequential(*features)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, num_classes),
        )

        # weight initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Features -> global average pool over H,W -> linear classifier."""
        x = self.features(x)
        x = x.mean([2, 3])
        x = self.classifier(x)
        return x
173
+
174
+
175
def mobilenet_v2(pretrained=False, progress=True, **kwargs):
    """
    Constructs a MobileNetV2 architecture from
    `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" <https://arxiv.org/abs/1801.04381>`_.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    net = MobileNetV2(**kwargs)
    if pretrained:
        weights = load_state_dict_from_url(model_urls['mobilenet_v2'],
                                           progress=progress)
        net.load_state_dict(weights)
    return net
networks/backbone/resnet.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ try: # for torchvision<0.4
4
+ from torchvision.models.utils import load_state_dict_from_url
5
+ except: # for torchvision>=0.4
6
+ from torch.hub import load_state_dict_from_url
7
+
8
+
9
+ __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
10
+ 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
11
+ 'wide_resnet50_2', 'wide_resnet101_2']
12
+
13
+
14
+ model_urls = {
15
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
16
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
17
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
18
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
19
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
20
+ 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
21
+ 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
22
+ 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
23
+ 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
24
+ }
25
+
26
+
27
def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution; padding equals dilation so size is kept at stride 1."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
31
+
32
+
33
def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution (channel projection, no padding, no bias)."""
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
36
+
37
+
38
class BasicBlock(nn.Module):
    """Two-conv residual block used by ResNet-18/34 (channel expansion 1)."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """conv-bn-relu, conv-bn, add the (optionally downsampled) skip, relu."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            # Project the skip so shapes match for the addition.
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
76
+
77
+
78
class Bottleneck(nn.Module):
    """1x1 reduce -> 3x3 -> 1x1 expand residual block (expansion 4),
    used by ResNet-50 and deeper variants."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        # Inner width scales with base_width and groups (ResNeXt / wide variants).
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Three conv-bn stages, skip added before the final relu."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            # Project the skip so shapes match for the addition.
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out
119
+
120
+
121
class ResNet(nn.Module):
    """Torchvision-style ResNet.

    ``replace_stride_with_dilation`` lets layers 2-4 trade stride for dilation,
    which is how the DeepLab builders in this repo control output stride.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        # Stem: 7x7 stride-2 conv + 3x3 stride-2 max pool (overall stride 4).
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Build one residual stage of `blocks` blocks.

        When `dilate` is True the stage's stride is converted into dilation,
        keeping spatial resolution constant.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection on the skip path when shape changes.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Stem -> 4 stages -> global average pool -> fully-connected logits."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x
214
+
215
+
216
def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Build a ResNet variant and optionally load its ImageNet weights."""
    net = ResNet(block, layers, **kwargs)
    if not pretrained:
        return net
    state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
    net.load_state_dict(state_dict)
    return net
223
+
224
+
225
def resnet18(pretrained=False, progress=True, **kwargs):
    r"""ResNet-18 from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    stage_depths = [2, 2, 2, 2]
    return _resnet('resnet18', BasicBlock, stage_depths, pretrained, progress, **kwargs)
234
+
235
+
236
def resnet34(pretrained=False, progress=True, **kwargs):
    r"""ResNet-34 from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnet34', BasicBlock, stage_depths, pretrained, progress, **kwargs)
245
+
246
+
247
def resnet50(pretrained=False, progress=True, **kwargs):
    r"""ResNet-50 from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    stage_depths = [3, 4, 6, 3]
    return _resnet('resnet50', Bottleneck, stage_depths, pretrained, progress, **kwargs)
256
+
257
+
258
def resnet101(pretrained=False, progress=True, **kwargs):
    r"""ResNet-101 from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    stage_depths = [3, 4, 23, 3]
    return _resnet('resnet101', Bottleneck, stage_depths, pretrained, progress, **kwargs)
267
+
268
+
269
def resnet152(pretrained=False, progress=True, **kwargs):
    r"""ResNet-152 from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    stage_depths = [3, 8, 36, 3]
    return _resnet('resnet152', Bottleneck, stage_depths, pretrained, progress, **kwargs)
278
+
279
+
280
def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-50 32x4d from `"Aggregated Residual Transformation for Deep
    Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # Cardinality 32, bottleneck base width 4.
    kwargs.update(groups=32, width_per_group=4)
    return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)
291
+
292
+
293
def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
    r"""ResNeXt-101 32x8d from `"Aggregated Residual Transformation for Deep
    Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    # Cardinality 32, bottleneck base width 8.
    kwargs.update(groups=32, width_per_group=8)
    return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)
304
+
305
+
306
def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-50-2 from `"Wide Residual Networks"
    <https://arxiv.org/pdf/1605.07146.pdf>`_.

    Identical to ResNet-50 except that every bottleneck's inner 3x3 conv is
    twice as wide (outer 1x1 widths are unchanged: e.g. the last block is
    2048-1024-2048 instead of 2048-512-2048).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(width_per_group=64 * 2)
    return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
                   pretrained, progress, **kwargs)
320
+
321
+
322
def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
    r"""Wide ResNet-101-2 from `"Wide Residual Networks"
    <https://arxiv.org/pdf/1605.07146.pdf>`_.

    Identical to ResNet-101 except that every bottleneck's inner 3x3 conv is
    twice as wide (outer 1x1 widths are unchanged).

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
    """
    kwargs.update(width_per_group=64 * 2)
    return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
                   pretrained, progress, **kwargs)
networks/modeling.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from networks.deeplab.utils import IntermediateLayerGetter
2
+ from networks.deeplab._deeplab import DeepLabHead, DeepLabHeadV3Plus, DeepLabV3
3
+ from networks.deeplab.backbone import resnet, mobilenetv2, hrnetv2
4
+
5
+
6
def _segm_hrnet(name, backbone_name, num_classes, pretrained_backbone):
    """Assemble a DeepLab head ('deeplabv3' or 'deeplabv3plus') on an HRNetV2
    backbone. `backbone_name` is e.g. 'hrnetv2_32' or 'hrnetv2_48'."""
    backbone = hrnetv2.__dict__[backbone_name](pretrained_backbone)
    # HRNetV2 config:
    # the final output channels is dependent on highest resolution channel config (c).
    # output of backbone will be the inplanes to assp:
    hrnet_channels = int(backbone_name.split('_')[-1])
    # Concatenation of the four streams: c + 2c + 4c + 8c.
    inplanes = sum([hrnet_channels * 2 ** i for i in range(4)])
    low_level_planes = 256 # all hrnet version channel output from bottleneck is the same
    aspp_dilate = [12, 24, 36] # If follow paper trend, can put [24, 48, 72].

    # NOTE(review): if `name` is neither branch below, `classifier` (and for
    # deeplabv3 also `return_layers`) is left unbound and the code raises
    # NameError rather than a clear error message.
    if name == 'deeplabv3plus':
        return_layers = {'stage4': 'out', 'layer1': 'low_level'}
        classifier = DeepLabHeadV3Plus(inplanes, low_level_planes, num_classes, aspp_dilate)
    elif name == 'deeplabv3':
        return_layers = {'stage4': 'out'}
        classifier = DeepLabHead(inplanes, num_classes, aspp_dilate)

    # hrnet_flag=True makes the getter handle HRNet's multi-stream transitions.
    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers, hrnet_flag=True)
    model = DeepLabV3(backbone, classifier)
    return model
26
+
27
+
28
def _segm_resnet(name, backbone_name, num_classes, output_stride, pretrained_backbone):
    """Assemble a DeepLab head ('deeplabv3' or 'deeplabv3plus') on a ResNet
    backbone, using dilated convs to reach the requested output stride."""
    if output_stride == 8:
        # Dilate layers 3 and 4 -> overall stride 8; larger ASPP rates.
        replace_stride_with_dilation = [False, True, True]
        aspp_dilate = [12, 24, 36]
    else:
        # Dilate layer 4 only -> overall stride 16.
        replace_stride_with_dilation = [False, False, True]
        aspp_dilate = [6, 12, 18]

    backbone = resnet.__dict__[backbone_name](
        pretrained=pretrained_backbone,
        replace_stride_with_dilation=replace_stride_with_dilation)

    # ResNet bottleneck output widths: layer4 = 2048, layer1 = 256.
    inplanes = 2048
    low_level_planes = 256

    if name == 'deeplabv3plus':
        return_layers = {'layer4': 'out', 'layer1': 'low_level'}
        classifier = DeepLabHeadV3Plus(inplanes, low_level_planes, num_classes, aspp_dilate)
    elif name == 'deeplabv3':
        return_layers = {'layer4': 'out'}
        classifier = DeepLabHead(inplanes, num_classes, aspp_dilate)
    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)

    model = DeepLabV3(backbone, classifier)
    return model
53
+
54
+
55
def _segm_mobilenet(name, backbone_name, num_classes, output_stride, pretrained_backbone):
    """Assemble a DeepLab head ('deeplabv3' or 'deeplabv3plus') on a
    MobileNetV2 backbone split into low-/high-level feature stages."""
    if output_stride == 8:
        aspp_dilate = [12, 24, 36]
    else:
        aspp_dilate = [6, 12, 18]

    backbone = mobilenetv2.mobilenet_v2(pretrained=pretrained_backbone, output_stride=output_stride)

    # rename layers: split the feature stack and drop the unused pieces.
    backbone.low_level_features = backbone.features[0:4]
    backbone.high_level_features = backbone.features[4:-1]  # drops the final 1x1 ConvBNReLU
    backbone.features = None
    backbone.classifier = None

    # Channel widths at the split points of the default MobileNetV2 config.
    inplanes = 320
    low_level_planes = 24

    if name == 'deeplabv3plus':
        return_layers = {'high_level_features': 'out', 'low_level_features': 'low_level'}
        classifier = DeepLabHeadV3Plus(inplanes, low_level_planes, num_classes, aspp_dilate)
    elif name == 'deeplabv3':
        return_layers = {'high_level_features': 'out'}
        classifier = DeepLabHead(inplanes, num_classes, aspp_dilate)
    backbone = IntermediateLayerGetter(backbone, return_layers=return_layers)

    model = DeepLabV3(backbone, classifier)
    return model
82
+
83
+
84
def _load_model(arch_type, backbone, num_classes, output_stride, pretrained_backbone):
    """Dispatch to the backbone-specific segmentation model builder.

    `arch_type` is 'deeplabv3' or 'deeplabv3plus'; `backbone` selects the
    builder ('mobilenetv2', 'resnet*', or 'hrnetv2_*').
    """
    if backbone == 'mobilenetv2':
        return _segm_mobilenet(arch_type, backbone, num_classes, output_stride=output_stride,
                               pretrained_backbone=pretrained_backbone)
    if backbone.startswith('resnet'):
        return _segm_resnet(arch_type, backbone, num_classes, output_stride=output_stride,
                            pretrained_backbone=pretrained_backbone)
    if backbone.startswith('hrnetv2'):
        # HRNet's output stride is fixed by its architecture, so it is not passed.
        return _segm_hrnet(arch_type, backbone, num_classes, pretrained_backbone=pretrained_backbone)
    raise NotImplementedError
96
+
97
+
98
+ # Deeplab v3
99
def deeplabv3_hrnetv2_48(num_classes=21, output_stride=4, pretrained_backbone=False): # no pretrained backbone yet
    """Constructs a DeepLabV3 model with an HRNetV2-48 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): kept for API symmetry (HRNet ignores it).
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    # Bug fix: num_classes and output_stride were previously passed to
    # _load_model in swapped positions (its signature is
    # (arch_type, backbone, num_classes, output_stride, ...)).
    return _load_model('deeplabv3', 'hrnetv2_48', num_classes, output_stride,
                       pretrained_backbone=pretrained_backbone)
101
+
102
+
103
def deeplabv3_hrnetv2_32(num_classes=21, output_stride=4, pretrained_backbone=True):
    """Constructs a DeepLabV3 model with an HRNetV2-32 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): kept for API symmetry (HRNet ignores it).
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    # Bug fix: num_classes and output_stride were previously passed to
    # _load_model in swapped positions (its signature is
    # (arch_type, backbone, num_classes, output_stride, ...)).
    return _load_model('deeplabv3', 'hrnetv2_32', num_classes, output_stride,
                       pretrained_backbone=pretrained_backbone)
105
+
106
+
107
def deeplabv3_resnet50(num_classes=21, output_stride=8, pretrained_backbone=True):
    """Constructs a DeepLabV3 model with a ResNet-50 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3', 'resnet50', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
116
+
117
+
118
def deeplabv3_resnet101(num_classes=21, output_stride=8, pretrained_backbone=True):
    """Constructs a DeepLabV3 model with a ResNet-101 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3', 'resnet101', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
127
+
128
+
129
def deeplabv3_mobilenet(num_classes=21, output_stride=8, pretrained_backbone=True, **kwargs):
    """Constructs a DeepLabV3 model with a MobileNetv2 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.

    Note: extra keyword arguments are accepted but not forwarded anywhere.
    """
    return _load_model('deeplabv3', 'mobilenetv2', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
138
+
139
+
140
+ # Deeplab v3+
141
def deeplabv3plus_hrnetv2_48(num_classes=21, output_stride=4, pretrained_backbone=False): # no pretrained backbone yet
    """Constructs a DeepLabV3+ model with an HRNetV2-48 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): kept for API symmetry (HRNet ignores it).
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3plus', 'hrnetv2_48',
                       num_classes, output_stride,
                       pretrained_backbone=pretrained_backbone)
144
+
145
+
146
def deeplabv3plus_hrnetv2_32(num_classes=21, output_stride=4, pretrained_backbone=True):
    """Constructs a DeepLabV3+ model with an HRNetV2-32 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): kept for API symmetry (HRNet ignores it).
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3plus', 'hrnetv2_32',
                       num_classes, output_stride,
                       pretrained_backbone=pretrained_backbone)
149
+
150
+
151
def deeplabv3plus_resnet50(num_classes=21, output_stride=8, pretrained_backbone=True):
    """Constructs a DeepLabV3+ model with a ResNet-50 backbone.

    (The previous docstring said "DeepLabV3"; this builder produces V3+.)

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3plus', 'resnet50', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
160
+
161
+
162
def deeplabv3plus_resnet101(num_classes=21, output_stride=8, pretrained_backbone=True):
    """Constructs a DeepLabV3+ model with a ResNet-101 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3plus', 'resnet101', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
171
+
172
+
173
def deeplabv3plus_mobilenet(num_classes=21, output_stride=8, pretrained_backbone=True):
    """Constructs a DeepLabV3+ model with a MobileNetv2 backbone.

    Args:
        num_classes (int): number of classes.
        output_stride (int): output stride for deeplab.
        pretrained_backbone (bool): If True, use the pretrained backbone.
    """
    return _load_model('deeplabv3plus', 'mobilenetv2', num_classes,
                       output_stride=output_stride,
                       pretrained_backbone=pretrained_backbone)
networks/utils.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ import torch.nn.functional as F
5
+ from collections import OrderedDict
6
+
7
+
8
+ class _SimpleSegmentationModel(nn.Module):
9
+ def __init__(self, backbone, classifier):
10
+ super(_SimpleSegmentationModel, self).__init__()
11
+ self.backbone = backbone
12
+ self.classifier = classifier
13
+
14
+ def forward(self, x):
15
+ input_shape = x.shape[-2:]
16
+ features = self.backbone(x)
17
+ x = self.classifier(features)
18
+ x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
19
+ return x
20
+
21
+
22
class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model
    It has a strong assumption that the modules have been registered
    into the model in the same order as they are used.
    This means that one should **not** reuse the same nn.Module
    twice in the forward if you want this to work.
    Additionally, it is only able to query submodules that are directly
    assigned to the model. So if `model` is passed, `model.feature1` can
    be returned, but not `model.feature1.layer2`.
    Arguments:
        model (nn.Module): model on which we will extract the features
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).
        hrnet_flag (bool): if True, handle HRNet's multi-stream 'transition'
            modules and concatenate the four streams at 'stage4'.
    Examples::
        >>> m = torchvision.models.resnet18(pretrained=True)
        >>> # extract layer1 and layer3, giving as names `feat1` and feat2`
        >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
        >>>     {'layer1': 'feat1', 'layer3': 'feat2'})
        >>> out = new_m(torch.rand(1, 3, 224, 224))
        >>> print([(k, v.shape) for k, v in out.items()])
        >>> [('feat1', torch.Size([1, 64, 56, 56])),
        >>>  ('feat2', torch.Size([1, 256, 14, 14]))]
    """

    def __init__(self, model, return_layers, hrnet_flag=False):
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")

        # NOTE(review): assigned before super().__init__(); fine for a plain
        # bool, but any nn.Module/Parameter value would have to come after.
        self.hrnet_flag = hrnet_flag

        orig_return_layers = return_layers
        return_layers = {k: v for k, v in return_layers.items()}
        layers = OrderedDict()
        # Keep children (in registration order) only up to the last requested layer.
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            if not return_layers:
                break

        super(IntermediateLayerGetter, self).__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        out = OrderedDict()
        for name, module in self.named_children():
            if self.hrnet_flag and name.startswith('transition'): # if using hrnet, you need to take care of transition
                if name == 'transition1': # in transition1, you need to split the module to two streams first
                    # x becomes a list of per-resolution streams from here on.
                    x = [trans(x) for trans in module]
                else: # all other transition is just an extra one stream split
                    x.append(module(x[-1]))
            else: # other models (ex:resnet,mobilenet) are convolutions in series.
                x = module(x)

            if name in self.return_layers:
                out_name = self.return_layers[name]
                if name == 'stage4' and self.hrnet_flag: # In HRNetV2, we upsample and concat all outputs streams together
                    output_h, output_w = x[0].size(2), x[0].size(3) # Upsample to size of highest resolution stream
                    x1 = F.interpolate(x[1], size=(output_h, output_w), mode='bilinear', align_corners=False)
                    x2 = F.interpolate(x[2], size=(output_h, output_w), mode='bilinear', align_corners=False)
                    x3 = F.interpolate(x[3], size=(output_h, output_w), mode='bilinear', align_corners=False)
                    x = torch.cat([x[0], x1, x2, x3], dim=1)
                    out[out_name] = x
                else:
                    out[out_name] = x
        return out
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ opencv-contrib-python==4.5.5.62
2
+ torch==1.11.0
3
+ torchvision==0.12.0
4
+ timm==0.4.12
5
+ scipy==1.6.2
utils.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Tuple, Union
2
+ import numpy as np
3
+ import torch
4
+ from networks import deeplabv3plus_resnet50
5
+ from networks import convert_to_separable_conv, set_bn_momentum
6
+
7
+
8
def get_network() -> torch.nn.Module:
    """Build the NamedMask DeepLabv3+/ResNet-50 segmenter with released weights.

    Returns:
        A ``deeplabv3plus_resnet50`` network (21 VOC classes, no pretrained
        backbone) with the published NamedMask checkpoint loaded into its
        backbone, separable convolutions in the classifier head, and backbone
        BatchNorm momentum set to 0.01.
    """
    network = deeplabv3plus_resnet50(num_classes=21, pretrained_backbone=False)
    state_dict = torch.hub.load_state_dict_from_url(
        "https://www.robots.ox.ac.uk/~vgg/research/namedmask/shared_files/voc2012/namedmask_voc2012.pt",
        # fix: deserialise on CPU so CPU-only hosts (e.g. demo Spaces) don't
        # fail when the checkpoint was saved from CUDA tensors.
        map_location="cpu",
    )
    # NOTE(review): the whole checkpoint is loaded into the *backbone* only,
    # with strict=True — confirm the released file contains backbone keys only.
    network.backbone.load_state_dict(state_dict, strict=True)
    convert_to_separable_conv(network.classifier)
    set_bn_momentum(network.backbone, momentum=0.01)
    return network
17
+
18
+
19
# Standard PASCAL VOC 2012 colour palette (label id -> RGB); 255 is "ignore".
_VOC2012_PALETTE: Dict[int, List[int]] = {
    0: [0, 0, 0],
    1: [128, 0, 0],
    2: [0, 128, 0],
    3: [128, 128, 0],
    4: [0, 0, 128],
    5: [128, 0, 128],
    6: [0, 128, 128],
    7: [128, 128, 128],
    8: [64, 0, 0],
    9: [192, 0, 0],
    10: [64, 128, 0],
    11: [192, 128, 0],
    12: [64, 0, 128],
    13: [192, 0, 128],
    14: [64, 128, 128],
    15: [192, 128, 128],
    16: [0, 64, 0],
    17: [128, 64, 0],
    18: [0, 192, 0],
    19: [128, 192, 0],
    20: [0, 64, 128],
    255: [255, 255, 255]
}


def colourise_mask(
    mask: np.ndarray,
) -> np.ndarray:
    """Convert a 2D label mask into an RGB image using the VOC 2012 palette.

    Args:
        mask: (H, W) integer array of VOC label ids (0-20, or 255 for ignore).

    Returns:
        (H, W, 3) uint8 RGB array with each pixel coloured by its label.

    Raises:
        ValueError: if *mask* is not two-dimensional.
        KeyError: if the mask contains a label id not in the palette.
    """
    # fix: was `assert ..., ValueError(...)` — asserts are stripped under -O
    # and raise AssertionError, while the original clearly intended ValueError.
    if mask.ndim != 2:
        raise ValueError(f"Expected a 2D mask, got shape {mask.shape}")
    h, w = mask.shape
    grid = np.zeros((h, w, 3), dtype=np.uint8)

    for label in np.unique(mask):
        # fix: the original assigned once OUTSIDE the try (so an unknown label
        # escaped unguarded) and caught IndexError, but a missing dict key
        # raises KeyError — the handler could never fire.
        try:
            colour = _VOC2012_PALETTE[int(label)]
        except KeyError:
            raise KeyError(f"No colour is found for a label id: {label}") from None
        grid[mask == label] = np.array(colour)
    return grid
voc_val_n500_cp2_ex.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # base directories
2
+ category_to_p_images_fp: "/home/cs-shin1/datasets/ImageNet2012/voc2012_category_to_p_images_n500.json"
3
+ dir_ckpt: "/home/cs-shin1/namedmask/ckpt"
4
+ dir_train_dataset: "/home/cs-shin1/datasets/ImageNet2012"
5
+ dir_val_dataset: "/home/cs-shin1/datasets/VOCdevkit/VOC2012"
6
+
7
+ # augmentations
8
+ max_n_masks: 2
9
+ scale_range: [ 0.1, 1.0 ]
10
+
11
+ use_expert_pseudo_masks: true
12
+ category_agnostic: false
13
+
14
+ n_categories: 21
15
+ categories: [
16
+ "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "dining table",
17
+ "dog", "horse", "motorbike", "person", "potted plant", "sheep", "sofa", "train", "tv/monitor"
18
+ ]
19
+ n_images: 500
20
+
21
+ # dataset
22
+ dataset_name: "voc2012"
23
+ split: "val"
24
+ train_image_size: 384
25
+
26
+ # dataloader:
27
+ train_dataloader_kwargs:
28
+ batch_size: 16
29
+ num_workers: 16
30
+ pin_memory: true
31
+ shuffle: true
32
+
33
+ val_dataloader_kwargs:
34
+ batch_size: 1
35
+ num_workers: 4
36
+ pin_memory: true
37
+
38
+ # Segmenter configuration
39
+ # ["deeplabv3plus_resnet101", "deeplabv3plus_resnet50", "deeplabv3plus_mobilenet"]
40
+ segmenter_name: "deeplabv3plus_resnet50"
41
+
42
+ # optimiser
43
+ lr: 0.0005
44
+ momentum: 0.9
45
+ weight_decay: 0.0002
46
+ betas: [0.9, 0.999]
47
+ n_iters: 20000
48
+
49
+ iter_eval: 1000
50
+ iter_log: 100