diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..819bdb3de4f8c479c8a0aa621e3cdda68f9bed3b --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,24 @@ +from .yowo.build import build_yowo + + +def build_model(args, + d_cfg, + m_cfg, + device, + num_classes=3, + trainable=False, + resume=None): + # build action detector + if 'yowo_v2_' in args.version: + model, criterion = build_yowo( + args=args, + d_cfg=d_cfg, + m_cfg=m_cfg, + device=device, + num_classes=num_classes, + trainable=trainable, + resume=resume + ) + + return model, criterion + diff --git a/models/__pycache__/__init__.cpython-310.pyc b/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b3acd38b4ec64b1db5fb5754d28623c990b467c Binary files /dev/null and b/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/__pycache__/__init__.cpython-37.pyc b/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b20147f220b6ca2df4d7ad4cb748230bbd8efc5 Binary files /dev/null and b/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/__init__.py b/models/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f3a5153e798ecd8df364dd3c127d639b0aa4119b --- /dev/null +++ b/models/backbone/__init__.py @@ -0,0 +1,13 @@ +from .backbone_2d.backbone_2d import Backbone2D +from .backbone_3d.backbone_3d import Backbone3D + + +def build_backbone_2d(cfg, pretrained=False): + backbone = Backbone2D(cfg, pretrained) + return backbone, backbone.feat_dims + + +def build_backbone_3d(cfg, pretrained=False): + backbone = Backbone3D(cfg, pretrained) + return backbone, backbone.feat_dim + diff --git a/models/backbone/__pycache__/__init__.cpython-310.pyc b/models/backbone/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c36c9c2b39ea12160b96ff51a4b381158902e1bb Binary files /dev/null and b/models/backbone/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/__pycache__/__init__.cpython-37.pyc b/models/backbone/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c69df6694d2cccf6de591a4cf38a11588604641f Binary files /dev/null and b/models/backbone/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/__init__.py b/models/backbone/backbone_2d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fbfa4abfce6effb76b526810d570bf3f9938105 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..270f75632217613a7b732a068bc67ad716787e05 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5ae861007148c7c7e03e7f2b42f194cd01abe4d Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..63530b430b0798e51d18cdeab17719fb96091084 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/backbone_2d.py b/models/backbone/backbone_2d/backbone_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bd29ab84d0ecf995408ada88f5d7d89f7e5ad0 --- /dev/null +++ b/models/backbone/backbone_2d/backbone_2d.py @@ -0,0 +1,26 @@ +import torch.nn as nn +from .cnn_2d import build_2d_cnn + + +class Backbone2D(nn.Module): + def __init__(self, cfg, pretrained=False): + super().__init__() + self.cfg = cfg + + self.backbone, self.feat_dims = build_2d_cnn(cfg, pretrained) + + + def forward(self, x): + """ + Input: + x: (Tensor) -> [B, C, H, W] + Output: + y: (List) -> [ + (Tensor) -> [B, C1, H1, W1], + (Tensor) -> [B, C2, H2, W2], + (Tensor) -> [B, C3, H3, W3] + ] + """ + feat = self.backbone(x) + + return feat diff --git a/models/backbone/backbone_2d/cnn_2d/__init__.py b/models/backbone/backbone_2d/cnn_2d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a154b5eae138160bf94571987cc4fa7f39e0337 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/__init__.py @@ -0,0 +1,18 @@ +# import 2D backbone +from .yolo_free.yolo_free import build_yolo_free + + +def build_2d_cnn(cfg, pretrained=False): + print('==============================') + print('2D Backbone: {}'.format(cfg['backbone_2d'].upper())) + print('--pretrained: {}'.format(pretrained)) + + if cfg['backbone_2d'] in ['yolo_free_nano', 'yolo_free_tiny', \ + 'yolo_free_large', 'yolo_free_huge']: + model, feat_dims = build_yolo_free(cfg['backbone_2d'], pretrained) + + else: + print('Unknown 2D Backbone ...') + exit() + + return model, feat_dims diff --git a/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7342feccce6c85a08c1e24197bfa3e5c777eeed4 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..607f066d315135751754439bf8a36382aa892ad8 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__init__.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef62940c9d0c0e25f0a1f6aa385af15922c69dbf Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e72faac2da945664156f78974b9d4167e6a2ee47 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80119bd82a6b8e27c9760b2ce1bf8c4a6f107dc9 Binary files /dev/null and 
b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d71a11cdd2518366c7fdfba9a4b3beb4c0d0f35 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..087099479daff4b314f1c063c79c816ad019e03b Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..271c9338c44a1f877c78a404c00e56c39181b920 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f38fb3ec74167e45cf1425f421d2503c23020b7 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..8d7ba91096fdb0f55c53098a8627034c28c18cce Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9db270964648ff57b647256636684abc37ebda38 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e231f8ee486ae0460f887fe9e8b55eafd345f6 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e652ad3091bba56d0a3da0b655d202846be6df Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..172e558d58bca7493c5ec5a7e2c971b3eb048be8 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc differ diff --git 
a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f095d721ac01b5cb771de3fb84c49b02f7ac580e Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65408633fbceee510a698967fa95f2a06b4ff3d7 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py new file mode 100644 index 0000000000000000000000000000000000000000..75128153e569a50b1d2b9ebc82f33fd5294b12d9 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py @@ -0,0 +1,222 @@ +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from torch.hub import load_state_dict_from_url + +try: + from .yolo_free_backbone import build_backbone + from .yolo_free_neck import build_neck + from .yolo_free_fpn import build_fpn + from .yolo_free_head import build_head +except: + from yolo_free_backbone import build_backbone + from yolo_free_neck import build_neck + from yolo_free_fpn import build_fpn + from yolo_free_head import build_head + + +__all__ = ['build_yolo_free'] + + +model_urls = { + 'yolo_free_nano': 'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_nano_coco.pth', + 'yolo_free_tiny': 'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_tiny_coco.pth', + 'yolo_free_large': 
'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_large_coco.pth', +} + + +yolo_free_config = { + 'yolo_free_nano': { + # model + 'backbone': 'shufflenetv2_1.0x', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + 'anchor_size': None, + # neck + 'neck': 'sppf', + 'neck_dim': 232, + 'expand_ratio': 0.5, + 'pooling_size': 5, + 'neck_act': 'lrelu', + 'neck_norm': 'BN', + 'neck_depthwise': True, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'nano', + 'fpn_dim': [116, 232, 232], + 'fpn_norm': 'BN', + 'fpn_act': 'lrelu', + 'fpn_depthwise': True, + # head + 'head': 'decoupled_head', + 'head_dim': 64, + 'head_norm': 'BN', + 'head_act': 'lrelu', + 'num_cls_head': 2, + 'num_reg_head': 2, + 'head_depthwise': True, + }, + + 'yolo_free_tiny': { + # model + 'backbone': 'elannet_tiny', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + # neck + 'neck': 'spp_block_csp', + 'neck_dim': 256, + 'expand_ratio': 0.5, + 'pooling_size': [5, 9, 13], + 'neck_act': 'lrelu', + 'neck_norm': 'BN', + 'neck_depthwise': False, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'tiny', # 'tiny', 'large', 'huge + 'fpn_dim': [128, 256, 256], + 'fpn_norm': 'BN', + 'fpn_act': 'lrelu', + 'fpn_depthwise': False, + # head + 'head': 'decoupled_head', + 'head_dim': 64, + 'head_norm': 'BN', + 'head_act': 'lrelu', + 'num_cls_head': 2, + 'num_reg_head': 2, + 'head_depthwise': False, + }, + + 'yolo_free_large': { + # model + 'backbone': 'elannet_large', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + # neck + 'neck': 'spp_block_csp', + 'neck_dim': 512, + 'expand_ratio': 0.5, + 'pooling_size': [5, 9, 13], + 'neck_act': 'silu', + 'neck_norm': 'BN', + 'neck_depthwise': False, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'large', # 'tiny', 'large', 'huge + 'fpn_dim': [512, 1024, 512], + 'fpn_norm': 'BN', + 'fpn_act': 'silu', + 'fpn_depthwise': False, + # head + 'head': 'decoupled_head', + 'head_dim': 256, + 'head_norm': 'BN', + 'head_act': 'silu', + 'num_cls_head': 
2, + 'num_reg_head': 2, + 'head_depthwise': False, + }, + +} + + +# Anchor-free YOLO +class FreeYOLO(nn.Module): + def __init__(self, cfg): + super(FreeYOLO, self).__init__() + # --------- Basic Config ----------- + self.cfg = cfg + + # --------- Network Parameters ---------- + ## backbone + self.backbone, bk_dim = build_backbone(self.cfg['backbone']) + + ## neck + self.neck = build_neck(cfg=self.cfg, in_dim=bk_dim[-1], out_dim=self.cfg['neck_dim']) + + ## fpn + self.fpn = build_fpn(cfg=self.cfg, in_dims=self.cfg['fpn_dim'], out_dim=self.cfg['head_dim']) + + ## non-shared heads + self.non_shared_heads = nn.ModuleList( + [build_head(cfg) + for _ in range(len(cfg['stride'])) + ]) + + def forward(self, x): + # backbone + feats = self.backbone(x) + + # neck + feats['layer4'] = self.neck(feats['layer4']) + + # fpn + pyramid_feats = [feats['layer2'], feats['layer3'], feats['layer4']] + pyramid_feats = self.fpn(pyramid_feats) + + # non-shared heads + all_cls_feats = [] + all_reg_feats = [] + for feat, head in zip(pyramid_feats, self.non_shared_heads): + # [B, C, H, W] + cls_feat, reg_feat = head(feat) + + all_cls_feats.append(cls_feat) + all_reg_feats.append(reg_feat) + + return all_cls_feats, all_reg_feats + + +# build FreeYOLO +def build_yolo_free(model_name='yolo_free_large', pretrained=False): + # model config + cfg = yolo_free_config[model_name] + + # FreeYOLO + model = FreeYOLO(cfg) + feat_dims = [model.cfg['head_dim']] * 3 + + # Load COCO pretrained weight + if pretrained: + url = model_urls[model_name] + + # check + if url is None: + print('No 2D pretrained weight ...') + return model, feat_dims + else: + print('Loading 2D backbone pretrained weight: {}'.format(model_name.upper())) + + # state dict + checkpoint = load_state_dict_from_url(url, map_location='cpu') + checkpoint_state_dict = checkpoint.pop('model') + + # model state dict + model_state_dict = model.state_dict() + # check + for k in list(checkpoint_state_dict.keys()): + if k in model_state_dict: + 
shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(checkpoint_state_dict[k].shape) + if shape_model != shape_checkpoint: + # print(k) + checkpoint_state_dict.pop(k) + else: + checkpoint_state_dict.pop(k) + # print(k) + + model.load_state_dict(checkpoint_state_dict, strict=False) + + return model, feat_dims + + +if __name__ == '__main__': + model, fpn_dim = build_yolo_free(model_name='yolo_free_nano', pretrained=True) + model.eval() + + x = torch.randn(2, 3, 64, 64) + cls_feats, reg_feats = model(x) + + for cls_feat, reg_feat in zip(cls_feats, reg_feats): + print(cls_feat.shape, reg_feat.shape) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3b79c770c297dcc4678c945c20a2f13483c795 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py @@ -0,0 +1,445 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +__all__ = ['build_backbone'] + +# ====================== ELAN-Net ========================== +# ELANNet +def get_activation(act_type=None): + if act_type is None: + return nn.Identity() + elif act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +def get_norm(in_dim, norm_type=None): + if norm_type is None: + return nn.Identity() + elif norm_type == 'BN': + return nn.BatchNorm2d(in_dim) + elif norm_type == 'GN': + return nn.GroupNorm(32, in_dim) + elif norm_type == 'IN': + return nn.InstanceNorm2d(in_dim) + + +class Conv(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + act_type='silu', + norm_type='BN', # activation + depthwise=False): + 
super(Conv, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + # depthwise conv + convs.append(nn.Conv2d(c1, c1, kernel_size=k, stride=s, padding=p, dilation=d, groups=c1, bias=add_bias)) + convs.append(get_norm(c1, norm_type)) + convs.append(get_activation(act_type)) + + # pointwise conv + convs.append(nn.Conv2d(c1, c2, kernel_size=1, stride=s, padding=0, dilation=d, groups=1, bias=add_bias)) + convs.append(get_norm(c2, norm_type)) + convs.append(get_activation(act_type)) + + else: + convs.append(nn.Conv2d(c1, c2, kernel_size=k, stride=s, padding=p, dilation=d, groups=1, bias=add_bias)) + convs.append(get_norm(c2, norm_type)) + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + + +class ELANBlock(nn.Module): + """ + ELAN BLock of YOLOv7's backbone + """ + def __init__(self, in_dim, out_dim, expand_ratio=0.5, model_size='large', act_type='silu', depthwise=False): + super(ELANBlock, self).__init__() + inter_dim = int(in_dim * expand_ratio) + if model_size == 'tiny': + depth = 1 + elif model_size == 'large': + depth = 2 + elif model_size == 'huge': + depth = 3 + self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type) + self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type) + self.cv3 = nn.Sequential(*[ + Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, depthwise=depthwise) + for _ in range(depth) + ]) + self.cv4 = nn.Sequential(*[ + Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, depthwise=depthwise) + for _ in range(depth) + ]) + + self.out = Conv(inter_dim*4, out_dim, k=1) + + + + def forward(self, x): + """ + Input: + x: [B, C, H, W] + Output: + out: [B, 2C, H, W] + """ + x1 = self.cv1(x) + x2 = self.cv2(x) + x3 = self.cv3(x2) + x4 = self.cv4(x3) + + # [B, C, H, W] -> [B, 2C, H, W] + out = self.out(torch.cat([x1, x2, x3, x4], dim=1)) + + return out + + +class DownSample(nn.Module): + def __init__(self, in_dim, 
act_type='silu', norm_type='BN'): + super().__init__() + inter_dim = in_dim // 2 + self.mp = nn.MaxPool2d((2, 2), 2) + self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type) + self.cv2 = nn.Sequential( + Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type) + ) + + def forward(self, x): + """ + Input: + x: [B, C, H, W] + Output: + out: [B, C, H//2, W//2] + """ + # [B, C, H, W] -> [B, C//2, H//2, W//2] + x1 = self.cv1(self.mp(x)) + x2 = self.cv2(x) + + # [B, C, H//2, W//2] + out = torch.cat([x1, x2], dim=1) + + return out + + +# ELANNet-Tiny +class ELANNet_Tiny(nn.Module): + """ + ELAN-Net of YOLOv7-Tiny. + """ + def __init__(self, depthwise=False): + super(ELANNet_Tiny, self).__init__() + + # tiny backbone + self.layer_1 = Conv(3, 32, k=3, p=1, s=2, act_type='lrelu', depthwise=depthwise) # P1/2 + + self.layer_2 = nn.Sequential( + Conv(32, 64, k=3, p=1, s=2, act_type='lrelu', depthwise=depthwise), + ELANBlock(in_dim=64, out_dim=64, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P2/4 + ) + self.layer_3 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=64, out_dim=128, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P3/8 + ) + self.layer_4 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=128, out_dim=256, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P4/16 + ) + self.layer_5 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=256, out_dim=512, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P5/32 + ) + + + def forward(self, x): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + outputs = { + 'layer2': c3, + 'layer3': c4, + 'layer4': c5 + } + return outputs + + +# ELANNet-Large +class ELANNet_Large(nn.Module): 
+ """ + ELAN-Net of YOLOv7. + """ + def __init__(self, depthwise=False): + super(ELANNet_Large, self).__init__() + + # large backbone + self.layer_1 = nn.Sequential( + Conv(3, 32, k=3, p=1, act_type='silu', depthwise=depthwise), + Conv(32, 64, k=3, p=1, s=2, act_type='silu', depthwise=depthwise), + Conv(64, 64, k=3, p=1, act_type='silu', depthwise=depthwise) # P1/2 + ) + self.layer_2 = nn.Sequential( + Conv(64, 128, k=3, p=1, s=2, act_type='silu', depthwise=depthwise), + ELANBlock(in_dim=128, out_dim=256, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P2/4 + ) + self.layer_3 = nn.Sequential( + DownSample(in_dim=256, act_type='silu'), + ELANBlock(in_dim=256, out_dim=512, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P3/8 + ) + self.layer_4 = nn.Sequential( + DownSample(in_dim=512, act_type='silu'), + ELANBlock(in_dim=512, out_dim=1024, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P4/16 + ) + self.layer_5 = nn.Sequential( + DownSample(in_dim=1024, act_type='silu'), + ELANBlock(in_dim=1024, out_dim=1024, expand_ratio=0.25, + model_size='large',act_type='silu', depthwise=depthwise) # P5/32 + ) + + + def forward(self, x): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + outputs = { + 'layer2': c3, + 'layer3': c4, + 'layer4': c5 + } + return outputs + + +## build ELAN-Net +def build_elannet(model_name='elannet_large'): + # model + if model_name == 'elannet_large': + backbone = ELANNet_Large() + feat_dims = [512, 1024, 1024] + elif model_name == 'elannet_tiny': + backbone = ELANNet_Tiny() + feat_dims = [128, 256, 512] + + return backbone, feat_dims + + +# ====================== ShuffleNet-v2 ========================== +# ShuffleNet-v2 +def channel_shuffle(x, groups): + # type: (torch.Tensor, int) -> torch.Tensor + batchsize, num_channels, height, width = x.data.size() + channels_per_group = 
num_channels // groups + + # reshape + x = x.view(batchsize, groups, + channels_per_group, height, width) + + x = torch.transpose(x, 1, 2).contiguous() + + # flatten + x = x.view(batchsize, -1, height, width) + + return x + + +class ShuffleV2Block(nn.Module): + def __init__(self, inp, oup, stride): + super(ShuffleV2Block, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + +class ShuffleNetV2(nn.Module): + def __init__(self, + model_size='1.0x', + out_stages=(2, 3, 4), + with_last_conv=False, + kernal_size=3): + super(ShuffleNetV2, self).__init__() + 
print('model size is ', model_size) + + self.stage_repeats = [4, 8, 4] + self.model_size = model_size + self.out_stages = out_stages + self.with_last_conv = with_last_conv + self.kernal_size = kernal_size + if model_size == '0.5x': + self._stage_out_channels = [24, 48, 96, 192] + elif model_size == '1.0x': + self._stage_out_channels = [24, 116, 232, 464] + elif model_size == '1.5x': + self._stage_out_channels = [24, 176, 352, 704] + elif model_size == '2.0x': + self._stage_out_channels = [24, 244, 488, 976] + else: + raise NotImplementedError + + # building first layer + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + stage_names = ['stage{}'.format(i) for i in [2, 3, 4]] + for name, repeats, output_channels in zip( + stage_names, self.stage_repeats, self._stage_out_channels[1:]): + seq = [ShuffleV2Block(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(ShuffleV2Block(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + + self._initialize_weights() + + + def _initialize_weights(self): + print('init weights...') + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'first' in name: + nn.init.normal_(m.weight, 0, 0.01) + else: + nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1]) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 
0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + output = {} + for i in range(2, 5): + stage = getattr(self, 'stage{}'.format(i)) + x = stage(x) + if i in self.out_stages: + output['layer{}'.format(i)] = x + + return output + + +## build ShuffleNet-v2 +def build_shufflenetv2(model_size='1.0x'): + """Constructs a shufflenetv2 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + backbone = ShuffleNetV2(model_size=model_size) + feat_dims = backbone._stage_out_channels[1:] + + return backbone, feat_dims + + +# build backbone +def build_backbone(model_name='elannet_large'): + if model_name in ['elannet_nano', 'elannet_tiny', 'elannet_large', 'elannet_huge']: + return build_elannet(model_name) + + elif model_name in ['shufflenetv2_0.5x', 'shufflenetv2_1.0x']: + return build_shufflenetv2(model_size=model_name[-4:]) + + +if __name__ == '__main__': + import time + model, feats = build_backbone(model_name='shufflenetv2_1.0x') + x = torch.randn(1, 3, 224, 224) + t0 = time.time() + outputs = model(x) + t1 = time.time() + print('Time: ', t1 - t0) + for k in outputs.keys(): + print(outputs[k].shape) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..6cd708b27cb5e628d93096bba31abc4d5a82a325 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py @@ -0,0 +1,164 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_conv2d(c1, c2, k, p, s, d, g, padding_mode='ZERO', bias=False): + if padding_mode == 'ZERO': + conv = 
nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + elif padding_mode == 'SAME': + conv = Conv2dSamePadding(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + + return conv + + +def get_activation(act_type=None): + if act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +def get_norm(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm2d(dim) + elif norm_type == 'GN': + return nn.GroupNorm(num_groups=32, num_channels=dim) + + +# Conv2d with "SAME" padding +class Conv2dSamePadding(nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support "SAME" padding mode and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + + It assumes that norm layer is used before activation. 
+ """ + + # parse padding mode + self.padding_method = kwargs.pop("padding", None) + if self.padding_method is None: + if len(args) >= 5: + self.padding_method = args[4] + else: + self.padding_method = 0 # default padding number + + if isinstance(self.padding_method, str): + if self.padding_method.upper() == "SAME": + # If the padding mode is `SAME`, it will be manually padded + super().__init__(*args, **kwargs, padding=0) + # stride + if isinstance(self.stride, int): + self.stride = [self.stride] * 2 + elif len(self.stride) == 1: + self.stride = [self.stride[0]] * 2 + # kernel size + if isinstance(self.kernel_size, int): + self.kernel_size = [self.kernel_size] * 2 + elif len(self.kernel_size) == 1: + self.kernel_size = [self.kernel_size[0]] * 2 + # dilation + if isinstance(self.dilation, int): + self.dilation = [self.dilation] * 2 + elif len(self.dilation) == 1: + self.dilation = [self.dilation[0]] * 2 + else: + raise ValueError("Unknown padding method: {}".format(self.padding_method)) + else: + super().__init__(*args, **kwargs, padding=self.padding_method) + + def forward(self, x): + if isinstance(self.padding_method, str): + if self.padding_method.upper() == "SAME": + input_h, input_w = x.shape[-2:] + stride_h, stride_w = self.stride + kernel_size_h, kernel_size_w = self.kernel_size + dilation_h, dilation_w = self.dilation + + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + + padding_needed_h = max( + 0, (output_h - 1) * stride_h + (kernel_size_h - 1) * dilation_h + 1 - input_h + ) + padding_needed_w = max( + 0, (output_w - 1) * stride_w + (kernel_size_w - 1) * dilation_w + 1 - input_w + ) + + left = padding_needed_w // 2 + right = padding_needed_w - left + top = padding_needed_h // 2 + bottom = padding_needed_h - top + + x = F.pad(x, [left, right, top, bottom]) + else: + raise ValueError("Unknown padding method: {}".format(self.padding_method)) + + x = super().forward(x) + + return x + + +# Basic conv layer +class 
Conv(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + act_type='', # activation + norm_type='', # normalization + padding_mode='ZERO', # padding mode: "ZERO" or "SAME" + depthwise=False): + super(Conv, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, padding_mode=padding_mode, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm(norm_type, c1)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, padding_mode=padding_mode, bias=add_bias)) + if norm_type: + convs.append(get_norm(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd8a8b254b830195f8c606ce084c0fbfc85c578 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py @@ -0,0 +1,252 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from yolo_free_basic import Conv +except: + from .yolo_free_basic import Conv + + +class ELANBlock(nn.Module): + """ + ELAN BLock of YOLOv7's head + """ + def __init__(self, in_dim, out_dim, fpn_size='large', depthwise=False, act_type='silu', norm_type='BN'): + super(ELANBlock, self).__init__() + if fpn_size == 'tiny' or fpn_size =='nano': + e1, e2 = 0.25, 1.0 + width = 2 + depth = 1 + elif fpn_size == 'large': + e1, e2 = 
class DownSample(nn.Module):
    """2x spatial downsampling with two parallel branches (max-pool + conv,
    and 1x1 conv + stride-2 conv) concatenated along channels, so the
    channel count doubles while H and W halve."""

    def __init__(self, in_dim, depthwise=False, act_type='silu', norm_type='BN'):
        super().__init__()
        inter_dim = in_dim
        # branch 1: 2x2 max-pool then 1x1 conv (channels preserved)
        self.mp = nn.MaxPool2d((2, 2), 2)
        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
        # branch 2: 1x1 conv then stride-2 3x3 conv (channels preserved)
        self.cv2 = nn.Sequential(
            Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
            Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
        )

    def forward(self, x):
        """
        Input:
            x: [B, C, H, W]
        Output:
            out: [B, 2C, H//2, W//2]
        """
        # branch 1: [B, C, H, W] -> [B, C, H//2, W//2]
        # (original comment claimed C//2, but cv1 keeps C channels)
        x1 = self.cv1(self.mp(x))
        # branch 2: [B, C, H, W] -> [B, C, H//2, W//2]
        x2 = self.cv2(x)

        # concat along channels: [B, 2C, H//2, W//2]
        out = torch.cat([x1, x2], dim=1)

        return out
+ depthwise=depthwise, + norm_type=norm_type, + act_type=act_type) + + # P4 -> P5 + if fpn_size == 'large' or fpn_size == 'huge': + self.mp2 = DownSample(int(256 * width), act_type=act_type, + norm_type=norm_type, depthwise=depthwise) + elif fpn_size == 'tiny': + self.mp2 = Conv(int(256 * width), int(512 * width), k=3, p=1, s=2, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + elif fpn_size == 'nano': + self.mp2 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + Conv(int(256 * width), int(512 * width), k=1, act_type=act_type, norm_type=norm_type) + ) + self.head_elan_4 = ELANBlock(in_dim=int(512 * width) + c5, + out_dim=int(512 * width), # 512 + fpn_size=fpn_size, + depthwise=depthwise, + norm_type=norm_type, + act_type=act_type) + + self.head_conv_1 = Conv(int(128 * width), int(256 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + self.head_conv_2 = Conv(int(256 * width), int(512 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + self.head_conv_3 = Conv(int(512 * width), int(1024 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + # output proj layers + if self.out_dim is not None: + self.out_layers = nn.ModuleList([ + Conv(in_dim, self.out_dim, k=1, + norm_type=norm_type, act_type=act_type) + for in_dim in [int(256 * width), int(512 * width), int(1024 * width)] + ]) + + + def forward(self, features): + c3, c4, c5 = features + + # Top down + ## P5 -> P4 + c6 = self.cv1(c5) + c7 = F.interpolate(c6, scale_factor=2.0) + c8 = torch.cat([c7, self.cv2(c4)], dim=1) + c9 = self.head_elan_1(c8) + ## P4 -> P3 + c10 = self.cv3(c9) + c11 = F.interpolate(c10, scale_factor=2.0) + c12 = torch.cat([c11, self.cv4(c3)], dim=1) + c13 = self.head_elan_2(c12) + + # Bottom up + # p3 -> P4 + c14 = self.mp1(c13) + c15 = torch.cat([c14, c9], dim=1) + c16 = self.head_elan_3(c15) + # P4 -> P5 + c17 = self.mp2(c16) + c18 = torch.cat([c17, c5], dim=1) + c19 = self.head_elan_4(c18) 
def build_fpn(cfg, in_dims, out_dim):
    """Build the feature pyramid network described by `cfg`.

    Args:
        cfg (dict): must contain 'fpn' (name) plus the PaFPN-ELAN options
            ('fpn_size', 'fpn_depthwise', 'fpn_norm', 'fpn_act').
        in_dims (list[int]): channel counts of the C3/C4/C5 inputs.
        out_dim (int or None): projection dim for all pyramid levels.

    Returns:
        nn.Module: the FPN network.

    Raises:
        ValueError: unknown cfg['fpn']. (The original fell through and hit
            UnboundLocalError on `fpn_net` instead.)
    """
    model = cfg['fpn']
    print('==============================')
    print('FPN: {}'.format(model))
    # build neck
    if model == 'pafpn_elan':
        fpn_net = PaFPNELAN(in_dims=in_dims,
                            out_dim=out_dim,
                            fpn_size=cfg['fpn_size'],
                            depthwise=cfg['fpn_depthwise'],
                            norm_type=cfg['fpn_norm'],
                            act_type=cfg['fpn_act'])
    else:
        raise ValueError('Unknown FPN: {}'.format(model))

    return fpn_net
# SPP block (plain, non-CSP variant)
class SPPBlock(nn.Module):
    """
    Spatial Pyramid Pooling Block.

    Runs a 1x1-conv branch (cv1) and an SPP branch (cv2) in parallel on the
    input, then fuses the concatenation with a 1x1 conv (cv3).
    `depthwise` is accepted for signature parity with SPPBlockCSP but no
    3x3 conv exists here to apply it to.
    """
    def __init__(self,
                 in_dim,
                 out_dim,
                 expand_ratio=0.5,
                 pooling_size=[5, 9, 13],
                 act_type='lrelu',
                 norm_type='BN',
                 depthwise=False
                 ):
        # FIX: the original called super(SPPBlockCSP, self).__init__(), which
        # raises TypeError here because self is an SPPBlock, not an SPPBlockCSP.
        super(SPPBlock, self).__init__()
        inter_dim = int(in_dim * expand_ratio)
        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
        # FIX: cv2 consumes the raw input (in_dim channels); the original built
        # SPP(inter_dim, ...) which mismatched whenever expand_ratio != 1.0.
        self.cv2 = nn.Sequential(
            SPP(in_dim,
                inter_dim,
                expand_ratio=1.0,
                pooling_size=pooling_size,
                act_type=act_type,
                norm_type=norm_type),
        )
        self.cv3 = Conv(inter_dim * 2, out_dim, k=1, act_type=act_type, norm_type=norm_type)


    def forward(self, x):
        # x: [B, in_dim, H, W] -> y: [B, out_dim, H, W]
        x1 = self.cv1(x)
        x2 = self.cv2(x)
        y = self.cv3(torch.cat([x1, x2], dim=1))

        return y
neck = SPPBlock( + in_dim, out_dim, + expand_ratio=cfg['expand_ratio'], + pooling_size=cfg['pooling_size'], + act_type=cfg['neck_act'], + norm_type=cfg['neck_norm'], + depthwise=cfg['neck_depthwise'] + ) + + elif model == 'spp_block_csp': + neck = SPPBlockCSP( + in_dim, out_dim, + expand_ratio=cfg['expand_ratio'], + pooling_size=cfg['pooling_size'], + act_type=cfg['neck_act'], + norm_type=cfg['neck_norm'], + depthwise=cfg['neck_depthwise'] + ) + + elif model == 'sppf': + neck = SPPF(in_dim, out_dim, k=cfg['pooling_size']) + + + return neck + + +if __name__ == '__main__': + pass diff --git a/models/backbone/backbone_3d/__init__.py b/models/backbone/backbone_3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aa230eb1b1c3c542f9617d600544184bf3e9a02 Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d348ed59c03ca2f80c56eff69fc611b851b62b2c Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc b/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf7c1ab3b275b06a8e6685679b90e3fcd62a893a Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-37.pyc 
class Conv(nn.Module):
    """Conv2d -> BatchNorm -> ReLU, optionally as a depthwise-separable pair.

    Args:
        in_dim (int): input channels.
        out_dim (int): output channels.
        k, p, s (int): kernel size, padding, stride of the spatial conv.
        depthwise (bool): if True, use depthwise kxk conv followed by a
            pointwise 1x1 conv instead of a single dense conv.
    """
    def __init__(self, in_dim, out_dim, k=3, p=1, s=1, depthwise=False):
        super().__init__()
        if depthwise:
            self.convs = nn.Sequential(
                # depthwise kxk conv keeps in_dim channels
                nn.Conv2d(in_dim, in_dim, kernel_size=k, padding=p, stride=s, groups=in_dim, bias=False),
                # FIX: was BatchNorm2d(out_dim) — the tensor still has in_dim
                # channels at this point, which crashed at runtime.
                nn.BatchNorm2d(in_dim),
                nn.ReLU(inplace=True),
                # FIX: pointwise conv must mix all channels; groups=in_dim made
                # it grouped (and failed whenever out_dim % in_dim != 0).
                nn.Conv2d(in_dim, out_dim, kernel_size=1, bias=False),
                nn.BatchNorm2d(out_dim),
                nn.ReLU(inplace=True),
            )
        else:
            self.convs = nn.Sequential(
                nn.Conv2d(in_dim, out_dim, kernel_size=k, padding=p, stride=s, bias=False),
                nn.BatchNorm2d(out_dim),
                nn.ReLU(inplace=True)
            )

    def forward(self, x):
        # x: [B, in_dim, H, W] -> [B, out_dim, H', W']
        return self.convs(x)
def build_3d_cnn(cfg, pretrained=False):
    """Build the 3D CNN backbone named by cfg['backbone_3d'].

    Args:
        cfg (dict): needs 'backbone_3d' (e.g. 'resnet18', 'resnext101',
            'shufflenetv2'); the shufflenet branch also reads 'model_size'.
        pretrained (bool): load pretrained weights if available.

    Returns:
        (nn.Module, int): the backbone and its output feature dim.

    Raises:
        ValueError: unknown backbone name. (The original printed a message
            and called exit(), killing the host process instead of raising
            a catchable error.)
    """
    print('==============================')
    print('3D Backbone: {}'.format(cfg['backbone_3d'].upper()))
    print('--pretrained: {}'.format(pretrained))

    # NOTE: 'resnet' is not a substring of 'resnext', so the order of the
    # first two branches is safe.
    if 'resnet' in cfg['backbone_3d']:
        model, feat_dims = build_resnet_3d(
            model_name=cfg['backbone_3d'],
            pretrained=pretrained
        )
    elif 'resnext' in cfg['backbone_3d']:
        model, feat_dims = build_resnext_3d(
            model_name=cfg['backbone_3d'],
            pretrained=pretrained
        )
    elif 'shufflenetv2' in cfg['backbone_3d']:
        model, feat_dims = build_shufflenetv2_3d(
            model_size=cfg['model_size'],
            pretrained=pretrained
        )
    else:
        raise ValueError('Unknown 3D backbone: {}'.format(cfg['backbone_3d']))

    return model, feat_dims
b/models/backbone/backbone_3d/cnn_3d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbf640180bf32c6ded3f274e56f0a13abe9633fb Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e67e1df865964dc508ce518c217b151fa1b5455 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bf6e68d2684421b980f1e5dcf558f2332f82768 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41d9ac061d5e655de8843cbbb485514d31d590a5 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62609b79498b2b3f6c481c09b6feba58e473fe07 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc differ diff --git 
def downsample_basic_block(x, planes, stride):
    """Type-A residual shortcut: stride x spatially and zero-pad channels.

    avg_pool3d with kernel_size=1 simply strides the tensor; the channel
    dimension is then padded with zeros up to `planes`.

    Args:
        x (Tensor): [B, C, T, H, W] input with C <= planes.
        planes (int): target channel count.
        stride (int): spatial/temporal stride.

    Returns:
        Tensor: [B, planes, T', H', W'] downsampled, zero-padded tensor.

    Note: the original operated on `.data` and re-wrapped the result in the
    deprecated `Variable`, which silently detached type-A shortcuts from the
    autograd graph; it also had a dead cuda() branch. This version stays in
    the graph and allocates the padding on the input's device/dtype directly.
    """
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2),
                            out.size(3), out.size(4),
                            dtype=out.dtype, device=out.device)
    return torch.cat([out, zero_pads], dim=1)
def load_weight(model, arch):
    """Load Kinetics-pretrained weights for `arch` into `model`, in place.

    Weights are fetched from `model_urls[arch]`. Checkpoint keys are
    re-mapped, and any key that is absent from the model or has a
    mismatched shape is dropped, so partial loads proceed silently.

    Args:
        model (nn.Module): the freshly constructed 3D ResNet.
        arch (str): key into `model_urls` ('resnet18', 'resnet50', ...).

    Returns:
        nn.Module: the same `model`, with loaded weights.
    """
    print('Loading pretrained weight ...')
    url = model_urls[arch]
    # check
    if url is None:
        print('No pretrained weight for 3D CNN: {}'.format(arch.upper()))
        return model

    print('Loading 3D backbone pretrained weight: {}'.format(arch.upper()))
    # checkpoint state dict
    checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
    checkpoint_state_dict = checkpoint.pop('state_dict')

    # model state dict
    model_state_dict = model.state_dict()
    # reformat checkpoint_state_dict:
    # k[7:] strips a fixed 7-char prefix — presumably 'module.' from a
    # DataParallel-saved checkpoint; TODO confirm against the hosted files
    new_state_dict = {}
    for k in checkpoint_state_dict.keys():
        v = checkpoint_state_dict[k]
        new_state_dict[k[7:]] = v

    # check: keep only keys that exist in the model with matching shapes
    for k in list(new_state_dict.keys()):
        if k in model_state_dict:
            shape_model = tuple(model_state_dict[k].shape)
            shape_checkpoint = tuple(new_state_dict[k].shape)
            if shape_model != shape_checkpoint:
                new_state_dict.pop(k)
                # print(k)
        else:
            new_state_dict.pop(k)
            # print(k)

    # NOTE(review): load_state_dict defaults to strict=True, so a filtered
    # dict with missing keys would raise here — confirm the checkpoints
    # cover every model key after filtering.
    model.load_state_dict(new_state_dict)

    return model
""" + + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnet50') + + return model + + +def resnet101(pretrained=False, **kwargs): + """Constructs a 3D ResNet-101 model.""" + + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnet101') + + return model + + +# build 3D resnet +def build_resnet_3d(model_name='resnet18', pretrained=False): + if model_name == 'resnet18': + model = resnet18(pretrained=pretrained, shortcut_type='A') + feats = 512 + + elif model_name == 'resnet50': + model = resnet50(pretrained=pretrained, shortcut_type='B') + feats = 2048 + + elif model_name == 'resnet101': + model = resnet101(pretrained=pretrained, shortcut_type='b') + feats = 2048 + + return model, feats + + +if __name__ == '__main__': + import time + model, feats = build_resnet_3d(model_name='resnet18', pretrained=True) + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + model = model.to(device) + + x = torch.randn(1, 3, 16, 64, 64).to(device) + # star time + t0 = time.time() + out = model(x) + print('time', time.time() - t0) + + print(out.shape) diff --git a/models/backbone/backbone_3d/cnn_3d/resnext.py b/models/backbone/backbone_3d/cnn_3d/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..1f0c57081af9139dfb1fef32101d3e386d5989c1 --- /dev/null +++ b/models/backbone/backbone_3d/cnn_3d/resnext.py @@ -0,0 +1,286 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.hub import load_state_dict_from_url +from functools import partial + +__all__ = ['resnext50', 'resnext101', 'resnet152'] + + +model_urls = { + "resnext50": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-50-kinetics.pth", + "resnext101": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-101-kinetics.pth", + 
"resnext152": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-152-kinetics.pth" +} + + + +def downsample_basic_block(x, planes, stride): + out = F.avg_pool3d(x, kernel_size=1, stride=stride) + zero_pads = torch.Tensor( + out.size(0), planes - out.size(1), out.size(2), out.size(3), + out.size(4)).zero_() + + if isinstance(out.data, torch.cuda.FloatTensor): + zero_pads = zero_pads.cuda() + zero_pads = zero_pads.to(out.data.device) + out = Variable(torch.cat([out.data, zero_pads], dim=1)) + + return out + + +class ResNeXtBottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, cardinality, stride=1, + downsample=None): + super(ResNeXtBottleneck, self).__init__() + mid_planes = cardinality * int(planes / 32) + self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm3d(mid_planes) + self.conv2 = nn.Conv3d( + mid_planes, + mid_planes, + kernel_size=3, + stride=stride, + padding=1, + groups=cardinality, + bias=False) + self.bn2 = nn.BatchNorm3d(mid_planes) + self.conv3 = nn.Conv3d( + mid_planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm3d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNeXt(nn.Module): + + def __init__(self, + block, + layers, + shortcut_type='B', + cardinality=32): + self.inplanes = 64 + super(ResNeXt, self).__init__() + self.conv1 = nn.Conv3d( + 3, + 64, + kernel_size=7, + stride=(1, 2, 2), + padding=(3, 3, 3), + bias=False) + self.bn1 = nn.BatchNorm3d(64) + self.relu = nn.ReLU(inplace=True) + + 
self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) + + self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, + cardinality) + + self.layer2 = self._make_layer( + block, 256, layers[1], shortcut_type, cardinality, stride=2) + + self.layer3 = self._make_layer( + block, 512, layers[2], shortcut_type, cardinality, stride=2) + + self.layer4 = self._make_layer( + block, 1024, layers[3], shortcut_type, cardinality, stride=2) + + for m in self.modules(): + if isinstance(m, nn.Conv3d): + m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') + elif isinstance(m, nn.BatchNorm3d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, + block, + planes, + blocks, + shortcut_type, + cardinality, + stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + if shortcut_type == 'A': + downsample = partial( + downsample_basic_block, + planes=planes * block.expansion, + stride=stride) + else: + downsample = nn.Sequential( + nn.Conv3d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm3d(planes * block.expansion)) + + layers = [] + layers.append( + block(self.inplanes, planes, cardinality, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, cardinality)) + + return nn.Sequential(*layers) + + def forward(self, x): + c1 = self.conv1(x) + c1 = self.bn1(c1) + c1 = self.relu(c1) + c2 = self.maxpool(c1) + + c2 = self.layer1(c2) + c3 = self.layer2(c2) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + #fix + #if c5.size(2) > 1: + if c5.size(2) > 1: + c5 = torch.mean(c5, dim=2, keepdim=True) + + return c5.squeeze(2) + + +def load_weight(model, arch): + print('Loading pretrained weight ...') + url = model_urls[arch] + # check + if url is None: + print('No pretrained weight for 3D CNN: {}'.format(arch.upper())) + return model + + print('Loading 3D backbone 
pretrained weight: {}'.format(arch.upper())) + # checkpoint state dict + checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) + checkpoint_state_dict = checkpoint.pop('state_dict') + + # model state dict + model_state_dict = model.state_dict() + # reformat checkpoint_state_dict: + new_state_dict = {} + for k in checkpoint_state_dict.keys(): + v = checkpoint_state_dict[k] + new_state_dict[k[7:]] = v + + # check + for k in list(new_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(new_state_dict[k].shape) + if shape_model != shape_checkpoint: + new_state_dict.pop(k) + # print(k) + else: + new_state_dict.pop(k) + # print(k) + + model.load_state_dict(new_state_dict) + + return model + + +def resnext50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext50') + + return model + + +def resnext101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext101') + + return model + + +def resnext152(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. 
+ """ + model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext152') + + return model + + +# build 3D resnet +def build_resnext_3d(model_name='resnext101', pretrained=True): + if model_name == 'resnext50': + model = resnext50(pretrained=pretrained) + feats = 2048 + + elif model_name == 'resnext101': + model = resnext101(pretrained=pretrained) + feats = 2048 + + elif model_name == 'resnext152': + model = resnext152(pretrained=pretrained) + feats = 2048 + + return model, feats + + +if __name__ == '__main__': + import time + from thop import profile + + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + model, feats = build_resnext_3d(model_name='resnext50', pretrained=False) + model = model.to(device) + + x = torch.randn(1, 3, 32, 256, 256).to(device) + # star time + t0 = time.time() + # inference + outs = model(x) + for y in outs: + print(y.shape) + # end time + print('Inference time: {}'.format(time.time() - t0)) + + # FLOPs & Params + print('==============================') + flops, params = profile(model, inputs=(x, ), verbose=False) + print('==============================') + print('GFLOPs : {:.2f}'.format(flops / 1e9)) + print('Params : {:.2f} M'.format(params / 1e6)) diff --git a/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py b/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..9fa682318f649aac5ec679f949a9fde35a47a3cc --- /dev/null +++ b/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py @@ -0,0 +1,236 @@ +'''ShuffleNetV2 in PyTorch. + +See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. 
+''' + +import torch +import torch.nn as nn +from torch.hub import load_state_dict_from_url + + +__all__ = ['resnext50', 'resnext101', 'resnet152'] + + +model_urls = { + "0.25x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_0.25x_RGB_16_best.pth", + "1.0x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_1.0x_RGB_16_best.pth", + "1.5x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_1.5x_RGB_16_best.pth", + "2.0x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_2.0x_RGB_16_best.pth", +} + + +# basic component +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv3d(inp, oup, kernel_size=3, stride=stride, padding=(1,1,1), bias=False), + nn.BatchNorm3d(oup), + nn.ReLU(inplace=True) + ) + + +def conv_1x1x1_bn(inp, oup): + return nn.Sequential( + nn.Conv3d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup), + nn.ReLU(inplace=True) + ) + + +def channel_shuffle(x, groups): + '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' + batchsize, num_channels, depth, height, width = x.data.size() + channels_per_group = num_channels // groups + # reshape + x = x.view(batchsize, groups, + channels_per_group, depth, height, width) + #permute + x = x.permute(0,2,1,3,4,5).contiguous() + # flatten + x = x.view(batchsize, num_channels, depth, height, width) + return x + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + oup_inc = oup//2 + + if self.stride == 1: + self.banch2 = nn.Sequential( + # pw + nn.Conv3d(oup_inc, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True), + # dw + nn.Conv3d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), + nn.BatchNorm3d(oup_inc), + # pw-linear + nn.Conv3d(oup_inc, 
oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + + else: + self.banch1 = nn.Sequential( + # dw + nn.Conv3d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm3d(inp), + # pw-linear + nn.Conv3d(inp, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + self.banch2 = nn.Sequential( + # pw + nn.Conv3d(inp, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True), + # dw + nn.Conv3d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), + nn.BatchNorm3d(oup_inc), + # pw-linear + nn.Conv3d(oup_inc, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + + def forward(self, x): + if self.stride == 1: + x1 = x[:, :(x.shape[1]//2), :, :, :] + x2 = x[:, (x.shape[1]//2):, :, :, :] + out = self._concat(x1, self.banch2(x2)) + elif self.stride == 2: + out = self._concat(self.banch1(x), self.banch2(x)) + + return channel_shuffle(out, 2) + + +# ShuffleNet-v2 +class ShuffleNetV2(nn.Module): + def __init__(self, width_mult='1.0x', num_classes=600): + super(ShuffleNetV2, self).__init__() + + self.stage_repeats = [4, 8, 4] + # index 0 is invalid and should never be called. + # only used for indexing convenience. 
+ if width_mult == '0.25x': + self.stage_out_channels = [-1, 24, 32, 64, 128] + elif width_mult == '0.5x': + self.stage_out_channels = [-1, 24, 48, 96, 192] + elif width_mult == '1.0x': + self.stage_out_channels = [-1, 24, 116, 232, 464] + elif width_mult == '1.5x': + self.stage_out_channels = [-1, 24, 176, 352, 704] + elif width_mult == '2.0x': + self.stage_out_channels = [-1, 24, 224, 488, 976] + + # building first layer + input_channel = self.stage_out_channels[1] + self.conv1 = conv_bn(3, input_channel, stride=(1,2,2)) + self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) + + self.features = [] + # building inverted residual blocks + for idxstage in range(len(self.stage_repeats)): + numrepeat = self.stage_repeats[idxstage] + output_channel = self.stage_out_channels[idxstage+2] + for i in range(numrepeat): + stride = 2 if i == 0 else 1 + self.features.append(InvertedResidual(input_channel, output_channel, stride)) + input_channel = output_channel + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # # building last several layers + # self.conv_last = conv_1x1x1_bn(input_channel, self.stage_out_channels[-1]) + # self.avgpool = nn.AvgPool3d((2, 1, 1), stride=1) + + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.features(x) + # out = self.conv_last(out) + + if x.size(2) > 1: + x = torch.mean(x, dim=2, keepdim=True) + + return x.squeeze(2) + + +def load_weight(model, arch): + print('Loading pretrained weight ...') + url = model_urls[arch] + # check + if url is None: + print('No pretrained weight for 3D CNN: {}'.format(arch.upper())) + return model + + print('Loading 3D backbone pretrained weight: {}'.format(arch.upper())) + # checkpoint state dict + checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) + checkpoint_state_dict = checkpoint.pop('state_dict') + + # model state dict + model_state_dict = model.state_dict() + # reformat checkpoint_state_dict: + 
new_state_dict = {} + for k in checkpoint_state_dict.keys(): + v = checkpoint_state_dict[k] + new_state_dict[k[7:]] = v + + # check + for k in list(new_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(new_state_dict[k].shape) + if shape_model != shape_checkpoint: + new_state_dict.pop(k) + print(k) + else: + new_state_dict.pop(k) + print(k) + + model.load_state_dict(new_state_dict) + + return model + + +# build 3D shufflenet_v2 +def build_shufflenetv2_3d(model_size='0.25x', pretrained=False): + model = ShuffleNetV2(model_size) + feats = model.stage_out_channels[-1] + + if pretrained: + model = load_weight(model, model_size) + + return model, feats + + +if __name__ == '__main__': + import time + model, feat = build_shufflenetv2_3d(model_size='1.0x', pretrained=True) + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + model = model.to(device) + + # [B, C, T, H, W] + x = torch.randn(1, 3, 16, 64, 64).to(device) + # star time + t0 = time.time() + out = model(x) + print('time', time.time() - t0) + print(out.shape) diff --git a/models/basic/__init__.py b/models/basic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14f866567fda7c53083266f1e7c2a9a74ab9dbb6 --- /dev/null +++ b/models/basic/__init__.py @@ -0,0 +1 @@ +from .conv import Conv2d \ No newline at end of file diff --git a/models/basic/__pycache__/__init__.cpython-310.pyc b/models/basic/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32bb10569406e5bd8d32b3a5e21106d96a376d04 Binary files /dev/null and b/models/basic/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/basic/__pycache__/__init__.cpython-37.pyc b/models/basic/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8ad20d512de7feac15e88d606294367eb341bbe Binary files /dev/null 
and b/models/basic/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/basic/__pycache__/conv.cpython-310.pyc b/models/basic/__pycache__/conv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d4b932f0e09bcc54e879b7f9cfb6906a22a2bb Binary files /dev/null and b/models/basic/__pycache__/conv.cpython-310.pyc differ diff --git a/models/basic/__pycache__/conv.cpython-37.pyc b/models/basic/__pycache__/conv.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5817a8431c782284d6b2878fe7814d0ec8ab033c Binary files /dev/null and b/models/basic/__pycache__/conv.cpython-37.pyc differ diff --git a/models/basic/conv.py b/models/basic/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..00e986cf98b73d96c45be7dd3ab1eafca8fb504f --- /dev/null +++ b/models/basic/conv.py @@ -0,0 +1,127 @@ +import torch.nn as nn + + +def get_activation(act_type=None): + if act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +# 2D Conv +def get_conv2d(c1, c2, k, p, s, d, g, bias=False): + conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + return conv + + +def get_norm2d(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm2d(dim) + elif norm_type == 'IN': + return nn.InstanceNorm2d(dim) + + +class Conv2d(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + g=1, + act_type='', # activation + norm_type='', # normalization + depthwise=False): + super(Conv2d, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + assert c1 == c2, "In depthwise conv, the in_dim (c1) should be equal to out_dim (c2)." 
+ convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=c1, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=g, bias=add_bias)) + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + + +# 3D Conv +def get_conv3d(c1, c2, k, p, s, d, g, bias=False): + conv = nn.Conv3d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + return conv + + +def get_norm3d(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm3d(dim) + elif norm_type == 'IN': + return nn.InstanceNorm3d(dim) + + +class Conv3d(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + g=1, + act_type='', # activation + norm_type='', # normalization + depthwise=False): + super(Conv3d, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + assert c1 == c2, "In depthwise conv, the in_dim (c1) should be equal to out_dim (c2)." 
+ convs.append(get_conv3d(c1, c2, k=k, p=p, s=s, d=d, g=c1, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv3d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv3d(c1, c2, k=k, p=p, s=s, d=d, g=g, bias=add_bias)) + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + diff --git a/models/yowo/__pycache__/build.cpython-310.pyc b/models/yowo/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..060078f0f6beaf4017c163aa10490ad3ae9cf8e5 Binary files /dev/null and b/models/yowo/__pycache__/build.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/build.cpython-37.pyc b/models/yowo/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45973bc52ea3d9e8c590465b737f767174a71077 Binary files /dev/null and b/models/yowo/__pycache__/build.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/encoder.cpython-310.pyc b/models/yowo/__pycache__/encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ca7e31bbb225f01fec41e628bdf79e14aba8db6 Binary files /dev/null and b/models/yowo/__pycache__/encoder.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/encoder.cpython-37.pyc b/models/yowo/__pycache__/encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d47e6ccec51018ba1614119490b047f3410c5f47 Binary files /dev/null and b/models/yowo/__pycache__/encoder.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/head.cpython-310.pyc 
b/models/yowo/__pycache__/head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdfafc9073d217e1f9777be26acfba135d77dc9b Binary files /dev/null and b/models/yowo/__pycache__/head.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/head.cpython-37.pyc b/models/yowo/__pycache__/head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c0883b5046830b2e1acd5d842dbc8e10b5566a4 Binary files /dev/null and b/models/yowo/__pycache__/head.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/loss.cpython-310.pyc b/models/yowo/__pycache__/loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc2651060da37eec464c571bfa58c7d53c086989 Binary files /dev/null and b/models/yowo/__pycache__/loss.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/loss.cpython-37.pyc b/models/yowo/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff793654d79f2f98a581003f79ea34c7e9d24fa9 Binary files /dev/null and b/models/yowo/__pycache__/loss.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/matcher.cpython-310.pyc b/models/yowo/__pycache__/matcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c9b248c3083093b5bb13b8eb1a3a17d903eb039 Binary files /dev/null and b/models/yowo/__pycache__/matcher.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/matcher.cpython-37.pyc b/models/yowo/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8eff850613dab62b38c1d802430f18cf4957f463 Binary files /dev/null and b/models/yowo/__pycache__/matcher.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/yowo.cpython-310.pyc b/models/yowo/__pycache__/yowo.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bea8714905383efa61d3db4233e6379a610dc789 Binary files /dev/null and 
b/models/yowo/__pycache__/yowo.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/yowo.cpython-37.pyc b/models/yowo/__pycache__/yowo.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea61b65f2ed43987ba745df0be27a66424c92d9d Binary files /dev/null and b/models/yowo/__pycache__/yowo.cpython-37.pyc differ diff --git a/models/yowo/build.py b/models/yowo/build.py new file mode 100644 index 0000000000000000000000000000000000000000..03f9a3138cd81fc7879ed83f3173a61194fcb765 --- /dev/null +++ b/models/yowo/build.py @@ -0,0 +1,55 @@ +import torch +from .yowo import YOWO +from .loss import build_criterion + + +# build YOWO detector +def build_yowo(args, + d_cfg, + m_cfg, + device, + num_classes=3, + trainable=False, + resume=None): + print('==============================') + print('Build {} ...'.format(args.version.upper())) + + # build YOWO + model = YOWO( + cfg = m_cfg, + device = device, + num_classes = num_classes, + conf_thresh = 0.15, + nms_thresh = 0.5, + topk = 40, + trainable = trainable, + multi_hot = d_cfg['multi_hot'], + ) + + if trainable: + # Freeze backbone + if args.freeze_backbone_2d: + print('Freeze 2D Backbone ...') + for m in model.backbone_2d.parameters(): + m.requires_grad = False + if args.freeze_backbone_3d: + print('Freeze 3D Backbone ...') + for m in model.backbone_3d.parameters(): + m.requires_grad = False + + # keep training + if resume is not None: + print('keep training: ', resume) + checkpoint = torch.load(resume, map_location='cpu') + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("model") + model.load_state_dict(checkpoint_state_dict) + + # build criterion + criterion = build_criterion( + args, d_cfg['train_size'], num_classes, d_cfg['multi_hot']) + + else: + criterion = None + + return model, criterion diff --git a/models/yowo/encoder.py b/models/yowo/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5924850e733b6ad7b748903a0bca4e2f2172e4a0 --- 
/dev/null +++ b/models/yowo/encoder.py @@ -0,0 +1,147 @@ +import torch +import torch.nn as nn +from ..basic.conv import Conv2d + + +# Channel Self Attetion Module +class CSAM(nn.Module): + """ Channel attention module """ + def __init__(self): + super(CSAM, self).__init__() + self.gamma = nn.Parameter(torch.zeros(1)) + self.softmax = nn.Softmax(dim=-1) + + + def forward(self, x): + """ + inputs : + x : input feature maps( B x C x H x W ) + returns : + out : attention value + input feature + attention: B x C x C + """ + B, C, H, W = x.size() + # query / key / value + query = x.view(B, C, -1) + key = x.view(B, C, -1).permute(0, 2, 1) + value = x.view(B, C, -1) + + # attention matrix + energy = torch.bmm(query, key) + energy_new = torch.max(energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = self.softmax(energy_new) + + # attention + out = torch.bmm(attention, value) + out = out.view(B, C, H, W) + + # output + out = self.gamma * out + x + + return out + + +# Spatial Self Attetion Module +class SSAM(nn.Module): + """ Channel attention module """ + def __init__(self): + super(SSAM, self).__init__() + self.gamma = nn.Parameter(torch.zeros(1)) + self.softmax = nn.Softmax(dim=-1) + + + def forward(self, x): + """ + inputs : + x : input feature maps( B x C x H x W ) + returns : + out : attention value + input feature + attention: B x C x C + """ + B, C, H, W = x.size() + # query / key / value + query = x.view(B, C, -1).permute(0, 2, 1) # [B, N, C] + key = x.view(B, C, -1) # [B, C, N] + value = x.view(B, C, -1).permute(0, 2, 1) # [B, N, C] + + # attention matrix + energy = torch.bmm(query, key) + energy_new = torch.max(energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = self.softmax(energy_new) + + # attention + out = torch.bmm(attention, value) + out = out.permute(0, 2, 1).contiguous().view(B, C, H, W) + + # output + out = self.gamma * out + x + + return out + + +# Channel Encoder +class ChannelEncoder(nn.Module): + def __init__(self, 
in_dim, out_dim, act_type='', norm_type=''): + super().__init__() + self.fuse_convs = nn.Sequential( + Conv2d(in_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + CSAM(), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + nn.Dropout(0.1, inplace=False), + nn.Conv2d(out_dim, out_dim, kernel_size=1) + ) + + def forward(self, x1, x2): + """ + x: [B, C, H, W] + """ + x = torch.cat([x1, x2], dim=1) + # [B, CN, H, W] -> [B, C, H, W] + x = self.fuse_convs(x) + + return x + + +# Spatial Encoder +class SpatialEncoder(nn.Module): + def __init__(self, in_dim, out_dim, act_type='', norm_type=''): + super().__init__() + self.fuse_convs = nn.Sequential( + Conv2d(in_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + SSAM(), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + nn.Dropout(0.1, inplace=False), + nn.Conv2d(out_dim, out_dim, kernel_size=1) + ) + + def forward(self, x): + """ + x: [B, C, H, W] + """ + x = self.fuse_convs(x) + + return x + + +def build_channel_encoder(cfg, in_dim, out_dim): + encoder = ChannelEncoder( + in_dim=in_dim, + out_dim=out_dim, + act_type=cfg['head_act'], + norm_type=cfg['head_norm'] + ) + + return encoder + + +def build_spatial_encoder(cfg, in_dim, out_dim): + encoder = SpatialEncoder( + in_dim=in_dim, + out_dim=out_dim, + act_type=cfg['head_act'], + norm_type=cfg['head_norm'] + ) + + return encoder diff --git a/models/yowo/head.py b/models/yowo/head.py new file mode 100644 index 0000000000000000000000000000000000000000..893a6c64b3ace6f8635fc58fbcca9dd80770018f --- /dev/null +++ b/models/yowo/head.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + +from ..basic.conv import Conv2d + + +class DecoupledHead(nn.Module): + def __init__(self, cfg): + super().__init__() + + 
print('==============================') + print('Head: Decoupled Head') + self.num_cls_heads = cfg['num_cls_heads'] + self.num_reg_heads = cfg['num_reg_heads'] + self.act_type = cfg['head_act'] + self.norm_type = cfg['head_norm'] + self.head_dim = cfg['head_dim'] + self.depthwise = cfg['head_depthwise'] + + self.cls_head = nn.Sequential(*[ + Conv2d(self.head_dim, + self.head_dim, + k=3, p=1, s=1, + act_type=self.act_type, + norm_type=self.norm_type, + depthwise=self.depthwise) + for _ in range(self.num_cls_heads)]) + self.reg_head = nn.Sequential(*[ + Conv2d(self.head_dim, + self.head_dim, + k=3, p=1, s=1, + act_type=self.act_type, + norm_type=self.norm_type, + depthwise=self.depthwise) + for _ in range(self.num_reg_heads)]) + + + def forward(self, cls_feat, reg_feat): + cls_feats = self.cls_head(cls_feat) + reg_feats = self.reg_head(reg_feat) + + return cls_feats, reg_feats + + +def build_head(cfg): + return DecoupledHead(cfg) + \ No newline at end of file diff --git a/models/yowo/loss.py b/models/yowo/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4833008d2d0ca719f90d846156686972c8a00236 --- /dev/null +++ b/models/yowo/loss.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .matcher import SimOTA +from utils.box_ops import get_ious +from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized + + +class SigmoidFocalLoss(object): + def __init__(self, alpha=0.25, gamma=2.0, reduction='none'): + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + + def __call__(self, logits, targets): + p = torch.sigmoid(logits) + ce_loss = F.binary_cross_entropy_with_logits(input=logits, + target=targets, + reduction="none") + p_t = p * targets + (1.0 - p) * (1.0 - targets) + loss = ce_loss * ((1.0 - p_t) ** self.gamma) + + if self.alpha >= 0: + alpha_t = self.alpha * targets + (1.0 - self.alpha) * (1.0 - targets) + loss = alpha_t * loss + + if self.reduction == 
"mean": + loss = loss.mean() + + elif self.reduction == "sum": + loss = loss.sum() + + return loss + + +class Criterion(object): + def __init__(self, args, img_size, num_classes=3, multi_hot=False): + self.num_classes = num_classes + self.img_size = img_size + self.loss_conf_weight = args.loss_conf_weight + self.loss_cls_weight = args.loss_cls_weight + self.loss_reg_weight = args.loss_reg_weight + self.focal_loss = args.focal_loss + self.multi_hot = multi_hot + + # loss + self.obj_lossf = nn.BCEWithLogitsLoss(reduction='none') + self.cls_lossf = nn.BCEWithLogitsLoss(reduction='none') + + # matcher + self.matcher = SimOTA( + num_classes=num_classes, + center_sampling_radius=args.center_sampling_radius, + topk_candidate=args.topk_candicate + ) + + def __call__(self, outputs, targets): + """ + outputs['pred_conf']: List(Tensor) [B, M, 1] + outputs['pred_cls']: List(Tensor) [B, M, C] + outputs['pred_box']: List(Tensor) [B, M, 4] + outputs['strides']: List(Int) [8, 16, 32] output stride + targets: (List) [dict{'boxes': [...], + 'labels': [...], + 'orig_size': ...}, ...] 
+ """ + bs = outputs['pred_cls'][0].shape[0] + device = outputs['pred_cls'][0].device + fpn_strides = outputs['strides'] + anchors = outputs['anchors'] + # preds: [B, M, C] + conf_preds = torch.cat(outputs['pred_conf'], dim=1) + cls_preds = torch.cat(outputs['pred_cls'], dim=1) + box_preds = torch.cat(outputs['pred_box'], dim=1) + + # label assignment + cls_targets = [] + box_targets = [] + conf_targets = [] + fg_masks = [] + + for batch_idx in range(bs): + tgt_labels = targets[batch_idx]["labels"].to(device) + tgt_bboxes = targets[batch_idx]["boxes"].to(device) + + # denormalize tgt_bbox + tgt_bboxes *= self.img_size + + # check target + if len(tgt_labels) == 0 or tgt_bboxes.max().item() == 0.: + num_anchors = sum([ab.shape[0] for ab in anchors]) + # There is no valid gt + cls_target = conf_preds.new_zeros((0, self.num_classes)) + box_target = conf_preds.new_zeros((0, 4)) + conf_target = conf_preds.new_zeros((num_anchors, 1)) + fg_mask = conf_preds.new_zeros(num_anchors).bool() + else: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.matcher( + fpn_strides = fpn_strides, + anchors = anchors, + pred_conf = conf_preds[batch_idx], + pred_cls = cls_preds[batch_idx], + pred_box = box_preds[batch_idx], + tgt_labels = tgt_labels, + tgt_bboxes = tgt_bboxes, + ) + + conf_target = fg_mask.unsqueeze(-1) + box_target = tgt_bboxes[matched_gt_inds] + if self.multi_hot: + cls_target = gt_matched_classes.float() + else: + cls_target = F.one_hot(gt_matched_classes.long(), self.num_classes) + cls_target = cls_target * pred_ious_this_matching.unsqueeze(-1) + + cls_targets.append(cls_target) + box_targets.append(box_target) + conf_targets.append(conf_target) + fg_masks.append(fg_mask) + + cls_targets = torch.cat(cls_targets, 0) + box_targets = torch.cat(box_targets, 0) + conf_targets = torch.cat(conf_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + num_foregrounds = fg_masks.sum() + + if is_dist_avail_and_initialized(): + 
torch.distributed.all_reduce(num_foregrounds) + num_foregrounds = (num_foregrounds / get_world_size()).clamp(1.0) + + # conf loss + loss_conf = self.obj_lossf(conf_preds.view(-1, 1), conf_targets.float()) + loss_conf = loss_conf.sum() / num_foregrounds + + # cls loss + matched_cls_preds = cls_preds.view(-1, self.num_classes)[fg_masks] + loss_cls = self.cls_lossf(matched_cls_preds, cls_targets) + loss_cls = loss_cls.sum() / num_foregrounds + + # box loss + matched_box_preds = box_preds.view(-1, 4)[fg_masks] + ious = get_ious(matched_box_preds, + box_targets, + box_mode="xyxy", + iou_type='giou') + loss_box = (1.0 - ious).sum() / num_foregrounds + + # total loss + losses = self.loss_conf_weight * loss_conf + \ + self.loss_cls_weight * loss_cls + \ + self.loss_reg_weight * loss_box + + loss_dict = dict( + loss_conf = loss_conf, + loss_cls = loss_cls, + loss_box = loss_box, + losses = losses + ) + + return loss_dict + + +def build_criterion(args, img_size, num_classes, multi_hot=False): + criterion = Criterion(args, img_size, num_classes, multi_hot) + + return criterion + \ No newline at end of file diff --git a/models/yowo/matcher.py b/models/yowo/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2b6fcf4efae99be7833c6c0958f55d5f52dfaf --- /dev/null +++ b/models/yowo/matcher.py @@ -0,0 +1,201 @@ +import torch +import torch.nn.functional as F +from utils.box_ops import * + + + +# SimOTA +class SimOTA(object): + def __init__(self, num_classes, center_sampling_radius, topk_candidate): + self.num_classes = num_classes + self.center_sampling_radius = center_sampling_radius + self.topk_candidate = topk_candidate + + + @torch.no_grad() + def __call__(self, + fpn_strides, + anchors, + pred_conf, + pred_cls, + pred_box, + tgt_labels, + tgt_bboxes): + # [M,] + strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i + for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1) + # List[F, M, 2] -> [M, 2] + anchors = torch.cat(anchors, 
# SimOTA label assigner (introduced in YOLOX): each GT box is dynamically
# matched to a small set of anchors with the lowest assignment cost, where
# cost = classification BCE + 3 * IoU loss, restricted to anchors that lie
# inside the GT box and/or a fixed-radius window around its center.
class SimOTA(object):
    def __init__(self, num_classes, center_sampling_radius, topk_candidate):
        # number of classes C (used to one-hot integer labels)
        self.num_classes = num_classes
        # half-size, in units of stride, of the fixed center-sampling window
        self.center_sampling_radius = center_sampling_radius
        # number of highest-IoU candidates whose IoUs are summed to derive
        # each GT's dynamic k (number of anchors it gets assigned)
        self.topk_candidate = topk_candidate


    @torch.no_grad()
    def __call__(self,
                 fpn_strides,
                 anchors,
                 pred_conf,
                 pred_cls,
                 pred_box,
                 tgt_labels,
                 tgt_bboxes):
        """Assign targets to anchors for ONE image.

        fpn_strides: List(Int), stride per FPN level.
        anchors:     List(Tensor) per-level anchor centers [Mi, 2].
        pred_conf/pred_cls/pred_box: per-anchor predictions [M, 1/C/4].
        tgt_labels:  [N] integer labels or [N, C] multi-hot rows.
        tgt_bboxes:  [N, 4] xyxy boxes in input-image pixels.
        """
        # [M,] — stride of every anchor, concatenated over FPN levels
        strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i
                             for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1)
        # List[F, M, 2] -> [M, 2]
        anchors = torch.cat(anchors, dim=0)
        num_anchor = anchors.shape[0]
        num_gt = len(tgt_labels)

        # positive candidates: anchors inside any GT box or center window
        fg_mask, is_in_boxes_and_center = \
            self.get_in_boxes_info(
                tgt_bboxes,
                anchors,
                strides,
                num_anchor,
                num_gt
                )

        conf_preds_ = pred_conf[fg_mask]   # [Mp, 1]
        cls_preds_ = pred_cls[fg_mask]     # [Mp, C]
        box_preds_ = pred_box[fg_mask]     # [Mp, 4]
        num_in_boxes_anchor = box_preds_.shape[0]

        # [N, Mp] pairwise IoU between GT boxes and candidate predictions
        pair_wise_ious, _ = box_iou(tgt_bboxes, box_preds_)
        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

        # 1-D integer labels -> one-hot; 2-D labels are already multi-hot
        if len(tgt_labels.shape) == 1:
            gt_cls = F.one_hot(tgt_labels.long(), self.num_classes)
        elif len(tgt_labels.shape) == 2:
            gt_cls = tgt_labels

        # [N, C] -> [N, Mp, C]
        gt_cls = gt_cls.float().unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)

        # classification cost: BCE between the joint score
        # sqrt(sigmoid(cls) * sigmoid(conf)) and GT classes, forced to fp32
        # because binary_cross_entropy is unsafe under autocast
        with torch.cuda.amp.autocast(enabled=False):
            score_preds_ = torch.sqrt(
                cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
                * conf_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            )  # [N, Mp, C]
            pair_wise_cls_loss = F.binary_cross_entropy(
                score_preds_, gt_cls, reduction="none"
            ).sum(-1)  # [N, Mp]
            del score_preds_

        # total assignment cost; the huge constant effectively bans anchors
        # that are not inside BOTH the GT box and its center window
        cost = (
            pair_wise_cls_loss
            + 3.0 * pair_wise_ious_loss
            + 100000.0 * (~is_in_boxes_and_center)
        )  # [N, Mp]

        (
            num_fg,
            gt_matched_classes,       # [num_fg,]
            pred_ious_this_matching,  # [num_fg,]
            matched_gt_inds,          # [num_fg,]
        ) = self.dynamic_k_matching(
            cost,
            pair_wise_ious,
            tgt_labels,
            num_gt,
            fg_mask
            )
        del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss

        return (
            gt_matched_classes,
            fg_mask,
            pred_ious_this_matching,
            matched_gt_inds,
            num_fg,
        )


    def get_in_boxes_info(
        self,
        gt_bboxes,    # [N, 4]
        anchors,      # [M, 2]
        strides,      # [M,]
        num_anchors,  # M
        num_gt,       # N
    ):
        """Return (anchors inside any GT box OR center window [M,],
        anchors inside box AND window per GT, restricted to the former)."""
        # anchor centers
        x_centers = anchors[:, 0]
        y_centers = anchors[:, 1]

        # [M,] -> [1, M] -> [N, M]
        x_centers = x_centers.unsqueeze(0).repeat(num_gt, 1)
        y_centers = y_centers.unsqueeze(0).repeat(num_gt, 1)

        # [N,] -> [N, 1] -> [N, M]
        gt_bboxes_l = gt_bboxes[:, 0].unsqueeze(1).repeat(1, num_anchors)  # x1
        gt_bboxes_t = gt_bboxes[:, 1].unsqueeze(1).repeat(1, num_anchors)  # y1
        gt_bboxes_r = gt_bboxes[:, 2].unsqueeze(1).repeat(1, num_anchors)  # x2
        gt_bboxes_b = gt_bboxes[:, 3].unsqueeze(1).repeat(1, num_anchors)  # y2

        # signed distances from each anchor center to each GT box side;
        # all four positive <=> the center lies strictly inside the box
        b_l = x_centers - gt_bboxes_l
        b_r = gt_bboxes_r - x_centers
        b_t = y_centers - gt_bboxes_t
        b_b = gt_bboxes_b - y_centers
        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)

        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0
        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0
        # in fixed center window
        center_radius = self.center_sampling_radius

        # [N, 2] GT centers
        gt_centers = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) * 0.5

        # [1, M] — radius scales with each anchor's stride
        center_radius_ = center_radius * strides.unsqueeze(0)

        gt_bboxes_l = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) - center_radius_  # x1
        gt_bboxes_t = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) - center_radius_  # y1
        gt_bboxes_r = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) + center_radius_  # x2
        gt_bboxes_b = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) + center_radius_  # y2

        c_l = x_centers - gt_bboxes_l
        c_r = gt_bboxes_r - x_centers
        c_t = y_centers - gt_bboxes_t
        c_b = gt_bboxes_b - y_centers
        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
        is_in_centers = center_deltas.min(dim=-1).values > 0.0
        is_in_centers_all = is_in_centers.sum(dim=0) > 0

        # candidate = in at least one GT box OR one center window
        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all

        # among the candidates, which (gt, anchor) pairs satisfy BOTH tests
        is_in_boxes_and_center = (
            is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
        )
        return is_in_boxes_anchor, is_in_boxes_and_center


    def dynamic_k_matching(
        self,
        cost,
        pair_wise_ious,
        gt_classes,
        num_gt,
        fg_mask
    ):
        """Pick, for each GT, its k lowest-cost anchors (k = clamped sum of
        its top IoUs); resolve anchors claimed by several GTs by min cost.

        NOTE(review): `pos_idx` in the `del` below is unbound when num_gt == 0;
        callers appear to filter empty GT before reaching here — confirm.
        """
        # Dynamic K
        # ---------------------------------------------------------------
        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)

        ious_in_boxes_matrix = pair_wise_ious
        n_candidate_k = min(self.topk_candidate, ious_in_boxes_matrix.size(1))
        topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
        # k for each GT = sum of its top candidate IoUs, at least 1
        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
        dynamic_ks = dynamic_ks.tolist()
        for gt_idx in range(num_gt):
            _, pos_idx = torch.topk(
                cost[gt_idx], k=dynamic_ks[gt_idx], largest=False
            )
            matching_matrix[gt_idx][pos_idx] = 1

        del topk_ious, dynamic_ks, pos_idx

        # an anchor claimed by more than one GT keeps only its cheapest GT
        anchor_matching_gt = matching_matrix.sum(0)
        if (anchor_matching_gt > 1).sum() > 0:
            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
            matching_matrix[:, anchor_matching_gt > 1] *= 0
            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1
        fg_mask_inboxes = matching_matrix.sum(0) > 0
        num_fg = fg_mask_inboxes.sum().item()

        # shrink the global fg mask to the finally-matched anchors
        fg_mask[fg_mask.clone()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
        gt_matched_classes = gt_classes[matched_gt_inds]

        pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[
            fg_mask_inboxes
        ]
        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
# You Only Watch Once (v2)
class YOWO(nn.Module):
    """Single-stage spatio-temporal action detector.

    Fuses per-level 2D key-frame features with upsampled 3D clip features,
    then predicts per-anchor confidence, class scores and box offsets.
    Training mode (`trainable=True`) returns raw predictions for the loss;
    eval mode runs full decoding + NMS via `inference`.

    Changes vs. the original block: dead commented-out multiprocessing /
    threading / timing experiments removed; the one-hot inference branch now
    iterates `range(B)` instead of the leftover loop variable
    `conf_pred.size(0)` (equal only by coincidence of batch dims).
    """

    def __init__(self,
                 cfg,
                 device,
                 num_classes=3,
                 conf_thresh=0.05,
                 nms_thresh=0.6,
                 topk=40,
                 trainable=False,
                 multi_hot=False):
        super(YOWO, self).__init__()
        self.cfg = cfg
        self.device = device
        self.stride = cfg['stride']      # FPN output strides, e.g. [8, 16, 32]
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh   # score threshold at inference
        self.nms_thresh = nms_thresh     # IoU threshold for NMS
        self.topk = topk                 # detections kept per level before NMS
        self.multi_hot = multi_hot       # multi-label (e.g. AVA) vs. one-hot

        # ------------------ Network ---------------------
        ## 2D backbone: appearance features of the key frame, one map per level
        self.backbone_2d, bk_dim_2d = build_backbone_2d(
            cfg, pretrained=cfg['pretrained_2d'] and trainable)

        ## 3D backbone: motion features of the whole clip (single map)
        self.backbone_3d, bk_dim_3d = build_backbone_3d(
            cfg, pretrained=cfg['pretrained_3d'] and trainable)

        ## per-level channel encoders fusing 2D + 3D features (cls branch)
        self.cls_channel_encoders = nn.ModuleList(
            [build_channel_encoder(cfg, bk_dim_2d[i] + bk_dim_3d, cfg['head_dim'])
             for i in range(len(cfg['stride']))])

        ## per-level channel encoders (reg branch)
        self.reg_channel_encoders = nn.ModuleList(
            [build_channel_encoder(cfg, bk_dim_2d[i] + bk_dim_3d, cfg['head_dim'])
             for i in range(len(cfg['stride']))])

        ## decoupled detection heads, one per level
        self.heads = nn.ModuleList(
            [build_head(cfg) for _ in range(len(cfg['stride']))])

        ## 1x1 prediction convolutions
        head_dim = cfg['head_dim']
        self.conf_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, 1, kernel_size=1)
             for _ in range(len(cfg['stride']))])
        self.cls_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, self.num_classes, kernel_size=1)
             for _ in range(len(cfg['stride']))])
        self.reg_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, 4, kernel_size=1)
             for _ in range(len(cfg['stride']))])

        # init weights / biases
        self.init_yowo()


    def init_yowo(self):
        """Initialize BatchNorm hyper-parameters and prediction biases."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eps = 1e-3
                m.momentum = 0.03

        # Focal-loss style prior: bias conf/cls logits so the initial
        # foreground probability is ~0.01, stabilizing early training.
        init_prob = 0.01
        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
        for conf_pred in self.conf_preds:
            b = conf_pred.bias.view(1, -1)
            b.data.fill_(bias_value.item())
            conf_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
        for cls_pred in self.cls_preds:
            b = cls_pred.bias.view(1, -1)
            b.data.fill_(bias_value.item())
            cls_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)


    def generate_anchors(self, fmp_size, stride):
        """Return anchor-point centers [H*W, 2] in input-image pixels.

        fmp_size: (List) [H, W] feature-map size at this level.
        """
        fmp_h, fmp_w = fmp_size
        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
        # [H, W, 2] -> [HW, 2]; +0.5 centers the point inside each cell
        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
        anchor_xy *= stride
        anchors = anchor_xy.to(self.device)

        return anchors


    def decode_boxes(self, anchors, pred_reg, stride):
        """Decode regression offsets into absolute xyxy boxes.

        anchors:  [M, 2] (or broadcastable [1, M, 2]) anchor centers.
        pred_reg: [..., M, 4] = (dx, dy, log-w, log-h) offsets.
        """
        # center of bbox
        pred_ctr_xy = anchors + pred_reg[..., :2] * stride
        # size of bbox (exp keeps width/height positive)
        pred_box_wh = pred_reg[..., 2:].exp() * stride

        pred_x1y1 = pred_ctr_xy - 0.5 * pred_box_wh
        pred_x2y2 = pred_ctr_xy + 0.5 * pred_box_wh
        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)

        return pred_box


    def post_process_one_hot(self, conf_preds, cls_preds, reg_preds, anchors):
        """Single-label post-processing for one sample over all levels.

        Inputs are per-level lists: conf [HW, 1], cls [HW, C], reg [HW, 4].
        Returns (scores, labels, bboxes) numpy arrays after class-aware NMS.
        """
        all_scores = []
        all_labels = []
        all_bboxes = []

        for level, (conf_pred_i, cls_pred_i, reg_pred_i, anchors_i) in enumerate(
                zip(conf_preds, cls_preds, reg_preds, anchors)):
            # joint score sqrt(conf * cls), flattened to (H*W*C,)
            scores_i = (torch.sqrt(conf_pred_i.sigmoid() * cls_pred_i.sigmoid())).flatten()

            # keep top-k scoring candidates only
            num_topk = min(self.topk, reg_pred_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = scores_i.sort(descending=True)
            topk_scores = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # drop proposals below the confidence threshold
            keep_idxs = topk_scores > self.conf_thresh
            scores = topk_scores[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            # flat index -> (anchor index, class label)
            anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
            labels = topk_idxs % self.num_classes

            reg_pred_i = reg_pred_i[anchor_idxs]
            anchors_i = anchors_i[anchor_idxs]

            # decode box: [M, 4]
            bboxes = self.decode_boxes(anchors_i, reg_pred_i, self.stride[level])

            all_scores.append(scores)
            all_labels.append(labels)
            all_bboxes.append(bboxes)

        scores = torch.cat(all_scores)
        labels = torch.cat(all_labels)
        bboxes = torch.cat(all_bboxes)

        # to cpu
        scores = scores.cpu().numpy()
        labels = labels.cpu().numpy()
        bboxes = bboxes.cpu().numpy()

        # class-aware NMS
        scores, labels, bboxes = multiclass_nms(
            scores, labels, bboxes, self.nms_thresh, self.num_classes, False)

        return scores, labels, bboxes


    def post_process_multi_hot(self, conf_preds, cls_preds, reg_preds, anchors):
        """Multi-label post-processing for one sample over all levels.

        Returns an [M, 4 + 1 + C] ndarray: box, confidence, per-class scores.
        """
        all_conf_preds = []
        all_cls_preds = []
        all_box_preds = []
        for level, (conf_pred_i, cls_pred_i, reg_pred_i, anchors_i) in enumerate(
                zip(conf_preds, cls_preds, reg_preds, anchors)):
            # decode box
            box_pred_i = self.decode_boxes(anchors_i, reg_pred_i, self.stride[level])

            conf_pred_i = torch.sigmoid(conf_pred_i.squeeze(-1))  # [M,]
            cls_pred_i = torch.sigmoid(cls_pred_i)                # [M, C]

            # keep the top-k most confident anchors, then threshold
            topk_conf_pred_i, topk_inds = torch.topk(conf_pred_i, self.topk)
            topk_cls_pred_i = cls_pred_i[topk_inds]
            topk_box_pred_i = box_pred_i[topk_inds]

            keep = topk_conf_pred_i.gt(self.conf_thresh)
            all_conf_preds.append(topk_conf_pred_i[keep])
            all_cls_preds.append(topk_cls_pred_i[keep])
            all_box_preds.append(topk_box_pred_i[keep])

        # concatenate across levels
        conf_preds = torch.cat(all_conf_preds, dim=0)  # [M,]
        cls_preds = torch.cat(all_cls_preds, dim=0)    # [M, C]
        box_preds = torch.cat(all_box_preds, dim=0)    # [M, 4]

        # to cpu - numpy
        scores = conf_preds.cpu().numpy()
        labels = cls_preds.cpu().numpy()
        bboxes = box_preds.cpu().numpy()

        # nms
        scores, labels, bboxes = multiclass_nms(
            scores, labels, bboxes, self.nms_thresh, self.num_classes, True)

        # [M, 5 + C]
        out_boxes = np.concatenate([bboxes, scores[..., None], labels], axis=-1)

        return out_boxes


    @torch.no_grad()
    def inference(self, video_clips):
        """Detect actions in a batch of clips.

        video_clips: (Tensor) [B, 3, T, H, W].
        Returns per-image detections (format depends on `multi_hot`), with
        boxes normalized to [0, 1] by the longer image side.
        """
        B, _, _, img_h, img_w = video_clips.shape

        # key frame = last frame of the clip
        key_frame = video_clips[:, :, -1, :, :]

        # backbones
        feat_3d = self.backbone_3d(video_clips)
        cls_feats, reg_feats = self.backbone_2d(key_frame)

        # non-shared heads
        all_conf_preds = []
        all_cls_preds = []
        all_reg_preds = []
        all_anchors = []
        for level, (cls_feat, reg_feat) in enumerate(zip(cls_feats, reg_feats)):
            # upsample 3D features to this level's spatial resolution
            feat_3d_up = F.interpolate(feat_3d, scale_factor=2 ** (2 - level))

            # fuse 2D + 3D features
            cls_feat = self.cls_channel_encoders[level](cls_feat, feat_3d_up)
            reg_feat = self.reg_channel_encoders[level](reg_feat, feat_3d_up)

            # head
            cls_feat, reg_feat = self.heads[level](cls_feat, reg_feat)

            # pred
            conf_pred = self.conf_preds[level](reg_feat)
            cls_pred = self.cls_preds[level](cls_feat)
            reg_pred = self.reg_preds[level](reg_feat)

            # generate anchors for this level's feature-map size
            fmp_size = conf_pred.shape[-2:]
            anchors = self.generate_anchors(fmp_size, self.stride[level])

            # [B, C, H, W] -> [B, H, W, C] -> [B, M, C], M = H*W
            conf_pred = conf_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)

            all_conf_preds.append(conf_pred)
            all_cls_preds.append(cls_pred)
            all_reg_preds.append(reg_pred)
            all_anchors.append(anchors)

        # per-image post-processing
        if self.multi_hot:
            batch_bboxes = []
            for batch_idx in range(B):
                # [B, M, C] -> [M, C] per level
                cur_conf_preds = [p[batch_idx] for p in all_conf_preds]
                cur_cls_preds = [p[batch_idx] for p in all_cls_preds]
                cur_reg_preds = [p[batch_idx] for p in all_reg_preds]

                out_boxes = self.post_process_multi_hot(
                    cur_conf_preds, cur_cls_preds, cur_reg_preds, all_anchors)

                # normalize boxes to [0, 1] by the longer image side
                out_boxes[..., :4] /= max(img_h, img_w)
                out_boxes[..., :4] = out_boxes[..., :4].clip(0., 1.)

                batch_bboxes.append(out_boxes)

            return batch_bboxes
        else:
            batch_scores = []
            batch_labels = []
            batch_bboxes = []
            # fixed: iterate over B (was range(conf_pred.size(0)), a leftover
            # per-level loop variable that only matched B by coincidence)
            for batch_idx in range(B):
                # [B, M, C] -> [M, C] per level
                cur_conf_preds = [p[batch_idx] for p in all_conf_preds]
                cur_cls_preds = [p[batch_idx] for p in all_cls_preds]
                cur_reg_preds = [p[batch_idx] for p in all_reg_preds]

                scores, labels, bboxes = self.post_process_one_hot(
                    cur_conf_preds, cur_cls_preds, cur_reg_preds, all_anchors)

                # normalize bbox
                bboxes /= max(img_h, img_w)
                bboxes = bboxes.clip(0., 1.)

                batch_scores.append(scores)
                batch_labels.append(labels)
                batch_bboxes.append(bboxes)

            return batch_scores, batch_labels, batch_bboxes


    def forward(self, video_clips):
        """
        Input:
            video_clips: (Tensor) -> [B, 3, T, H, W].
        return (training mode):
            outputs: (Dict) -> {
                'pred_conf': List(Tensor) [B, M, 1]
                'pred_cls':  List(Tensor) [B, M, C]
                'pred_box':  List(Tensor) [B, M, 4]
                'anchors':   List(Tensor) [M, 2]
                'strides':   List(Int)
            }
        In eval mode this delegates to `inference`.
        """
        if not self.trainable:
            return self.inference(video_clips)

        # key frame
        key_frame = video_clips[:, :, -1, :, :]

        # backbones
        feat_3d = self.backbone_3d(video_clips)
        cls_feats, reg_feats = self.backbone_2d(key_frame)

        # non-shared heads
        all_conf_preds = []
        all_cls_preds = []
        all_box_preds = []
        all_anchors = []
        for level, (cls_feat, reg_feat) in enumerate(zip(cls_feats, reg_feats)):
            # upsample 3D features to this level's spatial resolution
            feat_3d_up = F.interpolate(feat_3d, scale_factor=2 ** (2 - level))

            # fuse 2D + 3D features
            cls_feat = self.cls_channel_encoders[level](cls_feat, feat_3d_up)
            reg_feat = self.reg_channel_encoders[level](reg_feat, feat_3d_up)

            # head
            cls_feat, reg_feat = self.heads[level](cls_feat, reg_feat)

            # pred
            conf_pred = self.conf_preds[level](reg_feat)
            cls_pred = self.cls_preds[level](cls_feat)
            reg_pred = self.reg_preds[level](reg_feat)

            # generate anchors
            fmp_size = conf_pred.shape[-2:]
            anchors = self.generate_anchors(fmp_size, self.stride[level])

            # [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
            conf_pred = conf_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)

            # decode box: [B, M, 4]
            box_pred = self.decode_boxes(anchors, reg_pred, self.stride[level])

            all_conf_preds.append(conf_pred)
            all_cls_preds.append(cls_pred)
            all_box_preds.append(box_pred)
            all_anchors.append(anchors)

        # output dict
        outputs = {"pred_conf": all_conf_preds,  # List(Tensor) [B, M, 1]
                   "pred_cls": all_cls_preds,    # List(Tensor) [B, M, C]
                   "pred_box": all_box_preds,    # List(Tensor) [B, M, 4]
                   "anchors": all_anchors,       # List(Tensor) [M, 2]
                   "strides": self.stride}       # List(Int)

        return outputs
def get_ious(bboxes1,
             bboxes2,
             box_mode="xyxy",
             iou_type="iou"):
    """Compute element-wise IoU/GIoU between two aligned box tensors.

    Args:
        bboxes1 (Tensor): [..., 4] boxes.
        bboxes2 (Tensor): [..., 4] boxes, same shape as `bboxes1`.
        box_mode (str): 'xyxy' (x1, y1, x2, y2) or 'ltrb' (distances from an
            anchor point to the left/top/right/bottom sides, converted to
            xyxy internally).
        iou_type (str): 'iou' or 'giou'.

    Returns:
        Tensor: [...] IoU (or GIoU) per box pair.

    Raises:
        NotImplementedError: on an unsupported `box_mode` or `iou_type`.
    """
    if box_mode == "ltrb":
        # (l, t, r, b) side distances -> (x1, y1, x2, y2) around the origin
        bboxes1 = torch.cat((-bboxes1[..., :2], bboxes1[..., 2:]), dim=-1)
        bboxes2 = torch.cat((-bboxes2[..., :2], bboxes2[..., 2:]), dim=-1)
    elif box_mode != "xyxy":
        raise NotImplementedError

    # epsilon keeps the union/enclosing-area denominators non-zero
    eps = torch.finfo(torch.float32).eps

    bboxes1_area = (bboxes1[..., 2] - bboxes1[..., 0]).clamp_(min=0) \
        * (bboxes1[..., 3] - bboxes1[..., 1]).clamp_(min=0)
    bboxes2_area = (bboxes2[..., 2] - bboxes2[..., 0]).clamp_(min=0) \
        * (bboxes2[..., 3] - bboxes2[..., 1]).clamp_(min=0)

    w_intersect = (torch.min(bboxes1[..., 2], bboxes2[..., 2])
                   - torch.max(bboxes1[..., 0], bboxes2[..., 0])).clamp_(min=0)
    h_intersect = (torch.min(bboxes1[..., 3], bboxes2[..., 3])
                   - torch.max(bboxes1[..., 1], bboxes2[..., 1])).clamp_(min=0)

    area_intersect = w_intersect * h_intersect
    area_union = bboxes2_area + bboxes1_area - area_intersect
    ious = area_intersect / area_union.clamp(min=eps)

    if iou_type == "iou":
        return ious
    elif iou_type == "giou":
        # GIoU = IoU - (enclosing_area - union) / enclosing_area
        g_w_intersect = torch.max(bboxes1[..., 2], bboxes2[..., 2]) \
            - torch.min(bboxes1[..., 0], bboxes2[..., 0])
        g_h_intersect = torch.max(bboxes1[..., 3], bboxes2[..., 3]) \
            - torch.min(bboxes1[..., 1], bboxes2[..., 1])
        ac_union = g_w_intersect * g_h_intersect
        gious = ious - (ac_union - area_union) / ac_union.clamp(min=eps)
        return gious
    else:
        raise NotImplementedError


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two xyxy box sets.

    Args:
        boxes1 (Tensor): [N, 4], boxes2 (Tensor): [M, 4].
    Returns:
        (iou [N, M], union [N, M]).
    """
    # (x2 - x1) * (y2 - y1); inlined so this helper does not require
    # torchvision's box_area for a two-line computation
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union
def rescale_bboxes(bboxes, orig_size):
    """Map normalized [0, 1] xyxy boxes to absolute pixel coordinates.

    Mutates `bboxes` in place (x columns scaled by the original width,
    y columns by the original height, each clipped to the image) and
    returns it.
    """
    width, height = orig_size[0], orig_size[1]
    # axis 0 -> x columns (0, 2) scaled by width; axis 1 -> y columns (1, 3)
    for axis, extent in ((0, width), (1, height)):
        cols = [axis, axis + 2]
        bboxes[..., cols] = np.clip(
            bboxes[..., cols] * extent, a_min=0., a_max=extent
        )
    return bboxes
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        # single-process run: nothing to gather
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    # deserialize: trim each rank's padding before unpickling
    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list


def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        # nothing to reduce in single-process mode
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


def get_sha():
    """Return a one-line description of the git state of this checkout
    (commit sha, dirty/clean, branch); 'N/A' fields if git is unavailable."""
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
    sha = 'N/A'
    diff = "clean"
    branch = 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        # refresh the index so diff-index reports accurately
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        diff = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if diff else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        # best-effort: fall back to the N/A defaults outside a git repo
        pass
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
    return message


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        # `force=True` lets any rank print when explicitly requested
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    # True only when torch.distributed is both compiled in and initialized
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    # 1 in single-process (non-distributed) runs
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    # 0 in single-process (non-distributed) runs
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) diff --git a/utils/misc.py b/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..559da6196b5f87ae334e89934819d4e1dabb9fd4 --- /dev/null +++ b/utils/misc.py @@ -0,0 +1,191 @@ +import os + +import torch +import torch.nn as nn + +from dataset.ucf_jhmdb import UCF_JHMDB_Dataset +from dataset.ava import AVA_Dataset +from dataset.transforms import Augmentation, BaseTransform + +from evaluator.ucf_jhmdb_evaluator import UCF_JHMDB_Evaluator +from evaluator.ava_evaluator import AVA_Evaluator + + +def build_dataset(d_cfg, args, is_train=False): + """ + d_cfg: dataset config + """ + # transform + augmentation = Augmentation( + img_size=d_cfg['train_size'], + jitter=d_cfg['jitter'], + hue=d_cfg['hue'], + saturation=d_cfg['saturation'], + exposure=d_cfg['exposure'] + ) + basetransform = BaseTransform( + img_size=d_cfg['test_size'], + ) + + # dataset + if args.dataset in ['ucf24', 'jhmdb21']: + data_dir = os.path.join(args.root, 'ucf24') + + # dataset + dataset = UCF_JHMDB_Dataset( + 
data_root=data_dir, + dataset=args.dataset, + img_size=d_cfg['train_size'], + transform=augmentation, + is_train=is_train, + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'] + ) + num_classes = dataset.num_classes + + # evaluator + evaluator = UCF_JHMDB_Evaluator( + data_root=data_dir, + dataset=args.dataset, + model_name=args.version, + metric='fmap', + img_size=d_cfg['test_size'], + len_clip=args.len_clip, + batch_size=args.test_batch_size, + conf_thresh=0.01, + iou_thresh=0.5, + gt_folder=d_cfg['gt_folder'], + save_path='./evaluator/eval_results/', + transform=basetransform, + collate_fn=CollateFunc() + ) + + elif args.dataset == 'ava_v2.2': + #data_dir = os.path.join(args.root, 'AVA_Dataset') + data_dir = args.root + + # dataset + dataset = AVA_Dataset( + cfg=d_cfg, + data_root=data_dir, + is_train=True, + img_size=d_cfg['train_size'], + transform=augmentation, + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'] + ) + num_classes = 3 + + # evaluator + evaluator = AVA_Evaluator( + d_cfg=d_cfg, + data_root=data_dir, + img_size=d_cfg['test_size'], + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'], + batch_size=args.test_batch_size, + transform=basetransform, + collate_fn=CollateFunc(), + full_test_on_val=False, + version='v2.2' + ) + + else: + print('unknow dataset !! 
Only support ucf24 & jhmdb21 & ava_v2.2 !!') + exit(0) + + print('==============================') + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + + if not args.eval: + # no evaluator during training stage + evaluator = None + + return dataset, evaluator, num_classes + + +def build_dataloader(args, dataset, batch_size, collate_fn=None, is_train=False): + if is_train: + # distributed + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + sampler = torch.utils.data.RandomSampler(dataset) + + batch_sampler_train = torch.utils.data.BatchSampler(sampler, + batch_size, + drop_last=True) + # train dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler_train, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True + ) + else: + # test dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=False, + collate_fn=collate_fn, + num_workers=args.num_workers, + drop_last=False, + pin_memory=True + ) + + return dataloader + + +def load_weight(model, path_to_ckpt=None): + if path_to_ckpt is None: + print('No trained weight ..') + return model + + checkpoint = torch.load(path_to_ckpt, map_location='cpu') + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("model") + # model state dict + model_state_dict = model[0].state_dict() + # check + for k in list(checkpoint_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(checkpoint_state_dict[k].shape) + if shape_model != shape_checkpoint: + checkpoint_state_dict.pop(k) + else: + checkpoint_state_dict.pop(k) + print(k) + + model[0].load_state_dict(checkpoint_state_dict) + print('Finished loading model!') + + return model[0] + + +def is_parallel(model): + # Returns True if model is of type DP or DDP + return type(model) in (nn.parallel.DataParallel, 
nn.parallel.DistributedDataParallel) + + +class CollateFunc(object): + def __call__(self, batch): + batch_frame_id = [] + batch_key_target = [] + batch_video_clips = [] + + for sample in batch: + key_frame_id = sample[0] + video_clip = sample[1] + key_target = sample[2] + + batch_frame_id.append(key_frame_id) + batch_video_clips.append(video_clip) + batch_key_target.append(key_target) + + # List [B, 3, T, H, W] -> [B, 3, T, H, W] + batch_video_clips = torch.stack(batch_video_clips) + + return batch_frame_id, batch_video_clips, batch_key_target diff --git a/utils/nms.py b/utils/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..242f2d6750a737b1839c667d81218017dd5507a0 --- /dev/null +++ b/utils/nms.py @@ -0,0 +1,144 @@ +import numpy as np + + +def nms(bboxes, scores, nms_thresh): + """"Pure Python NMS.""" + x1 = bboxes[:, 0] #xmin + y1 = bboxes[:, 1] #ymin + x2 = bboxes[:, 2] #xmax + y2 = bboxes[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-10, xx2 - xx1) + h = np.maximum(1e-10, yy2 - yy1) + inter = w * h + + iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(iou <= nms_thresh)[0] + order = order[inds + 1] + + return keep + + +def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh): + # nms + keep = nms(bboxes, scores, nms_thresh) + + scores = scores[keep] + labels = labels[keep] + bboxes = bboxes[keep] + + return scores, labels, bboxes + + +def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes): + # nms + keep = np.zeros(len(bboxes), dtype=np.int32) + for i in range(num_classes): + inds = np.where(labels == i)[0] + if 
len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = nms(c_bboxes, c_scores, nms_thresh) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + scores = scores[keep] + labels = labels[keep] + bboxes = bboxes[keep] + + return scores, labels, bboxes + + +def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False): + if class_agnostic: + return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh) + else: + return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes) + + +##--------------------------------------------torch--------------------------------------- +# import torch + +# def nms(bboxes, scores, nms_thresh): +# """"Pure PyTorch NMS.""" +# x1 = bboxes[:, 0] #xmin +# y1 = bboxes[:, 1] #ymin +# x2 = bboxes[:, 2] #xmax +# y2 = bboxes[:, 3] #ymax + +# areas = (x2 - x1) * (y2 - y1) +# _, order = scores.sort(descending=True) + +# keep = [] +# while order.numel() > 0: +# i = order[0] +# keep.append(i.item()) +# # compute iou +# xx1 = torch.max(x1[i], x1[order[1:]]) +# yy1 = torch.max(y1[i], y1[order[1:]]) +# xx2 = torch.min(x2[i], x2[order[1:]]) +# yy2 = torch.min(y2[i], y2[order[1:]]) + +# w = torch.clamp(xx2 - xx1, min=1e-10) +# h = torch.clamp(yy2 - yy1, min=1e-10) +# inter = w * h + +# iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) +# #reserve all the boundingbox whose ovr less than thresh +# inds = torch.where(iou <= nms_thresh)[0] +# order = order[inds + 1] + +# return keep + + +# def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh): +# # nms +# keep = nms(bboxes, scores, nms_thresh) + +# scores = scores[keep] +# labels = labels[keep] +# bboxes = bboxes[keep] + +# return scores, labels, bboxes + + +# def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes): +# # nms +# keep = torch.zeros(len(bboxes), dtype=torch.int32) +# for i in range(num_classes): +# inds = torch.where(labels == i)[0] +# if len(inds) == 
0: +# continue +# c_bboxes = bboxes[inds] +# c_scores = scores[inds] +# c_keep = nms(c_bboxes, c_scores, nms_thresh) +# keep[inds[c_keep]] = 1 + +# keep = torch.where(keep > 0) +# scores = scores[keep] +# labels = labels[keep] +# bboxes = bboxes[keep] + +# return scores, labels, bboxes + + +# def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False): +# if class_agnostic: +# return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh) +# else: +# return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes) diff --git a/utils/solver/__init__.py b/utils/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/solver/__pycache__/__init__.cpython-310.pyc b/utils/solver/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ce213fab408458d1a41198f346575986d410cd9 Binary files /dev/null and b/utils/solver/__pycache__/__init__.cpython-310.pyc differ diff --git a/utils/solver/__pycache__/optimizer.cpython-310.pyc b/utils/solver/__pycache__/optimizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0a43a663ccad03098f2b8907757a0518bb14f2f Binary files /dev/null and b/utils/solver/__pycache__/optimizer.cpython-310.pyc differ diff --git a/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc b/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e0cef77cbed1a295c2654967ac719e56107fda4 Binary files /dev/null and b/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc differ diff --git a/utils/solver/optimizer.py b/utils/solver/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..22c26001188c6fee0b9157cb0b8afa7c6b729073 --- /dev/null +++ b/utils/solver/optimizer.py @@ -0,0 +1,40 @@ +import torch +from torch import optim + + +def 
build_optimizer(cfg, model, base_lr=0.0, resume=None): + print('==============================') + print('Optimizer: {}'.format(cfg['optimizer'])) + print('--momentum: {}'.format(cfg['momentum'])) + print('--weight_decay: {}'.format(cfg['weight_decay'])) + + if cfg['optimizer'] == 'sgd': + optimizer = optim.SGD( + model.parameters(), + lr=base_lr, + momentum=cfg['momentum'], + weight_decay=cfg['weight_decay']) + + elif cfg['optimizer'] == 'adam': + optimizer = optim.Adam( + model.parameters(), + lr=base_lr, + eight_decay=cfg['weight_decay']) + + elif cfg['optimizer'] == 'adamw': + optimizer = optim.AdamW( + model.parameters(), + lr=base_lr, + weight_decay=cfg['weight_decay']) + + start_epoch = 0 + if resume is not None: + print('keep training: ', resume) + checkpoint = torch.load(resume) + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("optimizer") + optimizer.load_state_dict(checkpoint_state_dict) + start_epoch = checkpoint.pop("epoch") + + + return optimizer, start_epoch diff --git a/utils/solver/warmup_schedule.py b/utils/solver/warmup_schedule.py new file mode 100644 index 0000000000000000000000000000000000000000..c88398a840fda64c85be014c1531b2599316257b --- /dev/null +++ b/utils/solver/warmup_schedule.py @@ -0,0 +1,58 @@ + +# Build warmup scheduler + + +def build_warmup(cfg, base_lr=0.01): + print('==============================') + print('WarmUpScheduler: {}'.format(cfg['warmup'])) + print('--base_lr: {}'.format(base_lr)) + print('--warmup_factor: {}'.format(cfg['warmup_factor'])) + print('--wp_iter: {}'.format(cfg['wp_iter'])) + + warmup_scheduler = WarmUpScheduler( + name=cfg['warmup'], + base_lr=base_lr, + wp_iter=cfg['wp_iter'], + warmup_factor=cfg['warmup_factor'] + ) + + return warmup_scheduler + + +# Basic Warmup Scheduler +class WarmUpScheduler(object): + def __init__(self, + name='linear', + base_lr=0.01, + wp_iter=500, + warmup_factor=0.00066667): + self.name = name + self.base_lr = base_lr + self.wp_iter = wp_iter + 
self.warmup_factor = warmup_factor + + + def set_lr(self, optimizer, lr, base_lr): + for param_group in optimizer.param_groups: + init_lr = param_group['initial_lr'] + ratio = init_lr / base_lr + param_group['lr'] = lr * ratio + + + def warmup(self, iter, optimizer): + # warmup + assert iter < self.wp_iter + if self.name == 'exp': + tmp_lr = self.base_lr * pow(iter / self.wp_iter, 4) + self.set_lr(optimizer, tmp_lr, self.base_lr) + + elif self.name == 'linear': + alpha = iter / self.wp_iter + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + tmp_lr = self.base_lr * warmup_factor + self.set_lr(optimizer, tmp_lr, self.base_lr) + + + def __call__(self, iter, optimizer): + self.warmup(iter, optimizer) + \ No newline at end of file diff --git a/utils/vis_tools.py b/utils/vis_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..f5c48b614125ae081dde0f6c1804825f60d5478e --- /dev/null +++ b/utils/vis_tools.py @@ -0,0 +1,87 @@ +import cv2 +import numpy as np + + +def vis_targets(video_clips, targets): + """ + video_clips: (Tensor) -> [B, C, T, H, W] + targets: List[Dict] -> [{'boxes': (Tensor) [N, 4], + 'labels': (Tensor) [N,]}, + ...], + """ + batch_size = len(video_clips) + + for batch_index in range(batch_size): + video_clip = video_clips[batch_index] + target = targets[batch_index] + + key_frame = video_clip[:, :, -1, :, :] + tgt_bboxes = target['boxes'] + tgt_labels = target['labels'] + + key_frame = convert_tensor_to_cv2img(key_frame) + width, height = key_frame.shape[:-1] + + for box, label in zip(tgt_bboxes, tgt_labels): + x1, y1, x2, y2 = box + label = int(label) + + x1 *= width + y1 *= height + x2 *= width + y2 *= height + + # draw bbox + cv2.rectangle(key_frame, + (int(x1), int(y1)), + (int(x2), int(y2)), + (255, 0, 0), 2) + cv2.imshow('groundtruth', key_frame) + cv2.waitKey(0) + + +def convert_tensor_to_cv2img(img_tensor): + """ convert torch.Tensor to cv2 image """ + # to numpy + img_tensor = img_tensor.permute(1, 2, 
0).cpu().numpy() + # to cv2 img Mat + cv2_img = img_tensor.astype(np.uint8) + # to BGR + cv2_img = cv2_img.copy()[..., (2, 1, 0)] + + return cv2_img + + +def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): + x1, y1, x2, y2 = bbox + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] + # plot bbox + cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) + + if label is not None: + # plot title bbox + cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) + # put the test on the title bbox + cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) + + return img + + +def vis_detection(frame, scores, labels, bboxes, vis_thresh, class_names, class_colors): + ts = 0.4 + for i, bbox in enumerate(bboxes): + if scores[i] > vis_thresh: + label = int(labels[i]) + cls_color = class_colors[label] + + if len(class_names) > 1: + mess = '%s: %.2f' % (class_names[label], scores[i]) + else: + cls_color = [255, 0, 0] + mess = None + # visualize bbox + frame = plot_bbox_labels(frame, bbox, mess, cls_color, text_scale=ts) + + return frame + \ No newline at end of file diff --git a/utils/weight_init.py b/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..5fe88bee46036f3de10a4b3427e3095751ec6ab9 --- /dev/null +++ b/utils/weight_init.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import math + +import torch.nn as nn + + +def constant_init(module, val, bias=0): + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def uniform_init(module, a=0, b=1, bias=0): + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.kaiming_uniform_(module.weight, + a=a, + mode=mode, + nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_(module.weight, + a=a, + mode=mode, + nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init(module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def c2_xavier_fill(module: nn.Module): + """ + Initialize `module.weight` using the "XavierFill" implemented in Caffe2. + Also initializes `module.bias` to 0. + + Args: + module (torch.nn.Module): module to initialize. 
+ """ + # Caffe2 implementation of XavierFill in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(module.weight, a=1) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + +def c2_msra_fill(module: nn.Module): + """ + Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. + Also initializes `module.bias` to 0. + + Args: + module (torch.nn.Module): module to initialize. + """ + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + +def init_weights(m: nn.Module, zero_init_final_gamma=False): + """Performs ResNet-style weight initialization.""" + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + zero_init_gamma = ( + hasattr(m, "final_bn") and m.final_bn and zero_init_final_gamma + ) + m.weight.data.fill_(0.0 if zero_init_gamma else 1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_()