diff --git a/models/__init__.py b/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..819bdb3de4f8c479c8a0aa621e3cdda68f9bed3b --- /dev/null +++ b/models/__init__.py @@ -0,0 +1,24 @@ +from .yowo.build import build_yowo + + +def build_model(args, + d_cfg, + m_cfg, + device, + num_classes=3, + trainable=False, + resume=None): + # build action detector + if 'yowo_v2_' in args.version: + model, criterion = build_yowo( + args=args, + d_cfg=d_cfg, + m_cfg=m_cfg, + device=device, + num_classes=num_classes, + trainable=trainable, + resume=resume + ) + + return model, criterion + diff --git a/models/__pycache__/__init__.cpython-310.pyc b/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b3acd38b4ec64b1db5fb5754d28623c990b467c Binary files /dev/null and b/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/__pycache__/__init__.cpython-37.pyc b/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b20147f220b6ca2df4d7ad4cb748230bbd8efc5 Binary files /dev/null and b/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/__init__.py b/models/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f3a5153e798ecd8df364dd3c127d639b0aa4119b --- /dev/null +++ b/models/backbone/__init__.py @@ -0,0 +1,13 @@ +from .backbone_2d.backbone_2d import Backbone2D +from .backbone_3d.backbone_3d import Backbone3D + + +def build_backbone_2d(cfg, pretrained=False): + backbone = Backbone2D(cfg, pretrained) + return backbone, backbone.feat_dims + + +def build_backbone_3d(cfg, pretrained=False): + backbone = Backbone3D(cfg, pretrained) + return backbone, backbone.feat_dim + diff --git a/models/backbone/__pycache__/__init__.cpython-310.pyc b/models/backbone/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c36c9c2b39ea12160b96ff51a4b381158902e1bb Binary files /dev/null and b/models/backbone/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/__pycache__/__init__.cpython-37.pyc b/models/backbone/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c69df6694d2cccf6de591a4cf38a11588604641f Binary files /dev/null and b/models/backbone/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/__init__.py b/models/backbone/backbone_2d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0fbfa4abfce6effb76b526810d570bf3f9938105 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..270f75632217613a7b732a068bc67ad716787e05 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5ae861007148c7c7e03e7f2b42f194cd01abe4d Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..63530b430b0798e51d18cdeab17719fb96091084 Binary files /dev/null and b/models/backbone/backbone_2d/__pycache__/backbone_2d.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/backbone_2d.py b/models/backbone/backbone_2d/backbone_2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a1bd29ab84d0ecf995408ada88f5d7d89f7e5ad0 --- /dev/null +++ b/models/backbone/backbone_2d/backbone_2d.py @@ -0,0 +1,26 @@ +import torch.nn as nn +from .cnn_2d import build_2d_cnn + + +class Backbone2D(nn.Module): + def __init__(self, cfg, pretrained=False): + super().__init__() + self.cfg = cfg + + self.backbone, self.feat_dims = build_2d_cnn(cfg, pretrained) + + + def forward(self, x): + """ + Input: + x: (Tensor) -> [B, C, H, W] + Output: + y: (List) -> [ + (Tensor) -> [B, C1, H1, W1], + (Tensor) -> [B, C2, H2, W2], + (Tensor) -> [B, C3, H3, W3] + ] + """ + feat = self.backbone(x) + + return feat diff --git a/models/backbone/backbone_2d/cnn_2d/__init__.py b/models/backbone/backbone_2d/cnn_2d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a154b5eae138160bf94571987cc4fa7f39e0337 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/__init__.py @@ -0,0 +1,18 @@ +# import 2D backbone +from .yolo_free.yolo_free import build_yolo_free + + +def build_2d_cnn(cfg, pretrained=False): + print('==============================') + print('2D Backbone: {}'.format(cfg['backbone_2d'].upper())) + print('--pretrained: {}'.format(pretrained)) + + if cfg['backbone_2d'] in ['yolo_free_nano', 'yolo_free_tiny', \ + 'yolo_free_large', 'yolo_free_huge']: + model, feat_dims = build_yolo_free(cfg['backbone_2d'], pretrained) + + else: + print('Unknown 2D Backbone ...') + exit() + + return model, feat_dims diff --git a/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7342feccce6c85a08c1e24197bfa3e5c777eeed4 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..607f066d315135751754439bf8a36382aa892ad8 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__init__.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef62940c9d0c0e25f0a1f6aa385af15922c69dbf Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e72faac2da945664156f78974b9d4167e6a2ee47 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..80119bd82a6b8e27c9760b2ce1bf8c4a6f107dc9 Binary files /dev/null and 
b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d71a11cdd2518366c7fdfba9a4b3beb4c0d0f35 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..087099479daff4b314f1c063c79c816ad019e03b Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..271c9338c44a1f877c78a404c00e56c39181b920 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_backbone.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f38fb3ec74167e45cf1425f421d2503c23020b7 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc new file 
mode 100644 index 0000000000000000000000000000000000000000..8d7ba91096fdb0f55c53098a8627034c28c18cce Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_basic.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9db270964648ff57b647256636684abc37ebda38 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27e231f8ee486ae0460f887fe9e8b55eafd345f6 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_fpn.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52e652ad3091bba56d0a3da0b655d202846be6df Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..172e558d58bca7493c5ec5a7e2c971b3eb048be8 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_head.cpython-37.pyc differ diff --git 
a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f095d721ac01b5cb771de3fb84c49b02f7ac580e Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-310.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65408633fbceee510a698967fa95f2a06b4ff3d7 Binary files /dev/null and b/models/backbone/backbone_2d/cnn_2d/yolo_free/__pycache__/yolo_free_neck.cpython-37.pyc differ diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py new file mode 100644 index 0000000000000000000000000000000000000000..75128153e569a50b1d2b9ebc82f33fd5294b12d9 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free.py @@ -0,0 +1,222 @@ +import torch +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from torch.hub import load_state_dict_from_url + +try: + from .yolo_free_backbone import build_backbone + from .yolo_free_neck import build_neck + from .yolo_free_fpn import build_fpn + from .yolo_free_head import build_head +except: + from yolo_free_backbone import build_backbone + from yolo_free_neck import build_neck + from yolo_free_fpn import build_fpn + from yolo_free_head import build_head + + +__all__ = ['build_yolo_free'] + + +model_urls = { + 'yolo_free_nano': 'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_nano_coco.pth', + 'yolo_free_tiny': 'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_tiny_coco.pth', + 'yolo_free_large': 
'https://github.com/yjh0410/FreeYOLO/releases/download/weight/yolo_free_large_coco.pth', +} + + +yolo_free_config = { + 'yolo_free_nano': { + # model + 'backbone': 'shufflenetv2_1.0x', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + 'anchor_size': None, + # neck + 'neck': 'sppf', + 'neck_dim': 232, + 'expand_ratio': 0.5, + 'pooling_size': 5, + 'neck_act': 'lrelu', + 'neck_norm': 'BN', + 'neck_depthwise': True, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'nano', + 'fpn_dim': [116, 232, 232], + 'fpn_norm': 'BN', + 'fpn_act': 'lrelu', + 'fpn_depthwise': True, + # head + 'head': 'decoupled_head', + 'head_dim': 64, + 'head_norm': 'BN', + 'head_act': 'lrelu', + 'num_cls_head': 2, + 'num_reg_head': 2, + 'head_depthwise': True, + }, + + 'yolo_free_tiny': { + # model + 'backbone': 'elannet_tiny', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + # neck + 'neck': 'spp_block_csp', + 'neck_dim': 256, + 'expand_ratio': 0.5, + 'pooling_size': [5, 9, 13], + 'neck_act': 'lrelu', + 'neck_norm': 'BN', + 'neck_depthwise': False, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'tiny', # 'tiny', 'large', 'huge + 'fpn_dim': [128, 256, 256], + 'fpn_norm': 'BN', + 'fpn_act': 'lrelu', + 'fpn_depthwise': False, + # head + 'head': 'decoupled_head', + 'head_dim': 64, + 'head_norm': 'BN', + 'head_act': 'lrelu', + 'num_cls_head': 2, + 'num_reg_head': 2, + 'head_depthwise': False, + }, + + 'yolo_free_large': { + # model + 'backbone': 'elannet_large', + 'pretrained': True, + 'stride': [8, 16, 32], # P3, P4, P5 + # neck + 'neck': 'spp_block_csp', + 'neck_dim': 512, + 'expand_ratio': 0.5, + 'pooling_size': [5, 9, 13], + 'neck_act': 'silu', + 'neck_norm': 'BN', + 'neck_depthwise': False, + # fpn + 'fpn': 'pafpn_elan', + 'fpn_size': 'large', # 'tiny', 'large', 'huge + 'fpn_dim': [512, 1024, 512], + 'fpn_norm': 'BN', + 'fpn_act': 'silu', + 'fpn_depthwise': False, + # head + 'head': 'decoupled_head', + 'head_dim': 256, + 'head_norm': 'BN', + 'head_act': 'silu', + 'num_cls_head': 
2, + 'num_reg_head': 2, + 'head_depthwise': False, + }, + +} + + +# Anchor-free YOLO +class FreeYOLO(nn.Module): + def __init__(self, cfg): + super(FreeYOLO, self).__init__() + # --------- Basic Config ----------- + self.cfg = cfg + + # --------- Network Parameters ---------- + ## backbone + self.backbone, bk_dim = build_backbone(self.cfg['backbone']) + + ## neck + self.neck = build_neck(cfg=self.cfg, in_dim=bk_dim[-1], out_dim=self.cfg['neck_dim']) + + ## fpn + self.fpn = build_fpn(cfg=self.cfg, in_dims=self.cfg['fpn_dim'], out_dim=self.cfg['head_dim']) + + ## non-shared heads + self.non_shared_heads = nn.ModuleList( + [build_head(cfg) + for _ in range(len(cfg['stride'])) + ]) + + def forward(self, x): + # backbone + feats = self.backbone(x) + + # neck + feats['layer4'] = self.neck(feats['layer4']) + + # fpn + pyramid_feats = [feats['layer2'], feats['layer3'], feats['layer4']] + pyramid_feats = self.fpn(pyramid_feats) + + # non-shared heads + all_cls_feats = [] + all_reg_feats = [] + for feat, head in zip(pyramid_feats, self.non_shared_heads): + # [B, C, H, W] + cls_feat, reg_feat = head(feat) + + all_cls_feats.append(cls_feat) + all_reg_feats.append(reg_feat) + + return all_cls_feats, all_reg_feats + + +# build FreeYOLO +def build_yolo_free(model_name='yolo_free_large', pretrained=False): + # model config + cfg = yolo_free_config[model_name] + + # FreeYOLO + model = FreeYOLO(cfg) + feat_dims = [model.cfg['head_dim']] * 3 + + # Load COCO pretrained weight + if pretrained: + url = model_urls[model_name] + + # check + if url is None: + print('No 2D pretrained weight ...') + return model, feat_dims + else: + print('Loading 2D backbone pretrained weight: {}'.format(model_name.upper())) + + # state dict + checkpoint = load_state_dict_from_url(url, map_location='cpu') + checkpoint_state_dict = checkpoint.pop('model') + + # model state dict + model_state_dict = model.state_dict() + # check + for k in list(checkpoint_state_dict.keys()): + if k in model_state_dict: + 
shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(checkpoint_state_dict[k].shape) + if shape_model != shape_checkpoint: + # print(k) + checkpoint_state_dict.pop(k) + else: + checkpoint_state_dict.pop(k) + # print(k) + + model.load_state_dict(checkpoint_state_dict, strict=False) + + return model, feat_dims + + +if __name__ == '__main__': + model, fpn_dim = build_yolo_free(model_name='yolo_free_nano', pretrained=True) + model.eval() + + x = torch.randn(2, 3, 64, 64) + cls_feats, reg_feats = model(x) + + for cls_feat, reg_feat in zip(cls_feats, reg_feats): + print(cls_feat.shape, reg_feat.shape) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3b79c770c297dcc4678c945c20a2f13483c795 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_backbone.py @@ -0,0 +1,445 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +__all__ = ['build_backbone'] + +# ====================== ELAN-Net ========================== +# ELANNet +def get_activation(act_type=None): + if act_type is None: + return nn.Identity() + elif act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +def get_norm(in_dim, norm_type=None): + if norm_type is None: + return nn.Identity() + elif norm_type == 'BN': + return nn.BatchNorm2d(in_dim) + elif norm_type == 'GN': + return nn.GroupNorm(32, in_dim) + elif norm_type == 'IN': + return nn.InstanceNorm2d(in_dim) + + +class Conv(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + act_type='silu', + norm_type='BN', # activation + depthwise=False): + 
super(Conv, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + # depthwise conv + convs.append(nn.Conv2d(c1, c1, kernel_size=k, stride=s, padding=p, dilation=d, groups=c1, bias=add_bias)) + convs.append(get_norm(c1, norm_type)) + convs.append(get_activation(act_type)) + + # pointwise conv + convs.append(nn.Conv2d(c1, c2, kernel_size=1, stride=s, padding=0, dilation=d, groups=1, bias=add_bias)) + convs.append(get_norm(c2, norm_type)) + convs.append(get_activation(act_type)) + + else: + convs.append(nn.Conv2d(c1, c2, kernel_size=k, stride=s, padding=p, dilation=d, groups=1, bias=add_bias)) + convs.append(get_norm(c2, norm_type)) + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + + +class ELANBlock(nn.Module): + """ + ELAN BLock of YOLOv7's backbone + """ + def __init__(self, in_dim, out_dim, expand_ratio=0.5, model_size='large', act_type='silu', depthwise=False): + super(ELANBlock, self).__init__() + inter_dim = int(in_dim * expand_ratio) + if model_size == 'tiny': + depth = 1 + elif model_size == 'large': + depth = 2 + elif model_size == 'huge': + depth = 3 + self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type) + self.cv2 = Conv(in_dim, inter_dim, k=1, act_type=act_type) + self.cv3 = nn.Sequential(*[ + Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, depthwise=depthwise) + for _ in range(depth) + ]) + self.cv4 = nn.Sequential(*[ + Conv(inter_dim, inter_dim, k=3, p=1, act_type=act_type, depthwise=depthwise) + for _ in range(depth) + ]) + + self.out = Conv(inter_dim*4, out_dim, k=1) + + + + def forward(self, x): + """ + Input: + x: [B, C, H, W] + Output: + out: [B, 2C, H, W] + """ + x1 = self.cv1(x) + x2 = self.cv2(x) + x3 = self.cv3(x2) + x4 = self.cv4(x3) + + # [B, C, H, W] -> [B, 2C, H, W] + out = self.out(torch.cat([x1, x2, x3, x4], dim=1)) + + return out + + +class DownSample(nn.Module): + def __init__(self, in_dim, 
act_type='silu', norm_type='BN'): + super().__init__() + inter_dim = in_dim // 2 + self.mp = nn.MaxPool2d((2, 2), 2) + self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type) + self.cv2 = nn.Sequential( + Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type) + ) + + def forward(self, x): + """ + Input: + x: [B, C, H, W] + Output: + out: [B, C, H//2, W//2] + """ + # [B, C, H, W] -> [B, C//2, H//2, W//2] + x1 = self.cv1(self.mp(x)) + x2 = self.cv2(x) + + # [B, C, H//2, W//2] + out = torch.cat([x1, x2], dim=1) + + return out + + +# ELANNet-Tiny +class ELANNet_Tiny(nn.Module): + """ + ELAN-Net of YOLOv7-Tiny. + """ + def __init__(self, depthwise=False): + super(ELANNet_Tiny, self).__init__() + + # tiny backbone + self.layer_1 = Conv(3, 32, k=3, p=1, s=2, act_type='lrelu', depthwise=depthwise) # P1/2 + + self.layer_2 = nn.Sequential( + Conv(32, 64, k=3, p=1, s=2, act_type='lrelu', depthwise=depthwise), + ELANBlock(in_dim=64, out_dim=64, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P2/4 + ) + self.layer_3 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=64, out_dim=128, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P3/8 + ) + self.layer_4 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=128, out_dim=256, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P4/16 + ) + self.layer_5 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + ELANBlock(in_dim=256, out_dim=512, expand_ratio=0.5, + model_size='tiny', act_type='lrelu', depthwise=depthwise) # P5/32 + ) + + + def forward(self, x): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + outputs = { + 'layer2': c3, + 'layer3': c4, + 'layer4': c5 + } + return outputs + + +# ELANNet-Large +class ELANNet_Large(nn.Module): 
+ """ + ELAN-Net of YOLOv7. + """ + def __init__(self, depthwise=False): + super(ELANNet_Large, self).__init__() + + # large backbone + self.layer_1 = nn.Sequential( + Conv(3, 32, k=3, p=1, act_type='silu', depthwise=depthwise), + Conv(32, 64, k=3, p=1, s=2, act_type='silu', depthwise=depthwise), + Conv(64, 64, k=3, p=1, act_type='silu', depthwise=depthwise) # P1/2 + ) + self.layer_2 = nn.Sequential( + Conv(64, 128, k=3, p=1, s=2, act_type='silu', depthwise=depthwise), + ELANBlock(in_dim=128, out_dim=256, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P2/4 + ) + self.layer_3 = nn.Sequential( + DownSample(in_dim=256, act_type='silu'), + ELANBlock(in_dim=256, out_dim=512, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P3/8 + ) + self.layer_4 = nn.Sequential( + DownSample(in_dim=512, act_type='silu'), + ELANBlock(in_dim=512, out_dim=1024, expand_ratio=0.5, + model_size='large',act_type='silu', depthwise=depthwise) # P4/16 + ) + self.layer_5 = nn.Sequential( + DownSample(in_dim=1024, act_type='silu'), + ELANBlock(in_dim=1024, out_dim=1024, expand_ratio=0.25, + model_size='large',act_type='silu', depthwise=depthwise) # P5/32 + ) + + + def forward(self, x): + c1 = self.layer_1(x) + c2 = self.layer_2(c1) + c3 = self.layer_3(c2) + c4 = self.layer_4(c3) + c5 = self.layer_5(c4) + + outputs = { + 'layer2': c3, + 'layer3': c4, + 'layer4': c5 + } + return outputs + + +## build ELAN-Net +def build_elannet(model_name='elannet_large'): + # model + if model_name == 'elannet_large': + backbone = ELANNet_Large() + feat_dims = [512, 1024, 1024] + elif model_name == 'elannet_tiny': + backbone = ELANNet_Tiny() + feat_dims = [128, 256, 512] + + return backbone, feat_dims + + +# ====================== ShuffleNet-v2 ========================== +# ShuffleNet-v2 +def channel_shuffle(x, groups): + # type: (torch.Tensor, int) -> torch.Tensor + batchsize, num_channels, height, width = x.data.size() + channels_per_group = 
num_channels // groups + + # reshape + x = x.view(batchsize, groups, + channels_per_group, height, width) + + x = torch.transpose(x, 1, 2).contiguous() + + # flatten + x = x.view(batchsize, -1, height, width) + + return x + + +class ShuffleV2Block(nn.Module): + def __init__(self, inp, oup, stride): + super(ShuffleV2Block, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + + +class ShuffleNetV2(nn.Module): + def __init__(self, + model_size='1.0x', + out_stages=(2, 3, 4), + with_last_conv=False, + kernal_size=3): + super(ShuffleNetV2, self).__init__() + 
print('model size is ', model_size) + + self.stage_repeats = [4, 8, 4] + self.model_size = model_size + self.out_stages = out_stages + self.with_last_conv = with_last_conv + self.kernal_size = kernal_size + if model_size == '0.5x': + self._stage_out_channels = [24, 48, 96, 192] + elif model_size == '1.0x': + self._stage_out_channels = [24, 116, 232, 464] + elif model_size == '1.5x': + self._stage_out_channels = [24, 176, 352, 704] + elif model_size == '2.0x': + self._stage_out_channels = [24, 244, 488, 976] + else: + raise NotImplementedError + + # building first layer + input_channels = 3 + output_channels = self._stage_out_channels[0] + self.conv1 = nn.Sequential( + nn.Conv2d(input_channels, output_channels, 3, 2, 1, bias=False), + nn.BatchNorm2d(output_channels), + nn.ReLU(inplace=True), + ) + input_channels = output_channels + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + stage_names = ['stage{}'.format(i) for i in [2, 3, 4]] + for name, repeats, output_channels in zip( + stage_names, self.stage_repeats, self._stage_out_channels[1:]): + seq = [ShuffleV2Block(input_channels, output_channels, 2)] + for i in range(repeats - 1): + seq.append(ShuffleV2Block(output_channels, output_channels, 1)) + setattr(self, name, nn.Sequential(*seq)) + input_channels = output_channels + + self._initialize_weights() + + + def _initialize_weights(self): + print('init weights...') + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + if 'first' in name: + nn.init.normal_(m.weight, 0, 0.01) + else: + nn.init.normal_(m.weight, 0, 1.0 / m.weight.shape[1]) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 0) + elif isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0.0001) + nn.init.constant_(m.running_mean, 
0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + output = {} + for i in range(2, 5): + stage = getattr(self, 'stage{}'.format(i)) + x = stage(x) + if i in self.out_stages: + output['layer{}'.format(i)] = x + + return output + + +## build ShuffleNet-v2 +def build_shufflenetv2(model_size='1.0x'): + """Constructs a shufflenetv2 model. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + backbone = ShuffleNetV2(model_size=model_size) + feat_dims = backbone._stage_out_channels[1:] + + return backbone, feat_dims + + +# build backbone +def build_backbone(model_name='elannet_large'): + if model_name in ['elannet_nano', 'elannet_tiny', 'elannet_large', 'elannet_huge']: + return build_elannet(model_name) + + elif model_name in ['shufflenetv2_0.5x', 'shufflenetv2_1.0x']: + return build_shufflenetv2(model_size=model_name[-4:]) + + +if __name__ == '__main__': + import time + model, feats = build_backbone(model_name='shufflenetv2_1.0x') + x = torch.randn(1, 3, 224, 224) + t0 = time.time() + outputs = model(x) + t1 = time.time() + print('Time: ', t1 - t0) + for k in outputs.keys(): + print(outputs[k].shape) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py new file mode 100644 index 0000000000000000000000000000000000000000..6cd708b27cb5e628d93096bba31abc4d5a82a325 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_basic.py @@ -0,0 +1,164 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SiLU(nn.Module): + """export-friendly version of nn.SiLU()""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +def get_conv2d(c1, c2, k, p, s, d, g, padding_mode='ZERO', bias=False): + if padding_mode == 'ZERO': + conv = 
nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + elif padding_mode == 'SAME': + conv = Conv2dSamePadding(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + + return conv + + +def get_activation(act_type=None): + if act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +def get_norm(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm2d(dim) + elif norm_type == 'GN': + return nn.GroupNorm(num_groups=32, num_channels=dim) + + +# Conv2d with "SAME" padding +class Conv2dSamePadding(nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support "SAME" padding mode and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + + It assumes that norm layer is used before activation. 
+ """ + + # parse padding mode + self.padding_method = kwargs.pop("padding", None) + if self.padding_method is None: + if len(args) >= 5: + self.padding_method = args[4] + else: + self.padding_method = 0 # default padding number + + if isinstance(self.padding_method, str): + if self.padding_method.upper() == "SAME": + # If the padding mode is `SAME`, it will be manually padded + super().__init__(*args, **kwargs, padding=0) + # stride + if isinstance(self.stride, int): + self.stride = [self.stride] * 2 + elif len(self.stride) == 1: + self.stride = [self.stride[0]] * 2 + # kernel size + if isinstance(self.kernel_size, int): + self.kernel_size = [self.kernel_size] * 2 + elif len(self.kernel_size) == 1: + self.kernel_size = [self.kernel_size[0]] * 2 + # dilation + if isinstance(self.dilation, int): + self.dilation = [self.dilation] * 2 + elif len(self.dilation) == 1: + self.dilation = [self.dilation[0]] * 2 + else: + raise ValueError("Unknown padding method: {}".format(self.padding_method)) + else: + super().__init__(*args, **kwargs, padding=self.padding_method) + + def forward(self, x): + if isinstance(self.padding_method, str): + if self.padding_method.upper() == "SAME": + input_h, input_w = x.shape[-2:] + stride_h, stride_w = self.stride + kernel_size_h, kernel_size_w = self.kernel_size + dilation_h, dilation_w = self.dilation + + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + + padding_needed_h = max( + 0, (output_h - 1) * stride_h + (kernel_size_h - 1) * dilation_h + 1 - input_h + ) + padding_needed_w = max( + 0, (output_w - 1) * stride_w + (kernel_size_w - 1) * dilation_w + 1 - input_w + ) + + left = padding_needed_w // 2 + right = padding_needed_w - left + top = padding_needed_h // 2 + bottom = padding_needed_h - top + + x = F.pad(x, [left, right, top, bottom]) + else: + raise ValueError("Unknown padding method: {}".format(self.padding_method)) + + x = super().forward(x) + + return x + + +# Basic conv layer +class 
Conv(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + act_type='', # activation + norm_type='', # normalization + padding_mode='ZERO', # padding mode: "ZERO" or "SAME" + depthwise=False): + super(Conv, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + convs.append(get_conv2d(c1, c1, k=k, p=p, s=s, d=d, g=c1, padding_mode=padding_mode, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm(norm_type, c1)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=1, padding_mode=padding_mode, bias=add_bias)) + if norm_type: + convs.append(get_norm(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) diff --git a/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd8a8b254b830195f8c606ce084c0fbfc85c578 --- /dev/null +++ b/models/backbone/backbone_2d/cnn_2d/yolo_free/yolo_free_fpn.py @@ -0,0 +1,252 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +try: + from yolo_free_basic import Conv +except: + from .yolo_free_basic import Conv + + +class ELANBlock(nn.Module): + """ + ELAN BLock of YOLOv7's head + """ + def __init__(self, in_dim, out_dim, fpn_size='large', depthwise=False, act_type='silu', norm_type='BN'): + super(ELANBlock, self).__init__() + if fpn_size == 'tiny' or fpn_size =='nano': + e1, e2 = 0.25, 1.0 + width = 2 + depth = 1 + elif fpn_size == 'large': + e1, e2 = 
class DownSample(nn.Module):
    """2x spatial downsampling with two parallel branches (max-pool + conv,
    and 1x1 conv + stride-2 conv) concatenated along channels, so the
    channel count doubles while H and W halve."""

    def __init__(self, in_dim, depthwise=False, act_type='silu', norm_type='BN'):
        super().__init__()
        inter_dim = in_dim
        # branch 1: 2x2 max-pool then 1x1 conv (channels preserved)
        self.mp = nn.MaxPool2d((2, 2), 2)
        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
        # branch 2: 1x1 conv then stride-2 3x3 conv (channels preserved)
        self.cv2 = nn.Sequential(
            Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type),
            Conv(inter_dim, inter_dim, k=3, p=1, s=2, act_type=act_type, norm_type=norm_type, depthwise=depthwise)
        )

    def forward(self, x):
        """
        Input:
            x: [B, C, H, W]
        Output:
            out: [B, 2C, H//2, W//2]
        """
        # branch 1: [B, C, H, W] -> [B, C, H//2, W//2]
        # (original comment claimed C//2, but cv1 keeps C channels)
        x1 = self.cv1(self.mp(x))
        # branch 2: [B, C, H, W] -> [B, C, H//2, W//2]
        x2 = self.cv2(x)

        # concat along channels: [B, 2C, H//2, W//2]
        out = torch.cat([x1, x2], dim=1)

        return out
+ depthwise=depthwise, + norm_type=norm_type, + act_type=act_type) + + # P4 -> P5 + if fpn_size == 'large' or fpn_size == 'huge': + self.mp2 = DownSample(int(256 * width), act_type=act_type, + norm_type=norm_type, depthwise=depthwise) + elif fpn_size == 'tiny': + self.mp2 = Conv(int(256 * width), int(512 * width), k=3, p=1, s=2, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + elif fpn_size == 'nano': + self.mp2 = nn.Sequential( + nn.MaxPool2d((2, 2), 2), + Conv(int(256 * width), int(512 * width), k=1, act_type=act_type, norm_type=norm_type) + ) + self.head_elan_4 = ELANBlock(in_dim=int(512 * width) + c5, + out_dim=int(512 * width), # 512 + fpn_size=fpn_size, + depthwise=depthwise, + norm_type=norm_type, + act_type=act_type) + + self.head_conv_1 = Conv(int(128 * width), int(256 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + self.head_conv_2 = Conv(int(256 * width), int(512 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + self.head_conv_3 = Conv(int(512 * width), int(1024 * width), k=3, p=1, + act_type=act_type, norm_type=norm_type, depthwise=depthwise) + # output proj layers + if self.out_dim is not None: + self.out_layers = nn.ModuleList([ + Conv(in_dim, self.out_dim, k=1, + norm_type=norm_type, act_type=act_type) + for in_dim in [int(256 * width), int(512 * width), int(1024 * width)] + ]) + + + def forward(self, features): + c3, c4, c5 = features + + # Top down + ## P5 -> P4 + c6 = self.cv1(c5) + c7 = F.interpolate(c6, scale_factor=2.0) + c8 = torch.cat([c7, self.cv2(c4)], dim=1) + c9 = self.head_elan_1(c8) + ## P4 -> P3 + c10 = self.cv3(c9) + c11 = F.interpolate(c10, scale_factor=2.0) + c12 = torch.cat([c11, self.cv4(c3)], dim=1) + c13 = self.head_elan_2(c12) + + # Bottom up + # p3 -> P4 + c14 = self.mp1(c13) + c15 = torch.cat([c14, c9], dim=1) + c16 = self.head_elan_3(c15) + # P4 -> P5 + c17 = self.mp2(c16) + c18 = torch.cat([c17, c5], dim=1) + c19 = self.head_elan_4(c18) 
def build_fpn(cfg, in_dims, out_dim):
    """Build the feature pyramid network described by `cfg`.

    Args:
        cfg (dict): must contain 'fpn' (name) plus the PaFPN-ELAN options
            ('fpn_size', 'fpn_depthwise', 'fpn_norm', 'fpn_act').
        in_dims (list[int]): channel counts of the C3/C4/C5 inputs.
        out_dim (int or None): projection dim for all pyramid levels.

    Returns:
        nn.Module: the FPN network.

    Raises:
        ValueError: unknown cfg['fpn']. (The original fell through and hit
            UnboundLocalError on `fpn_net` instead.)
    """
    model = cfg['fpn']
    print('==============================')
    print('FPN: {}'.format(model))
    # build neck
    if model == 'pafpn_elan':
        fpn_net = PaFPNELAN(in_dims=in_dims,
                            out_dim=out_dim,
                            fpn_size=cfg['fpn_size'],
                            depthwise=cfg['fpn_depthwise'],
                            norm_type=cfg['fpn_norm'],
                            act_type=cfg['fpn_act'])
    else:
        raise ValueError('Unknown FPN: {}'.format(model))

    return fpn_net
# SPP block (plain, non-CSP variant)
class SPPBlock(nn.Module):
    """
    Spatial Pyramid Pooling Block.

    Runs a 1x1-conv branch (cv1) and an SPP branch (cv2) in parallel on the
    input, then fuses the concatenation with a 1x1 conv (cv3).
    `depthwise` is accepted for signature parity with SPPBlockCSP but no
    3x3 conv exists here to apply it to.
    """
    def __init__(self,
                 in_dim,
                 out_dim,
                 expand_ratio=0.5,
                 pooling_size=[5, 9, 13],
                 act_type='lrelu',
                 norm_type='BN',
                 depthwise=False
                 ):
        # FIX: the original called super(SPPBlockCSP, self).__init__(), which
        # raises TypeError here because self is an SPPBlock, not an SPPBlockCSP.
        super(SPPBlock, self).__init__()
        inter_dim = int(in_dim * expand_ratio)
        self.cv1 = Conv(in_dim, inter_dim, k=1, act_type=act_type, norm_type=norm_type)
        # FIX: cv2 consumes the raw input (in_dim channels); the original built
        # SPP(inter_dim, ...) which mismatched whenever expand_ratio != 1.0.
        self.cv2 = nn.Sequential(
            SPP(in_dim,
                inter_dim,
                expand_ratio=1.0,
                pooling_size=pooling_size,
                act_type=act_type,
                norm_type=norm_type),
        )
        self.cv3 = Conv(inter_dim * 2, out_dim, k=1, act_type=act_type, norm_type=norm_type)


    def forward(self, x):
        # x: [B, in_dim, H, W] -> y: [B, out_dim, H, W]
        x1 = self.cv1(x)
        x2 = self.cv2(x)
        y = self.cv3(torch.cat([x1, x2], dim=1))

        return y
neck = SPPBlock( + in_dim, out_dim, + expand_ratio=cfg['expand_ratio'], + pooling_size=cfg['pooling_size'], + act_type=cfg['neck_act'], + norm_type=cfg['neck_norm'], + depthwise=cfg['neck_depthwise'] + ) + + elif model == 'spp_block_csp': + neck = SPPBlockCSP( + in_dim, out_dim, + expand_ratio=cfg['expand_ratio'], + pooling_size=cfg['pooling_size'], + act_type=cfg['neck_act'], + norm_type=cfg['neck_norm'], + depthwise=cfg['neck_depthwise'] + ) + + elif model == 'sppf': + neck = SPPF(in_dim, out_dim, k=cfg['pooling_size']) + + + return neck + + +if __name__ == '__main__': + pass diff --git a/models/backbone/backbone_3d/__init__.py b/models/backbone/backbone_3d/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc b/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aa230eb1b1c3c542f9617d600544184bf3e9a02 Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc b/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d348ed59c03ca2f80c56eff69fc611b851b62b2c Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc b/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf7c1ab3b275b06a8e6685679b90e3fcd62a893a Binary files /dev/null and b/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/__pycache__/backbone_3d.cpython-37.pyc 
class Conv(nn.Module):
    """Conv2d -> BatchNorm -> ReLU, optionally as a depthwise-separable pair.

    Args:
        in_dim (int): input channels.
        out_dim (int): output channels.
        k, p, s (int): kernel size, padding, stride of the spatial conv.
        depthwise (bool): if True, use depthwise kxk conv followed by a
            pointwise 1x1 conv instead of a single dense conv.
    """
    def __init__(self, in_dim, out_dim, k=3, p=1, s=1, depthwise=False):
        super().__init__()
        if depthwise:
            self.convs = nn.Sequential(
                # depthwise kxk conv keeps in_dim channels
                nn.Conv2d(in_dim, in_dim, kernel_size=k, padding=p, stride=s, groups=in_dim, bias=False),
                # FIX: was BatchNorm2d(out_dim) — the tensor still has in_dim
                # channels at this point, which crashed at runtime.
                nn.BatchNorm2d(in_dim),
                nn.ReLU(inplace=True),
                # FIX: pointwise conv must mix all channels; groups=in_dim made
                # it grouped (and failed whenever out_dim % in_dim != 0).
                nn.Conv2d(in_dim, out_dim, kernel_size=1, bias=False),
                nn.BatchNorm2d(out_dim),
                nn.ReLU(inplace=True),
            )
        else:
            self.convs = nn.Sequential(
                nn.Conv2d(in_dim, out_dim, kernel_size=k, padding=p, stride=s, bias=False),
                nn.BatchNorm2d(out_dim),
                nn.ReLU(inplace=True)
            )

    def forward(self, x):
        # x: [B, in_dim, H, W] -> [B, out_dim, H', W']
        return self.convs(x)
def build_3d_cnn(cfg, pretrained=False):
    """Build the 3D CNN backbone named by cfg['backbone_3d'].

    Args:
        cfg (dict): needs 'backbone_3d' (e.g. 'resnet18', 'resnext101',
            'shufflenetv2'); the shufflenet branch also reads 'model_size'.
        pretrained (bool): load pretrained weights if available.

    Returns:
        (nn.Module, int): the backbone and its output feature dim.

    Raises:
        ValueError: unknown backbone name. (The original printed a message
            and called exit(), killing the host process instead of raising
            a catchable error.)
    """
    print('==============================')
    print('3D Backbone: {}'.format(cfg['backbone_3d'].upper()))
    print('--pretrained: {}'.format(pretrained))

    # NOTE: 'resnet' is not a substring of 'resnext', so the order of the
    # first two branches is safe.
    if 'resnet' in cfg['backbone_3d']:
        model, feat_dims = build_resnet_3d(
            model_name=cfg['backbone_3d'],
            pretrained=pretrained
        )
    elif 'resnext' in cfg['backbone_3d']:
        model, feat_dims = build_resnext_3d(
            model_name=cfg['backbone_3d'],
            pretrained=pretrained
        )
    elif 'shufflenetv2' in cfg['backbone_3d']:
        model, feat_dims = build_shufflenetv2_3d(
            model_size=cfg['model_size'],
            pretrained=pretrained
        )
    else:
        raise ValueError('Unknown 3D backbone: {}'.format(cfg['backbone_3d']))

    return model, feat_dims
b/models/backbone/backbone_3d/cnn_3d/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbf640180bf32c6ded3f274e56f0a13abe9633fb Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e67e1df865964dc508ce518c217b151fa1b5455 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnet.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bf6e68d2684421b980f1e5dcf558f2332f82768 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-310.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41d9ac061d5e655de8843cbbb485514d31d590a5 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/resnext.cpython-37.pyc differ diff --git a/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc b/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62609b79498b2b3f6c481c09b6feba58e473fe07 Binary files /dev/null and b/models/backbone/backbone_3d/cnn_3d/__pycache__/shufflnetv2.cpython-310.pyc differ diff --git 
def downsample_basic_block(x, planes, stride):
    """Type-A residual shortcut: stride x spatially and zero-pad channels.

    avg_pool3d with kernel_size=1 simply strides the tensor; the channel
    dimension is then padded with zeros up to `planes`.

    Args:
        x (Tensor): [B, C, T, H, W] input with C <= planes.
        planes (int): target channel count.
        stride (int): spatial/temporal stride.

    Returns:
        Tensor: [B, planes, T', H', W'] downsampled, zero-padded tensor.

    Note: the original operated on `.data` and re-wrapped the result in the
    deprecated `Variable`, which silently detached type-A shortcuts from the
    autograd graph; it also had a dead cuda() branch. This version stays in
    the graph and allocates the padding on the input's device/dtype directly.
    """
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2),
                            out.size(3), out.size(4),
                            dtype=out.dtype, device=out.device)
    return torch.cat([out, zero_pads], dim=1)
def load_weight(model, arch):
    """Load Kinetics-pretrained weights for `arch` into `model`, in place.

    Weights are fetched from `model_urls[arch]`. Checkpoint keys are
    re-mapped, and any key that is absent from the model or has a
    mismatched shape is dropped, so partial loads proceed silently.

    Args:
        model (nn.Module): the freshly constructed 3D ResNet.
        arch (str): key into `model_urls` ('resnet18', 'resnet50', ...).

    Returns:
        nn.Module: the same `model`, with loaded weights.
    """
    print('Loading pretrained weight ...')
    url = model_urls[arch]
    # check
    if url is None:
        print('No pretrained weight for 3D CNN: {}'.format(arch.upper()))
        return model

    print('Loading 3D backbone pretrained weight: {}'.format(arch.upper()))
    # checkpoint state dict
    checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True)
    checkpoint_state_dict = checkpoint.pop('state_dict')

    # model state dict
    model_state_dict = model.state_dict()
    # reformat checkpoint_state_dict:
    # k[7:] strips a fixed 7-char prefix — presumably 'module.' from a
    # DataParallel-saved checkpoint; TODO confirm against the hosted files
    new_state_dict = {}
    for k in checkpoint_state_dict.keys():
        v = checkpoint_state_dict[k]
        new_state_dict[k[7:]] = v

    # check: keep only keys that exist in the model with matching shapes
    for k in list(new_state_dict.keys()):
        if k in model_state_dict:
            shape_model = tuple(model_state_dict[k].shape)
            shape_checkpoint = tuple(new_state_dict[k].shape)
            if shape_model != shape_checkpoint:
                new_state_dict.pop(k)
                # print(k)
        else:
            new_state_dict.pop(k)
            # print(k)

    # NOTE(review): load_state_dict defaults to strict=True, so a filtered
    # dict with missing keys would raise here — confirm the checkpoints
    # cover every model key after filtering.
    model.load_state_dict(new_state_dict)

    return model
""" + + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnet50') + + return model + + +def resnet101(pretrained=False, **kwargs): + """Constructs a 3D ResNet-101 model.""" + + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnet101') + + return model + + +# build 3D resnet +def build_resnet_3d(model_name='resnet18', pretrained=False): + if model_name == 'resnet18': + model = resnet18(pretrained=pretrained, shortcut_type='A') + feats = 512 + + elif model_name == 'resnet50': + model = resnet50(pretrained=pretrained, shortcut_type='B') + feats = 2048 + + elif model_name == 'resnet101': + model = resnet101(pretrained=pretrained, shortcut_type='b') + feats = 2048 + + return model, feats + + +if __name__ == '__main__': + import time + model, feats = build_resnet_3d(model_name='resnet18', pretrained=True) + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + model = model.to(device) + + x = torch.randn(1, 3, 16, 64, 64).to(device) + # star time + t0 = time.time() + out = model(x) + print('time', time.time() - t0) + + print(out.shape) diff --git a/models/backbone/backbone_3d/cnn_3d/resnext.py b/models/backbone/backbone_3d/cnn_3d/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..1f0c57081af9139dfb1fef32101d3e386d5989c1 --- /dev/null +++ b/models/backbone/backbone_3d/cnn_3d/resnext.py @@ -0,0 +1,286 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.hub import load_state_dict_from_url +from functools import partial + +__all__ = ['resnext50', 'resnext101', 'resnet152'] + + +model_urls = { + "resnext50": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-50-kinetics.pth", + "resnext101": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-101-kinetics.pth", + 
"resnext152": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/resnext-152-kinetics.pth" +} + + + +def downsample_basic_block(x, planes, stride): + out = F.avg_pool3d(x, kernel_size=1, stride=stride) + zero_pads = torch.Tensor( + out.size(0), planes - out.size(1), out.size(2), out.size(3), + out.size(4)).zero_() + + if isinstance(out.data, torch.cuda.FloatTensor): + zero_pads = zero_pads.cuda() + zero_pads = zero_pads.to(out.data.device) + out = Variable(torch.cat([out.data, zero_pads], dim=1)) + + return out + + +class ResNeXtBottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, cardinality, stride=1, + downsample=None): + super(ResNeXtBottleneck, self).__init__() + mid_planes = cardinality * int(planes / 32) + self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm3d(mid_planes) + self.conv2 = nn.Conv3d( + mid_planes, + mid_planes, + kernel_size=3, + stride=stride, + padding=1, + groups=cardinality, + bias=False) + self.bn2 = nn.BatchNorm3d(mid_planes) + self.conv3 = nn.Conv3d( + mid_planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm3d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNeXt(nn.Module): + + def __init__(self, + block, + layers, + shortcut_type='B', + cardinality=32): + self.inplanes = 64 + super(ResNeXt, self).__init__() + self.conv1 = nn.Conv3d( + 3, + 64, + kernel_size=7, + stride=(1, 2, 2), + padding=(3, 3, 3), + bias=False) + self.bn1 = nn.BatchNorm3d(64) + self.relu = nn.ReLU(inplace=True) + + 
self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) + + self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, + cardinality) + + self.layer2 = self._make_layer( + block, 256, layers[1], shortcut_type, cardinality, stride=2) + + self.layer3 = self._make_layer( + block, 512, layers[2], shortcut_type, cardinality, stride=2) + + self.layer4 = self._make_layer( + block, 1024, layers[3], shortcut_type, cardinality, stride=2) + + for m in self.modules(): + if isinstance(m, nn.Conv3d): + m.weight = nn.init.kaiming_normal_(m.weight, mode='fan_out') + elif isinstance(m, nn.BatchNorm3d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, + block, + planes, + blocks, + shortcut_type, + cardinality, + stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + if shortcut_type == 'A': + downsample = partial( + downsample_basic_block, + planes=planes * block.expansion, + stride=stride) + else: + downsample = nn.Sequential( + nn.Conv3d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm3d(planes * block.expansion)) + + layers = [] + layers.append( + block(self.inplanes, planes, cardinality, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, cardinality)) + + return nn.Sequential(*layers) + + def forward(self, x): + c1 = self.conv1(x) + c1 = self.bn1(c1) + c1 = self.relu(c1) + c2 = self.maxpool(c1) + + c2 = self.layer1(c2) + c3 = self.layer2(c2) + c4 = self.layer3(c3) + c5 = self.layer4(c4) + #fix + #if c5.size(2) > 1: + if c5.size(2) > 1: + c5 = torch.mean(c5, dim=2, keepdim=True) + + return c5.squeeze(2) + + +def load_weight(model, arch): + print('Loading pretrained weight ...') + url = model_urls[arch] + # check + if url is None: + print('No pretrained weight for 3D CNN: {}'.format(arch.upper())) + return model + + print('Loading 3D backbone 
pretrained weight: {}'.format(arch.upper())) + # checkpoint state dict + checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) + checkpoint_state_dict = checkpoint.pop('state_dict') + + # model state dict + model_state_dict = model.state_dict() + # reformat checkpoint_state_dict: + new_state_dict = {} + for k in checkpoint_state_dict.keys(): + v = checkpoint_state_dict[k] + new_state_dict[k[7:]] = v + + # check + for k in list(new_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(new_state_dict[k].shape) + if shape_model != shape_checkpoint: + new_state_dict.pop(k) + # print(k) + else: + new_state_dict.pop(k) + # print(k) + + model.load_state_dict(new_state_dict) + + return model + + +def resnext50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext50') + + return model + + +def resnext101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext101') + + return model + + +def resnext152(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. 
+ """ + model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) + + if pretrained: + model = load_weight(model, 'resnext152') + + return model + + +# build 3D resnet +def build_resnext_3d(model_name='resnext101', pretrained=True): + if model_name == 'resnext50': + model = resnext50(pretrained=pretrained) + feats = 2048 + + elif model_name == 'resnext101': + model = resnext101(pretrained=pretrained) + feats = 2048 + + elif model_name == 'resnext152': + model = resnext152(pretrained=pretrained) + feats = 2048 + + return model, feats + + +if __name__ == '__main__': + import time + from thop import profile + + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + + model, feats = build_resnext_3d(model_name='resnext50', pretrained=False) + model = model.to(device) + + x = torch.randn(1, 3, 32, 256, 256).to(device) + # star time + t0 = time.time() + # inference + outs = model(x) + for y in outs: + print(y.shape) + # end time + print('Inference time: {}'.format(time.time() - t0)) + + # FLOPs & Params + print('==============================') + flops, params = profile(model, inputs=(x, ), verbose=False) + print('==============================') + print('GFLOPs : {:.2f}'.format(flops / 1e9)) + print('Params : {:.2f} M'.format(params / 1e6)) diff --git a/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py b/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..9fa682318f649aac5ec679f949a9fde35a47a3cc --- /dev/null +++ b/models/backbone/backbone_3d/cnn_3d/shufflnetv2.py @@ -0,0 +1,236 @@ +'''ShuffleNetV2 in PyTorch. + +See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. 
+''' + +import torch +import torch.nn as nn +from torch.hub import load_state_dict_from_url + + +__all__ = ['resnext50', 'resnext101', 'resnet152'] + + +model_urls = { + "0.25x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_0.25x_RGB_16_best.pth", + "1.0x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_1.0x_RGB_16_best.pth", + "1.5x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_1.5x_RGB_16_best.pth", + "2.0x": "https://github.com/yjh0410/PyTorch_YOWO/releases/download/yowo-weight/kinetics_shufflenetv2_2.0x_RGB_16_best.pth", +} + + +# basic component +def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv3d(inp, oup, kernel_size=3, stride=stride, padding=(1,1,1), bias=False), + nn.BatchNorm3d(oup), + nn.ReLU(inplace=True) + ) + + +def conv_1x1x1_bn(inp, oup): + return nn.Sequential( + nn.Conv3d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup), + nn.ReLU(inplace=True) + ) + + +def channel_shuffle(x, groups): + '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' + batchsize, num_channels, depth, height, width = x.data.size() + channels_per_group = num_channels // groups + # reshape + x = x.view(batchsize, groups, + channels_per_group, depth, height, width) + #permute + x = x.permute(0,2,1,3,4,5).contiguous() + # flatten + x = x.view(batchsize, num_channels, depth, height, width) + return x + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + oup_inc = oup//2 + + if self.stride == 1: + self.banch2 = nn.Sequential( + # pw + nn.Conv3d(oup_inc, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True), + # dw + nn.Conv3d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), + nn.BatchNorm3d(oup_inc), + # pw-linear + nn.Conv3d(oup_inc, 
oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + + else: + self.banch1 = nn.Sequential( + # dw + nn.Conv3d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm3d(inp), + # pw-linear + nn.Conv3d(inp, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + self.banch2 = nn.Sequential( + # pw + nn.Conv3d(inp, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True), + # dw + nn.Conv3d(oup_inc, oup_inc, 3, stride, 1, groups=oup_inc, bias=False), + nn.BatchNorm3d(oup_inc), + # pw-linear + nn.Conv3d(oup_inc, oup_inc, 1, 1, 0, bias=False), + nn.BatchNorm3d(oup_inc), + nn.ReLU(inplace=True) + ) + + + @staticmethod + def _concat(x, out): + # concatenate along channel axis + return torch.cat((x, out), 1) + + + def forward(self, x): + if self.stride == 1: + x1 = x[:, :(x.shape[1]//2), :, :, :] + x2 = x[:, (x.shape[1]//2):, :, :, :] + out = self._concat(x1, self.banch2(x2)) + elif self.stride == 2: + out = self._concat(self.banch1(x), self.banch2(x)) + + return channel_shuffle(out, 2) + + +# ShuffleNet-v2 +class ShuffleNetV2(nn.Module): + def __init__(self, width_mult='1.0x', num_classes=600): + super(ShuffleNetV2, self).__init__() + + self.stage_repeats = [4, 8, 4] + # index 0 is invalid and should never be called. + # only used for indexing convenience. 
+ if width_mult == '0.25x': + self.stage_out_channels = [-1, 24, 32, 64, 128] + elif width_mult == '0.5x': + self.stage_out_channels = [-1, 24, 48, 96, 192] + elif width_mult == '1.0x': + self.stage_out_channels = [-1, 24, 116, 232, 464] + elif width_mult == '1.5x': + self.stage_out_channels = [-1, 24, 176, 352, 704] + elif width_mult == '2.0x': + self.stage_out_channels = [-1, 24, 224, 488, 976] + + # building first layer + input_channel = self.stage_out_channels[1] + self.conv1 = conv_bn(3, input_channel, stride=(1,2,2)) + self.maxpool = nn.MaxPool3d(kernel_size=3, stride=2, padding=1) + + self.features = [] + # building inverted residual blocks + for idxstage in range(len(self.stage_repeats)): + numrepeat = self.stage_repeats[idxstage] + output_channel = self.stage_out_channels[idxstage+2] + for i in range(numrepeat): + stride = 2 if i == 0 else 1 + self.features.append(InvertedResidual(input_channel, output_channel, stride)) + input_channel = output_channel + + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # # building last several layers + # self.conv_last = conv_1x1x1_bn(input_channel, self.stage_out_channels[-1]) + # self.avgpool = nn.AvgPool3d((2, 1, 1), stride=1) + + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + x = self.features(x) + # out = self.conv_last(out) + + if x.size(2) > 1: + x = torch.mean(x, dim=2, keepdim=True) + + return x.squeeze(2) + + +def load_weight(model, arch): + print('Loading pretrained weight ...') + url = model_urls[arch] + # check + if url is None: + print('No pretrained weight for 3D CNN: {}'.format(arch.upper())) + return model + + print('Loading 3D backbone pretrained weight: {}'.format(arch.upper())) + # checkpoint state dict + checkpoint = load_state_dict_from_url(url=url, map_location="cpu", check_hash=True) + checkpoint_state_dict = checkpoint.pop('state_dict') + + # model state dict + model_state_dict = model.state_dict() + # reformat checkpoint_state_dict: + 
new_state_dict = {} + for k in checkpoint_state_dict.keys(): + v = checkpoint_state_dict[k] + new_state_dict[k[7:]] = v + + # check + for k in list(new_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(new_state_dict[k].shape) + if shape_model != shape_checkpoint: + new_state_dict.pop(k) + print(k) + else: + new_state_dict.pop(k) + print(k) + + model.load_state_dict(new_state_dict) + + return model + + +# build 3D shufflenet_v2 +def build_shufflenetv2_3d(model_size='0.25x', pretrained=False): + model = ShuffleNetV2(model_size) + feats = model.stage_out_channels[-1] + + if pretrained: + model = load_weight(model, model_size) + + return model, feats + + +if __name__ == '__main__': + import time + model, feat = build_shufflenetv2_3d(model_size='1.0x', pretrained=True) + if torch.cuda.is_available(): + device = torch.device("cuda") + else: + device = torch.device("cpu") + model = model.to(device) + + # [B, C, T, H, W] + x = torch.randn(1, 3, 16, 64, 64).to(device) + # star time + t0 = time.time() + out = model(x) + print('time', time.time() - t0) + print(out.shape) diff --git a/models/basic/__init__.py b/models/basic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14f866567fda7c53083266f1e7c2a9a74ab9dbb6 --- /dev/null +++ b/models/basic/__init__.py @@ -0,0 +1 @@ +from .conv import Conv2d \ No newline at end of file diff --git a/models/basic/__pycache__/__init__.cpython-310.pyc b/models/basic/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32bb10569406e5bd8d32b3a5e21106d96a376d04 Binary files /dev/null and b/models/basic/__pycache__/__init__.cpython-310.pyc differ diff --git a/models/basic/__pycache__/__init__.cpython-37.pyc b/models/basic/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b8ad20d512de7feac15e88d606294367eb341bbe Binary files /dev/null 
and b/models/basic/__pycache__/__init__.cpython-37.pyc differ diff --git a/models/basic/__pycache__/conv.cpython-310.pyc b/models/basic/__pycache__/conv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d4b932f0e09bcc54e879b7f9cfb6906a22a2bb Binary files /dev/null and b/models/basic/__pycache__/conv.cpython-310.pyc differ diff --git a/models/basic/__pycache__/conv.cpython-37.pyc b/models/basic/__pycache__/conv.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5817a8431c782284d6b2878fe7814d0ec8ab033c Binary files /dev/null and b/models/basic/__pycache__/conv.cpython-37.pyc differ diff --git a/models/basic/conv.py b/models/basic/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..00e986cf98b73d96c45be7dd3ab1eafca8fb504f --- /dev/null +++ b/models/basic/conv.py @@ -0,0 +1,127 @@ +import torch.nn as nn + + +def get_activation(act_type=None): + if act_type == 'relu': + return nn.ReLU(inplace=True) + elif act_type == 'lrelu': + return nn.LeakyReLU(0.1, inplace=True) + elif act_type == 'mish': + return nn.Mish(inplace=True) + elif act_type == 'silu': + return nn.SiLU(inplace=True) + + +# 2D Conv +def get_conv2d(c1, c2, k, p, s, d, g, bias=False): + conv = nn.Conv2d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + return conv + + +def get_norm2d(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm2d(dim) + elif norm_type == 'IN': + return nn.InstanceNorm2d(dim) + + +class Conv2d(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + g=1, + act_type='', # activation + norm_type='', # normalization + depthwise=False): + super(Conv2d, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + assert c1 == c2, "In depthwise conv, the in_dim (c1) should be equal to out_dim (c2)." 
+ convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=c1, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv2d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv2d(c1, c2, k=k, p=p, s=s, d=d, g=g, bias=add_bias)) + if norm_type: + convs.append(get_norm2d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + + +# 3D Conv +def get_conv3d(c1, c2, k, p, s, d, g, bias=False): + conv = nn.Conv3d(c1, c2, k, stride=s, padding=p, dilation=d, groups=g, bias=bias) + return conv + + +def get_norm3d(norm_type, dim): + if norm_type == 'BN': + return nn.BatchNorm3d(dim) + elif norm_type == 'IN': + return nn.InstanceNorm3d(dim) + + +class Conv3d(nn.Module): + def __init__(self, + c1, # in channels + c2, # out channels + k=1, # kernel size + p=0, # padding + s=1, # padding + d=1, # dilation + g=1, + act_type='', # activation + norm_type='', # normalization + depthwise=False): + super(Conv3d, self).__init__() + convs = [] + add_bias = False if norm_type else True + if depthwise: + assert c1 == c2, "In depthwise conv, the in_dim (c1) should be equal to out_dim (c2)." 
+ convs.append(get_conv3d(c1, c2, k=k, p=p, s=s, d=d, g=c1, bias=add_bias)) + # depthwise conv + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + # pointwise conv + convs.append(get_conv3d(c1, c2, k=1, p=0, s=1, d=d, g=1, bias=add_bias)) + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + else: + convs.append(get_conv3d(c1, c2, k=k, p=p, s=s, d=d, g=g, bias=add_bias)) + if norm_type: + convs.append(get_norm3d(norm_type, c2)) + if act_type: + convs.append(get_activation(act_type)) + + self.convs = nn.Sequential(*convs) + + + def forward(self, x): + return self.convs(x) + diff --git a/models/yowo/__pycache__/build.cpython-310.pyc b/models/yowo/__pycache__/build.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..060078f0f6beaf4017c163aa10490ad3ae9cf8e5 Binary files /dev/null and b/models/yowo/__pycache__/build.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/build.cpython-37.pyc b/models/yowo/__pycache__/build.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45973bc52ea3d9e8c590465b737f767174a71077 Binary files /dev/null and b/models/yowo/__pycache__/build.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/encoder.cpython-310.pyc b/models/yowo/__pycache__/encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ca7e31bbb225f01fec41e628bdf79e14aba8db6 Binary files /dev/null and b/models/yowo/__pycache__/encoder.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/encoder.cpython-37.pyc b/models/yowo/__pycache__/encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d47e6ccec51018ba1614119490b047f3410c5f47 Binary files /dev/null and b/models/yowo/__pycache__/encoder.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/head.cpython-310.pyc 
b/models/yowo/__pycache__/head.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdfafc9073d217e1f9777be26acfba135d77dc9b Binary files /dev/null and b/models/yowo/__pycache__/head.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/head.cpython-37.pyc b/models/yowo/__pycache__/head.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c0883b5046830b2e1acd5d842dbc8e10b5566a4 Binary files /dev/null and b/models/yowo/__pycache__/head.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/loss.cpython-310.pyc b/models/yowo/__pycache__/loss.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc2651060da37eec464c571bfa58c7d53c086989 Binary files /dev/null and b/models/yowo/__pycache__/loss.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/loss.cpython-37.pyc b/models/yowo/__pycache__/loss.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff793654d79f2f98a581003f79ea34c7e9d24fa9 Binary files /dev/null and b/models/yowo/__pycache__/loss.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/matcher.cpython-310.pyc b/models/yowo/__pycache__/matcher.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c9b248c3083093b5bb13b8eb1a3a17d903eb039 Binary files /dev/null and b/models/yowo/__pycache__/matcher.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/matcher.cpython-37.pyc b/models/yowo/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8eff850613dab62b38c1d802430f18cf4957f463 Binary files /dev/null and b/models/yowo/__pycache__/matcher.cpython-37.pyc differ diff --git a/models/yowo/__pycache__/yowo.cpython-310.pyc b/models/yowo/__pycache__/yowo.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bea8714905383efa61d3db4233e6379a610dc789 Binary files /dev/null and 
b/models/yowo/__pycache__/yowo.cpython-310.pyc differ diff --git a/models/yowo/__pycache__/yowo.cpython-37.pyc b/models/yowo/__pycache__/yowo.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea61b65f2ed43987ba745df0be27a66424c92d9d Binary files /dev/null and b/models/yowo/__pycache__/yowo.cpython-37.pyc differ diff --git a/models/yowo/build.py b/models/yowo/build.py new file mode 100644 index 0000000000000000000000000000000000000000..03f9a3138cd81fc7879ed83f3173a61194fcb765 --- /dev/null +++ b/models/yowo/build.py @@ -0,0 +1,55 @@ +import torch +from .yowo import YOWO +from .loss import build_criterion + + +# build YOWO detector +def build_yowo(args, + d_cfg, + m_cfg, + device, + num_classes=3, + trainable=False, + resume=None): + print('==============================') + print('Build {} ...'.format(args.version.upper())) + + # build YOWO + model = YOWO( + cfg = m_cfg, + device = device, + num_classes = num_classes, + conf_thresh = 0.15, + nms_thresh = 0.5, + topk = 40, + trainable = trainable, + multi_hot = d_cfg['multi_hot'], + ) + + if trainable: + # Freeze backbone + if args.freeze_backbone_2d: + print('Freeze 2D Backbone ...') + for m in model.backbone_2d.parameters(): + m.requires_grad = False + if args.freeze_backbone_3d: + print('Freeze 3D Backbone ...') + for m in model.backbone_3d.parameters(): + m.requires_grad = False + + # keep training + if resume is not None: + print('keep training: ', resume) + checkpoint = torch.load(resume, map_location='cpu') + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("model") + model.load_state_dict(checkpoint_state_dict) + + # build criterion + criterion = build_criterion( + args, d_cfg['train_size'], num_classes, d_cfg['multi_hot']) + + else: + criterion = None + + return model, criterion diff --git a/models/yowo/encoder.py b/models/yowo/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..5924850e733b6ad7b748903a0bca4e2f2172e4a0 --- 
/dev/null +++ b/models/yowo/encoder.py @@ -0,0 +1,147 @@ +import torch +import torch.nn as nn +from ..basic.conv import Conv2d + + +# Channel Self Attetion Module +class CSAM(nn.Module): + """ Channel attention module """ + def __init__(self): + super(CSAM, self).__init__() + self.gamma = nn.Parameter(torch.zeros(1)) + self.softmax = nn.Softmax(dim=-1) + + + def forward(self, x): + """ + inputs : + x : input feature maps( B x C x H x W ) + returns : + out : attention value + input feature + attention: B x C x C + """ + B, C, H, W = x.size() + # query / key / value + query = x.view(B, C, -1) + key = x.view(B, C, -1).permute(0, 2, 1) + value = x.view(B, C, -1) + + # attention matrix + energy = torch.bmm(query, key) + energy_new = torch.max(energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = self.softmax(energy_new) + + # attention + out = torch.bmm(attention, value) + out = out.view(B, C, H, W) + + # output + out = self.gamma * out + x + + return out + + +# Spatial Self Attetion Module +class SSAM(nn.Module): + """ Channel attention module """ + def __init__(self): + super(SSAM, self).__init__() + self.gamma = nn.Parameter(torch.zeros(1)) + self.softmax = nn.Softmax(dim=-1) + + + def forward(self, x): + """ + inputs : + x : input feature maps( B x C x H x W ) + returns : + out : attention value + input feature + attention: B x C x C + """ + B, C, H, W = x.size() + # query / key / value + query = x.view(B, C, -1).permute(0, 2, 1) # [B, N, C] + key = x.view(B, C, -1) # [B, C, N] + value = x.view(B, C, -1).permute(0, 2, 1) # [B, N, C] + + # attention matrix + energy = torch.bmm(query, key) + energy_new = torch.max(energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = self.softmax(energy_new) + + # attention + out = torch.bmm(attention, value) + out = out.permute(0, 2, 1).contiguous().view(B, C, H, W) + + # output + out = self.gamma * out + x + + return out + + +# Channel Encoder +class ChannelEncoder(nn.Module): + def __init__(self, 
in_dim, out_dim, act_type='', norm_type=''): + super().__init__() + self.fuse_convs = nn.Sequential( + Conv2d(in_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + CSAM(), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + nn.Dropout(0.1, inplace=False), + nn.Conv2d(out_dim, out_dim, kernel_size=1) + ) + + def forward(self, x1, x2): + """ + x: [B, C, H, W] + """ + x = torch.cat([x1, x2], dim=1) + # [B, CN, H, W] -> [B, C, H, W] + x = self.fuse_convs(x) + + return x + + +# Spatial Encoder +class SpatialEncoder(nn.Module): + def __init__(self, in_dim, out_dim, act_type='', norm_type=''): + super().__init__() + self.fuse_convs = nn.Sequential( + Conv2d(in_dim, out_dim, k=1, act_type=act_type, norm_type=norm_type), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + SSAM(), + Conv2d(out_dim, out_dim, k=3, p=1, act_type=act_type, norm_type=norm_type), + nn.Dropout(0.1, inplace=False), + nn.Conv2d(out_dim, out_dim, kernel_size=1) + ) + + def forward(self, x): + """ + x: [B, C, H, W] + """ + x = self.fuse_convs(x) + + return x + + +def build_channel_encoder(cfg, in_dim, out_dim): + encoder = ChannelEncoder( + in_dim=in_dim, + out_dim=out_dim, + act_type=cfg['head_act'], + norm_type=cfg['head_norm'] + ) + + return encoder + + +def build_spatial_encoder(cfg, in_dim, out_dim): + encoder = SpatialEncoder( + in_dim=in_dim, + out_dim=out_dim, + act_type=cfg['head_act'], + norm_type=cfg['head_norm'] + ) + + return encoder diff --git a/models/yowo/head.py b/models/yowo/head.py new file mode 100644 index 0000000000000000000000000000000000000000..893a6c64b3ace6f8635fc58fbcca9dd80770018f --- /dev/null +++ b/models/yowo/head.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn + +from ..basic.conv import Conv2d + + +class DecoupledHead(nn.Module): + def __init__(self, cfg): + super().__init__() + + 
print('==============================') + print('Head: Decoupled Head') + self.num_cls_heads = cfg['num_cls_heads'] + self.num_reg_heads = cfg['num_reg_heads'] + self.act_type = cfg['head_act'] + self.norm_type = cfg['head_norm'] + self.head_dim = cfg['head_dim'] + self.depthwise = cfg['head_depthwise'] + + self.cls_head = nn.Sequential(*[ + Conv2d(self.head_dim, + self.head_dim, + k=3, p=1, s=1, + act_type=self.act_type, + norm_type=self.norm_type, + depthwise=self.depthwise) + for _ in range(self.num_cls_heads)]) + self.reg_head = nn.Sequential(*[ + Conv2d(self.head_dim, + self.head_dim, + k=3, p=1, s=1, + act_type=self.act_type, + norm_type=self.norm_type, + depthwise=self.depthwise) + for _ in range(self.num_reg_heads)]) + + + def forward(self, cls_feat, reg_feat): + cls_feats = self.cls_head(cls_feat) + reg_feats = self.reg_head(reg_feat) + + return cls_feats, reg_feats + + +def build_head(cfg): + return DecoupledHead(cfg) + \ No newline at end of file diff --git a/models/yowo/loss.py b/models/yowo/loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4833008d2d0ca719f90d846156686972c8a00236 --- /dev/null +++ b/models/yowo/loss.py @@ -0,0 +1,173 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .matcher import SimOTA +from utils.box_ops import get_ious +from utils.distributed_utils import get_world_size, is_dist_avail_and_initialized + + +class SigmoidFocalLoss(object): + def __init__(self, alpha=0.25, gamma=2.0, reduction='none'): + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + + def __call__(self, logits, targets): + p = torch.sigmoid(logits) + ce_loss = F.binary_cross_entropy_with_logits(input=logits, + target=targets, + reduction="none") + p_t = p * targets + (1.0 - p) * (1.0 - targets) + loss = ce_loss * ((1.0 - p_t) ** self.gamma) + + if self.alpha >= 0: + alpha_t = self.alpha * targets + (1.0 - self.alpha) * (1.0 - targets) + loss = alpha_t * loss + + if self.reduction == 
"mean": + loss = loss.mean() + + elif self.reduction == "sum": + loss = loss.sum() + + return loss + + +class Criterion(object): + def __init__(self, args, img_size, num_classes=3, multi_hot=False): + self.num_classes = num_classes + self.img_size = img_size + self.loss_conf_weight = args.loss_conf_weight + self.loss_cls_weight = args.loss_cls_weight + self.loss_reg_weight = args.loss_reg_weight + self.focal_loss = args.focal_loss + self.multi_hot = multi_hot + + # loss + self.obj_lossf = nn.BCEWithLogitsLoss(reduction='none') + self.cls_lossf = nn.BCEWithLogitsLoss(reduction='none') + + # matcher + self.matcher = SimOTA( + num_classes=num_classes, + center_sampling_radius=args.center_sampling_radius, + topk_candidate=args.topk_candicate + ) + + def __call__(self, outputs, targets): + """ + outputs['pred_conf']: List(Tensor) [B, M, 1] + outputs['pred_cls']: List(Tensor) [B, M, C] + outputs['pred_box']: List(Tensor) [B, M, 4] + outputs['strides']: List(Int) [8, 16, 32] output stride + targets: (List) [dict{'boxes': [...], + 'labels': [...], + 'orig_size': ...}, ...] 
+ """ + bs = outputs['pred_cls'][0].shape[0] + device = outputs['pred_cls'][0].device + fpn_strides = outputs['strides'] + anchors = outputs['anchors'] + # preds: [B, M, C] + conf_preds = torch.cat(outputs['pred_conf'], dim=1) + cls_preds = torch.cat(outputs['pred_cls'], dim=1) + box_preds = torch.cat(outputs['pred_box'], dim=1) + + # label assignment + cls_targets = [] + box_targets = [] + conf_targets = [] + fg_masks = [] + + for batch_idx in range(bs): + tgt_labels = targets[batch_idx]["labels"].to(device) + tgt_bboxes = targets[batch_idx]["boxes"].to(device) + + # denormalize tgt_bbox + tgt_bboxes *= self.img_size + + # check target + if len(tgt_labels) == 0 or tgt_bboxes.max().item() == 0.: + num_anchors = sum([ab.shape[0] for ab in anchors]) + # There is no valid gt + cls_target = conf_preds.new_zeros((0, self.num_classes)) + box_target = conf_preds.new_zeros((0, 4)) + conf_target = conf_preds.new_zeros((num_anchors, 1)) + fg_mask = conf_preds.new_zeros(num_anchors).bool() + else: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.matcher( + fpn_strides = fpn_strides, + anchors = anchors, + pred_conf = conf_preds[batch_idx], + pred_cls = cls_preds[batch_idx], + pred_box = box_preds[batch_idx], + tgt_labels = tgt_labels, + tgt_bboxes = tgt_bboxes, + ) + + conf_target = fg_mask.unsqueeze(-1) + box_target = tgt_bboxes[matched_gt_inds] + if self.multi_hot: + cls_target = gt_matched_classes.float() + else: + cls_target = F.one_hot(gt_matched_classes.long(), self.num_classes) + cls_target = cls_target * pred_ious_this_matching.unsqueeze(-1) + + cls_targets.append(cls_target) + box_targets.append(box_target) + conf_targets.append(conf_target) + fg_masks.append(fg_mask) + + cls_targets = torch.cat(cls_targets, 0) + box_targets = torch.cat(box_targets, 0) + conf_targets = torch.cat(conf_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + num_foregrounds = fg_masks.sum() + + if is_dist_avail_and_initialized(): + 
torch.distributed.all_reduce(num_foregrounds) + num_foregrounds = (num_foregrounds / get_world_size()).clamp(1.0) + + # conf loss + loss_conf = self.obj_lossf(conf_preds.view(-1, 1), conf_targets.float()) + loss_conf = loss_conf.sum() / num_foregrounds + + # cls loss + matched_cls_preds = cls_preds.view(-1, self.num_classes)[fg_masks] + loss_cls = self.cls_lossf(matched_cls_preds, cls_targets) + loss_cls = loss_cls.sum() / num_foregrounds + + # box loss + matched_box_preds = box_preds.view(-1, 4)[fg_masks] + ious = get_ious(matched_box_preds, + box_targets, + box_mode="xyxy", + iou_type='giou') + loss_box = (1.0 - ious).sum() / num_foregrounds + + # total loss + losses = self.loss_conf_weight * loss_conf + \ + self.loss_cls_weight * loss_cls + \ + self.loss_reg_weight * loss_box + + loss_dict = dict( + loss_conf = loss_conf, + loss_cls = loss_cls, + loss_box = loss_box, + losses = losses + ) + + return loss_dict + + +def build_criterion(args, img_size, num_classes, multi_hot=False): + criterion = Criterion(args, img_size, num_classes, multi_hot) + + return criterion + \ No newline at end of file diff --git a/models/yowo/matcher.py b/models/yowo/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2b6fcf4efae99be7833c6c0958f55d5f52dfaf --- /dev/null +++ b/models/yowo/matcher.py @@ -0,0 +1,201 @@ +import torch +import torch.nn.functional as F +from utils.box_ops import * + + + +# SimOTA +class SimOTA(object): + def __init__(self, num_classes, center_sampling_radius, topk_candidate): + self.num_classes = num_classes + self.center_sampling_radius = center_sampling_radius + self.topk_candidate = topk_candidate + + + @torch.no_grad() + def __call__(self, + fpn_strides, + anchors, + pred_conf, + pred_cls, + pred_box, + tgt_labels, + tgt_bboxes): + # [M,] + strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i + for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1) + # List[F, M, 2] -> [M, 2] + anchors = torch.cat(anchors, 
# SimOTA label assigner (introduced in YOLOX): each GT box is dynamically
# matched to a small set of anchors with the lowest assignment cost, where
# cost = classification BCE + 3 * IoU loss, restricted to anchors that lie
# inside the GT box and/or a fixed-radius window around its center.
class SimOTA(object):
    def __init__(self, num_classes, center_sampling_radius, topk_candidate):
        # number of classes C (used to one-hot integer labels)
        self.num_classes = num_classes
        # half-size, in units of stride, of the fixed center-sampling window
        self.center_sampling_radius = center_sampling_radius
        # number of highest-IoU candidates whose IoUs are summed to derive
        # each GT's dynamic k (number of anchors it gets assigned)
        self.topk_candidate = topk_candidate


    @torch.no_grad()
    def __call__(self,
                 fpn_strides,
                 anchors,
                 pred_conf,
                 pred_cls,
                 pred_box,
                 tgt_labels,
                 tgt_bboxes):
        """Assign targets to anchors for ONE image.

        fpn_strides: List(Int), stride per FPN level.
        anchors:     List(Tensor) per-level anchor centers [Mi, 2].
        pred_conf/pred_cls/pred_box: per-anchor predictions [M, 1/C/4].
        tgt_labels:  [N] integer labels or [N, C] multi-hot rows.
        tgt_bboxes:  [N, 4] xyxy boxes in input-image pixels.
        """
        # [M,] — stride of every anchor, concatenated over FPN levels
        strides = torch.cat([torch.ones_like(anchor_i[:, 0]) * stride_i
                             for stride_i, anchor_i in zip(fpn_strides, anchors)], dim=-1)
        # List[F, M, 2] -> [M, 2]
        anchors = torch.cat(anchors, dim=0)
        num_anchor = anchors.shape[0]
        num_gt = len(tgt_labels)

        # positive candidates: anchors inside any GT box or center window
        fg_mask, is_in_boxes_and_center = \
            self.get_in_boxes_info(
                tgt_bboxes,
                anchors,
                strides,
                num_anchor,
                num_gt
                )

        conf_preds_ = pred_conf[fg_mask]   # [Mp, 1]
        cls_preds_ = pred_cls[fg_mask]     # [Mp, C]
        box_preds_ = pred_box[fg_mask]     # [Mp, 4]
        num_in_boxes_anchor = box_preds_.shape[0]

        # [N, Mp] pairwise IoU between GT boxes and candidate predictions
        pair_wise_ious, _ = box_iou(tgt_bboxes, box_preds_)
        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

        # 1-D integer labels -> one-hot; 2-D labels are already multi-hot
        if len(tgt_labels.shape) == 1:
            gt_cls = F.one_hot(tgt_labels.long(), self.num_classes)
        elif len(tgt_labels.shape) == 2:
            gt_cls = tgt_labels

        # [N, C] -> [N, Mp, C]
        gt_cls = gt_cls.float().unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)

        # classification cost: BCE between the joint score
        # sqrt(sigmoid(cls) * sigmoid(conf)) and GT classes, forced to fp32
        # because binary_cross_entropy is unsafe under autocast
        with torch.cuda.amp.autocast(enabled=False):
            score_preds_ = torch.sqrt(
                cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
                * conf_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            )  # [N, Mp, C]
            pair_wise_cls_loss = F.binary_cross_entropy(
                score_preds_, gt_cls, reduction="none"
            ).sum(-1)  # [N, Mp]
            del score_preds_

        # total assignment cost; the huge constant effectively bans anchors
        # that are not inside BOTH the GT box and its center window
        cost = (
            pair_wise_cls_loss
            + 3.0 * pair_wise_ious_loss
            + 100000.0 * (~is_in_boxes_and_center)
        )  # [N, Mp]

        (
            num_fg,
            gt_matched_classes,       # [num_fg,]
            pred_ious_this_matching,  # [num_fg,]
            matched_gt_inds,          # [num_fg,]
        ) = self.dynamic_k_matching(
            cost,
            pair_wise_ious,
            tgt_labels,
            num_gt,
            fg_mask
            )
        del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss

        return (
            gt_matched_classes,
            fg_mask,
            pred_ious_this_matching,
            matched_gt_inds,
            num_fg,
        )


    def get_in_boxes_info(
        self,
        gt_bboxes,    # [N, 4]
        anchors,      # [M, 2]
        strides,      # [M,]
        num_anchors,  # M
        num_gt,       # N
    ):
        """Return (anchors inside any GT box OR center window [M,],
        anchors inside box AND window per GT, restricted to the former)."""
        # anchor centers
        x_centers = anchors[:, 0]
        y_centers = anchors[:, 1]

        # [M,] -> [1, M] -> [N, M]
        x_centers = x_centers.unsqueeze(0).repeat(num_gt, 1)
        y_centers = y_centers.unsqueeze(0).repeat(num_gt, 1)

        # [N,] -> [N, 1] -> [N, M]
        gt_bboxes_l = gt_bboxes[:, 0].unsqueeze(1).repeat(1, num_anchors)  # x1
        gt_bboxes_t = gt_bboxes[:, 1].unsqueeze(1).repeat(1, num_anchors)  # y1
        gt_bboxes_r = gt_bboxes[:, 2].unsqueeze(1).repeat(1, num_anchors)  # x2
        gt_bboxes_b = gt_bboxes[:, 3].unsqueeze(1).repeat(1, num_anchors)  # y2

        # signed distances from each anchor center to each GT box side;
        # all four positive <=> the center lies strictly inside the box
        b_l = x_centers - gt_bboxes_l
        b_r = gt_bboxes_r - x_centers
        b_t = y_centers - gt_bboxes_t
        b_b = gt_bboxes_b - y_centers
        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)

        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0
        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0
        # in fixed center window
        center_radius = self.center_sampling_radius

        # [N, 2] GT centers
        gt_centers = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) * 0.5

        # [1, M] — radius scales with each anchor's stride
        center_radius_ = center_radius * strides.unsqueeze(0)

        gt_bboxes_l = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) - center_radius_  # x1
        gt_bboxes_t = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) - center_radius_  # y1
        gt_bboxes_r = gt_centers[:, 0].unsqueeze(1).repeat(1, num_anchors) + center_radius_  # x2
        gt_bboxes_b = gt_centers[:, 1].unsqueeze(1).repeat(1, num_anchors) + center_radius_  # y2

        c_l = x_centers - gt_bboxes_l
        c_r = gt_bboxes_r - x_centers
        c_t = y_centers - gt_bboxes_t
        c_b = gt_bboxes_b - y_centers
        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
        is_in_centers = center_deltas.min(dim=-1).values > 0.0
        is_in_centers_all = is_in_centers.sum(dim=0) > 0

        # candidate = in at least one GT box OR one center window
        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all

        # among the candidates, which (gt, anchor) pairs satisfy BOTH tests
        is_in_boxes_and_center = (
            is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
        )
        return is_in_boxes_anchor, is_in_boxes_and_center


    def dynamic_k_matching(
        self,
        cost,
        pair_wise_ious,
        gt_classes,
        num_gt,
        fg_mask
    ):
        """Pick, for each GT, its k lowest-cost anchors (k = clamped sum of
        its top IoUs); resolve anchors claimed by several GTs by min cost.

        NOTE(review): `pos_idx` in the `del` below is unbound when num_gt == 0;
        callers appear to filter empty GT before reaching here — confirm.
        """
        # Dynamic K
        # ---------------------------------------------------------------
        matching_matrix = torch.zeros_like(cost, dtype=torch.uint8)

        ious_in_boxes_matrix = pair_wise_ious
        n_candidate_k = min(self.topk_candidate, ious_in_boxes_matrix.size(1))
        topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
        # k for each GT = sum of its top candidate IoUs, at least 1
        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
        dynamic_ks = dynamic_ks.tolist()
        for gt_idx in range(num_gt):
            _, pos_idx = torch.topk(
                cost[gt_idx], k=dynamic_ks[gt_idx], largest=False
            )
            matching_matrix[gt_idx][pos_idx] = 1

        del topk_ious, dynamic_ks, pos_idx

        # an anchor claimed by more than one GT keeps only its cheapest GT
        anchor_matching_gt = matching_matrix.sum(0)
        if (anchor_matching_gt > 1).sum() > 0:
            _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
            matching_matrix[:, anchor_matching_gt > 1] *= 0
            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1
        fg_mask_inboxes = matching_matrix.sum(0) > 0
        num_fg = fg_mask_inboxes.sum().item()

        # shrink the global fg mask to the finally-matched anchors
        fg_mask[fg_mask.clone()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
        gt_matched_classes = gt_classes[matched_gt_inds]

        pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[
            fg_mask_inboxes
        ]
        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
# You Only Watch Once (v2)
class YOWO(nn.Module):
    """Single-stage spatio-temporal action detector.

    Fuses per-level 2D key-frame features with upsampled 3D clip features,
    then predicts per-anchor confidence, class scores and box offsets.
    Training mode (`trainable=True`) returns raw predictions for the loss;
    eval mode runs full decoding + NMS via `inference`.

    Changes vs. the original block: dead commented-out multiprocessing /
    threading / timing experiments removed; the one-hot inference branch now
    iterates `range(B)` instead of the leftover loop variable
    `conf_pred.size(0)` (equal only by coincidence of batch dims).
    """

    def __init__(self,
                 cfg,
                 device,
                 num_classes=3,
                 conf_thresh=0.05,
                 nms_thresh=0.6,
                 topk=40,
                 trainable=False,
                 multi_hot=False):
        super(YOWO, self).__init__()
        self.cfg = cfg
        self.device = device
        self.stride = cfg['stride']      # FPN output strides, e.g. [8, 16, 32]
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh   # score threshold at inference
        self.nms_thresh = nms_thresh     # IoU threshold for NMS
        self.topk = topk                 # detections kept per level before NMS
        self.multi_hot = multi_hot       # multi-label (e.g. AVA) vs. one-hot

        # ------------------ Network ---------------------
        ## 2D backbone: appearance features of the key frame, one map per level
        self.backbone_2d, bk_dim_2d = build_backbone_2d(
            cfg, pretrained=cfg['pretrained_2d'] and trainable)

        ## 3D backbone: motion features of the whole clip (single map)
        self.backbone_3d, bk_dim_3d = build_backbone_3d(
            cfg, pretrained=cfg['pretrained_3d'] and trainable)

        ## per-level channel encoders fusing 2D + 3D features (cls branch)
        self.cls_channel_encoders = nn.ModuleList(
            [build_channel_encoder(cfg, bk_dim_2d[i] + bk_dim_3d, cfg['head_dim'])
             for i in range(len(cfg['stride']))])

        ## per-level channel encoders (reg branch)
        self.reg_channel_encoders = nn.ModuleList(
            [build_channel_encoder(cfg, bk_dim_2d[i] + bk_dim_3d, cfg['head_dim'])
             for i in range(len(cfg['stride']))])

        ## decoupled detection heads, one per level
        self.heads = nn.ModuleList(
            [build_head(cfg) for _ in range(len(cfg['stride']))])

        ## 1x1 prediction convolutions
        head_dim = cfg['head_dim']
        self.conf_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, 1, kernel_size=1)
             for _ in range(len(cfg['stride']))])
        self.cls_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, self.num_classes, kernel_size=1)
             for _ in range(len(cfg['stride']))])
        self.reg_preds = nn.ModuleList(
            [nn.Conv2d(head_dim, 4, kernel_size=1)
             for _ in range(len(cfg['stride']))])

        # init weights / biases
        self.init_yowo()


    def init_yowo(self):
        """Initialize BatchNorm hyper-parameters and prediction biases."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eps = 1e-3
                m.momentum = 0.03

        # Focal-loss style prior: bias conf/cls logits so the initial
        # foreground probability is ~0.01, stabilizing early training.
        init_prob = 0.01
        bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
        for conf_pred in self.conf_preds:
            b = conf_pred.bias.view(1, -1)
            b.data.fill_(bias_value.item())
            conf_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)
        for cls_pred in self.cls_preds:
            b = cls_pred.bias.view(1, -1)
            b.data.fill_(bias_value.item())
            cls_pred.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)


    def generate_anchors(self, fmp_size, stride):
        """Return anchor-point centers [H*W, 2] in input-image pixels.

        fmp_size: (List) [H, W] feature-map size at this level.
        """
        fmp_h, fmp_w = fmp_size
        anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
        # [H, W, 2] -> [HW, 2]; +0.5 centers the point inside each cell
        anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2) + 0.5
        anchor_xy *= stride
        anchors = anchor_xy.to(self.device)

        return anchors


    def decode_boxes(self, anchors, pred_reg, stride):
        """Decode regression offsets into absolute xyxy boxes.

        anchors:  [M, 2] (or broadcastable [1, M, 2]) anchor centers.
        pred_reg: [..., M, 4] = (dx, dy, log-w, log-h) offsets.
        """
        # center of bbox
        pred_ctr_xy = anchors + pred_reg[..., :2] * stride
        # size of bbox (exp keeps width/height positive)
        pred_box_wh = pred_reg[..., 2:].exp() * stride

        pred_x1y1 = pred_ctr_xy - 0.5 * pred_box_wh
        pred_x2y2 = pred_ctr_xy + 0.5 * pred_box_wh
        pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)

        return pred_box


    def post_process_one_hot(self, conf_preds, cls_preds, reg_preds, anchors):
        """Single-label post-processing for one sample over all levels.

        Inputs are per-level lists: conf [HW, 1], cls [HW, C], reg [HW, 4].
        Returns (scores, labels, bboxes) numpy arrays after class-aware NMS.
        """
        all_scores = []
        all_labels = []
        all_bboxes = []

        for level, (conf_pred_i, cls_pred_i, reg_pred_i, anchors_i) in enumerate(
                zip(conf_preds, cls_preds, reg_preds, anchors)):
            # joint score sqrt(conf * cls), flattened to (H*W*C,)
            scores_i = (torch.sqrt(conf_pred_i.sigmoid() * cls_pred_i.sigmoid())).flatten()

            # keep top-k scoring candidates only
            num_topk = min(self.topk, reg_pred_i.size(0))
            # torch.sort is actually faster than .topk (at least on GPUs)
            predicted_prob, topk_idxs = scores_i.sort(descending=True)
            topk_scores = predicted_prob[:num_topk]
            topk_idxs = topk_idxs[:num_topk]

            # drop proposals below the confidence threshold
            keep_idxs = topk_scores > self.conf_thresh
            scores = topk_scores[keep_idxs]
            topk_idxs = topk_idxs[keep_idxs]

            # flat index -> (anchor index, class label)
            anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')
            labels = topk_idxs % self.num_classes

            reg_pred_i = reg_pred_i[anchor_idxs]
            anchors_i = anchors_i[anchor_idxs]

            # decode box: [M, 4]
            bboxes = self.decode_boxes(anchors_i, reg_pred_i, self.stride[level])

            all_scores.append(scores)
            all_labels.append(labels)
            all_bboxes.append(bboxes)

        scores = torch.cat(all_scores)
        labels = torch.cat(all_labels)
        bboxes = torch.cat(all_bboxes)

        # to cpu
        scores = scores.cpu().numpy()
        labels = labels.cpu().numpy()
        bboxes = bboxes.cpu().numpy()

        # class-aware NMS
        scores, labels, bboxes = multiclass_nms(
            scores, labels, bboxes, self.nms_thresh, self.num_classes, False)

        return scores, labels, bboxes


    def post_process_multi_hot(self, conf_preds, cls_preds, reg_preds, anchors):
        """Multi-label post-processing for one sample over all levels.

        Returns an [M, 4 + 1 + C] ndarray: box, confidence, per-class scores.
        """
        all_conf_preds = []
        all_cls_preds = []
        all_box_preds = []
        for level, (conf_pred_i, cls_pred_i, reg_pred_i, anchors_i) in enumerate(
                zip(conf_preds, cls_preds, reg_preds, anchors)):
            # decode box
            box_pred_i = self.decode_boxes(anchors_i, reg_pred_i, self.stride[level])

            conf_pred_i = torch.sigmoid(conf_pred_i.squeeze(-1))  # [M,]
            cls_pred_i = torch.sigmoid(cls_pred_i)                # [M, C]

            # keep the top-k most confident anchors, then threshold
            topk_conf_pred_i, topk_inds = torch.topk(conf_pred_i, self.topk)
            topk_cls_pred_i = cls_pred_i[topk_inds]
            topk_box_pred_i = box_pred_i[topk_inds]

            keep = topk_conf_pred_i.gt(self.conf_thresh)
            all_conf_preds.append(topk_conf_pred_i[keep])
            all_cls_preds.append(topk_cls_pred_i[keep])
            all_box_preds.append(topk_box_pred_i[keep])

        # concatenate across levels
        conf_preds = torch.cat(all_conf_preds, dim=0)  # [M,]
        cls_preds = torch.cat(all_cls_preds, dim=0)    # [M, C]
        box_preds = torch.cat(all_box_preds, dim=0)    # [M, 4]

        # to cpu - numpy
        scores = conf_preds.cpu().numpy()
        labels = cls_preds.cpu().numpy()
        bboxes = box_preds.cpu().numpy()

        # nms
        scores, labels, bboxes = multiclass_nms(
            scores, labels, bboxes, self.nms_thresh, self.num_classes, True)

        # [M, 5 + C]
        out_boxes = np.concatenate([bboxes, scores[..., None], labels], axis=-1)

        return out_boxes


    @torch.no_grad()
    def inference(self, video_clips):
        """Detect actions in a batch of clips.

        video_clips: (Tensor) [B, 3, T, H, W].
        Returns per-image detections (format depends on `multi_hot`), with
        boxes normalized to [0, 1] by the longer image side.
        """
        B, _, _, img_h, img_w = video_clips.shape

        # key frame = last frame of the clip
        key_frame = video_clips[:, :, -1, :, :]

        # backbones
        feat_3d = self.backbone_3d(video_clips)
        cls_feats, reg_feats = self.backbone_2d(key_frame)

        # non-shared heads
        all_conf_preds = []
        all_cls_preds = []
        all_reg_preds = []
        all_anchors = []
        for level, (cls_feat, reg_feat) in enumerate(zip(cls_feats, reg_feats)):
            # upsample 3D features to this level's spatial resolution
            feat_3d_up = F.interpolate(feat_3d, scale_factor=2 ** (2 - level))

            # fuse 2D + 3D features
            cls_feat = self.cls_channel_encoders[level](cls_feat, feat_3d_up)
            reg_feat = self.reg_channel_encoders[level](reg_feat, feat_3d_up)

            # head
            cls_feat, reg_feat = self.heads[level](cls_feat, reg_feat)

            # pred
            conf_pred = self.conf_preds[level](reg_feat)
            cls_pred = self.cls_preds[level](cls_feat)
            reg_pred = self.reg_preds[level](reg_feat)

            # generate anchors for this level's feature-map size
            fmp_size = conf_pred.shape[-2:]
            anchors = self.generate_anchors(fmp_size, self.stride[level])

            # [B, C, H, W] -> [B, H, W, C] -> [B, M, C], M = H*W
            conf_pred = conf_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 1)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, self.num_classes)
            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(B, -1, 4)

            all_conf_preds.append(conf_pred)
            all_cls_preds.append(cls_pred)
            all_reg_preds.append(reg_pred)
            all_anchors.append(anchors)

        # per-image post-processing
        if self.multi_hot:
            batch_bboxes = []
            for batch_idx in range(B):
                # [B, M, C] -> [M, C] per level
                cur_conf_preds = [p[batch_idx] for p in all_conf_preds]
                cur_cls_preds = [p[batch_idx] for p in all_cls_preds]
                cur_reg_preds = [p[batch_idx] for p in all_reg_preds]

                out_boxes = self.post_process_multi_hot(
                    cur_conf_preds, cur_cls_preds, cur_reg_preds, all_anchors)

                # normalize boxes to [0, 1] by the longer image side
                out_boxes[..., :4] /= max(img_h, img_w)
                out_boxes[..., :4] = out_boxes[..., :4].clip(0., 1.)

                batch_bboxes.append(out_boxes)

            return batch_bboxes
        else:
            batch_scores = []
            batch_labels = []
            batch_bboxes = []
            # fixed: iterate over B (was range(conf_pred.size(0)), a leftover
            # per-level loop variable that only matched B by coincidence)
            for batch_idx in range(B):
                # [B, M, C] -> [M, C] per level
                cur_conf_preds = [p[batch_idx] for p in all_conf_preds]
                cur_cls_preds = [p[batch_idx] for p in all_cls_preds]
                cur_reg_preds = [p[batch_idx] for p in all_reg_preds]

                scores, labels, bboxes = self.post_process_one_hot(
                    cur_conf_preds, cur_cls_preds, cur_reg_preds, all_anchors)

                # normalize bbox
                bboxes /= max(img_h, img_w)
                bboxes = bboxes.clip(0., 1.)

                batch_scores.append(scores)
                batch_labels.append(labels)
                batch_bboxes.append(bboxes)

            return batch_scores, batch_labels, batch_bboxes


    def forward(self, video_clips):
        """
        Input:
            video_clips: (Tensor) -> [B, 3, T, H, W].
        return (training mode):
            outputs: (Dict) -> {
                'pred_conf': List(Tensor) [B, M, 1]
                'pred_cls':  List(Tensor) [B, M, C]
                'pred_box':  List(Tensor) [B, M, 4]
                'anchors':   List(Tensor) [M, 2]
                'strides':   List(Int)
            }
        In eval mode this delegates to `inference`.
        """
        if not self.trainable:
            return self.inference(video_clips)

        # key frame
        key_frame = video_clips[:, :, -1, :, :]

        # backbones
        feat_3d = self.backbone_3d(video_clips)
        cls_feats, reg_feats = self.backbone_2d(key_frame)

        # non-shared heads
        all_conf_preds = []
        all_cls_preds = []
        all_box_preds = []
        all_anchors = []
        for level, (cls_feat, reg_feat) in enumerate(zip(cls_feats, reg_feats)):
            # upsample 3D features to this level's spatial resolution
            feat_3d_up = F.interpolate(feat_3d, scale_factor=2 ** (2 - level))

            # fuse 2D + 3D features
            cls_feat = self.cls_channel_encoders[level](cls_feat, feat_3d_up)
            reg_feat = self.reg_channel_encoders[level](reg_feat, feat_3d_up)

            # head
            cls_feat, reg_feat = self.heads[level](cls_feat, reg_feat)

            # pred
            conf_pred = self.conf_preds[level](reg_feat)
            cls_pred = self.cls_preds[level](cls_feat)
            reg_pred = self.reg_preds[level](reg_feat)

            # generate anchors
            fmp_size = conf_pred.shape[-2:]
            anchors = self.generate_anchors(fmp_size, self.stride[level])

            # [B, C, H, W] -> [B, H, W, C] -> [B, M, C]
            conf_pred = conf_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
            cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)
            reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().flatten(1, 2)

            # decode box: [B, M, 4]
            box_pred = self.decode_boxes(anchors, reg_pred, self.stride[level])

            all_conf_preds.append(conf_pred)
            all_cls_preds.append(cls_pred)
            all_box_preds.append(box_pred)
            all_anchors.append(anchors)

        # output dict
        outputs = {"pred_conf": all_conf_preds,  # List(Tensor) [B, M, 1]
                   "pred_cls": all_cls_preds,    # List(Tensor) [B, M, C]
                   "pred_box": all_box_preds,    # List(Tensor) [B, M, 4]
                   "anchors": all_anchors,       # List(Tensor) [M, 2]
                   "strides": self.stride}       # List(Int)

        return outputs
def get_ious(bboxes1,
             bboxes2,
             box_mode="xyxy",
             iou_type="iou"):
    """Compute element-wise IoU/GIoU between two aligned box tensors.

    Args:
        bboxes1 (Tensor): [..., 4] boxes.
        bboxes2 (Tensor): [..., 4] boxes, same shape as `bboxes1`.
        box_mode (str): 'xyxy' (x1, y1, x2, y2) or 'ltrb' (distances from an
            anchor point to the left/top/right/bottom sides, converted to
            xyxy internally).
        iou_type (str): 'iou' or 'giou'.

    Returns:
        Tensor: [...] IoU (or GIoU) per box pair.

    Raises:
        NotImplementedError: on an unsupported `box_mode` or `iou_type`.
    """
    if box_mode == "ltrb":
        # (l, t, r, b) side distances -> (x1, y1, x2, y2) around the origin
        bboxes1 = torch.cat((-bboxes1[..., :2], bboxes1[..., 2:]), dim=-1)
        bboxes2 = torch.cat((-bboxes2[..., :2], bboxes2[..., 2:]), dim=-1)
    elif box_mode != "xyxy":
        raise NotImplementedError

    # epsilon keeps the union/enclosing-area denominators non-zero
    eps = torch.finfo(torch.float32).eps

    bboxes1_area = (bboxes1[..., 2] - bboxes1[..., 0]).clamp_(min=0) \
        * (bboxes1[..., 3] - bboxes1[..., 1]).clamp_(min=0)
    bboxes2_area = (bboxes2[..., 2] - bboxes2[..., 0]).clamp_(min=0) \
        * (bboxes2[..., 3] - bboxes2[..., 1]).clamp_(min=0)

    w_intersect = (torch.min(bboxes1[..., 2], bboxes2[..., 2])
                   - torch.max(bboxes1[..., 0], bboxes2[..., 0])).clamp_(min=0)
    h_intersect = (torch.min(bboxes1[..., 3], bboxes2[..., 3])
                   - torch.max(bboxes1[..., 1], bboxes2[..., 1])).clamp_(min=0)

    area_intersect = w_intersect * h_intersect
    area_union = bboxes2_area + bboxes1_area - area_intersect
    ious = area_intersect / area_union.clamp(min=eps)

    if iou_type == "iou":
        return ious
    elif iou_type == "giou":
        # GIoU = IoU - (enclosing_area - union) / enclosing_area
        g_w_intersect = torch.max(bboxes1[..., 2], bboxes2[..., 2]) \
            - torch.min(bboxes1[..., 0], bboxes2[..., 0])
        g_h_intersect = torch.max(bboxes1[..., 3], bboxes2[..., 3]) \
            - torch.min(bboxes1[..., 1], bboxes2[..., 1])
        ac_union = g_w_intersect * g_h_intersect
        gious = ious - (ac_union - area_union) / ac_union.clamp(min=eps)
        return gious
    else:
        raise NotImplementedError


# modified from torchvision to also return the union
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two xyxy box sets.

    Args:
        boxes1 (Tensor): [N, 4], boxes2 (Tensor): [M, 4].
    Returns:
        (iou [N, M], union [N, M]).
    """
    # (x2 - x1) * (y2 - y1); inlined so this helper does not require
    # torchvision's box_area for a two-line computation
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union
def rescale_bboxes(bboxes, orig_size):
    """Map normalized [0, 1] xyxy boxes to absolute pixel coordinates.

    Mutates `bboxes` in place (x columns scaled by the original width,
    y columns by the original height, each clipped to the image) and
    returns it.
    """
    width, height = orig_size[0], orig_size[1]
    # axis 0 -> x columns (0, 2) scaled by width; axis 1 -> y columns (1, 3)
    for axis, extent in ((0, width), (1, height)):
        cols = [axis, axis + 2]
        bboxes[..., cols] = np.clip(
            bboxes[..., cols] * extent, a_min=0., a_max=extent
        )
    return bboxes
def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)
    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        # single-process run: nothing to gather
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda"))
    if local_size != max_size:
        padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    # deserialize: trim each rank's padding before unpickling
    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))

    return data_list


def reduce_dict(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        # nothing to reduce in single-process mode
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict


def get_sha():
    """Return a one-line description of the git state of this checkout
    (commit sha, dirty/clean, branch); 'N/A' fields if git is unavailable."""
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()
    sha = 'N/A'
    diff = "clean"
    branch = 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        # refresh the index so diff-index reports accurately
        subprocess.check_output(['git', 'diff'], cwd=cwd)
        diff = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if diff else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        # best-effort: fall back to the N/A defaults outside a git repo
        pass
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
    return message


def setup_for_distributed(is_master):
    """
    This function disables printing when not in master process
    """
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        # `force=True` lets any rank print when explicitly requested
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    # True only when torch.distributed is both compiled in and initialized
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    # 1 in single-process (non-distributed) runs
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    # 0 in single-process (non-distributed) runs
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) diff --git a/utils/misc.py b/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..559da6196b5f87ae334e89934819d4e1dabb9fd4 --- /dev/null +++ b/utils/misc.py @@ -0,0 +1,191 @@ +import os + +import torch +import torch.nn as nn + +from dataset.ucf_jhmdb import UCF_JHMDB_Dataset +from dataset.ava import AVA_Dataset +from dataset.transforms import Augmentation, BaseTransform + +from evaluator.ucf_jhmdb_evaluator import UCF_JHMDB_Evaluator +from evaluator.ava_evaluator import AVA_Evaluator + + +def build_dataset(d_cfg, args, is_train=False): + """ + d_cfg: dataset config + """ + # transform + augmentation = Augmentation( + img_size=d_cfg['train_size'], + jitter=d_cfg['jitter'], + hue=d_cfg['hue'], + saturation=d_cfg['saturation'], + exposure=d_cfg['exposure'] + ) + basetransform = BaseTransform( + img_size=d_cfg['test_size'], + ) + + # dataset + if args.dataset in ['ucf24', 'jhmdb21']: + data_dir = os.path.join(args.root, 'ucf24') + + # dataset + dataset = UCF_JHMDB_Dataset( + 
data_root=data_dir, + dataset=args.dataset, + img_size=d_cfg['train_size'], + transform=augmentation, + is_train=is_train, + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'] + ) + num_classes = dataset.num_classes + + # evaluator + evaluator = UCF_JHMDB_Evaluator( + data_root=data_dir, + dataset=args.dataset, + model_name=args.version, + metric='fmap', + img_size=d_cfg['test_size'], + len_clip=args.len_clip, + batch_size=args.test_batch_size, + conf_thresh=0.01, + iou_thresh=0.5, + gt_folder=d_cfg['gt_folder'], + save_path='./evaluator/eval_results/', + transform=basetransform, + collate_fn=CollateFunc() + ) + + elif args.dataset == 'ava_v2.2': + #data_dir = os.path.join(args.root, 'AVA_Dataset') + data_dir = args.root + + # dataset + dataset = AVA_Dataset( + cfg=d_cfg, + data_root=data_dir, + is_train=True, + img_size=d_cfg['train_size'], + transform=augmentation, + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'] + ) + num_classes = 3 + + # evaluator + evaluator = AVA_Evaluator( + d_cfg=d_cfg, + data_root=data_dir, + img_size=d_cfg['test_size'], + len_clip=args.len_clip, + sampling_rate=d_cfg['sampling_rate'], + batch_size=args.test_batch_size, + transform=basetransform, + collate_fn=CollateFunc(), + full_test_on_val=False, + version='v2.2' + ) + + else: + print('unknow dataset !! 
Only support ucf24 & jhmdb21 & ava_v2.2 !!') + exit(0) + + print('==============================') + print('Training model on:', args.dataset) + print('The dataset size:', len(dataset)) + + if not args.eval: + # no evaluator during training stage + evaluator = None + + return dataset, evaluator, num_classes + + +def build_dataloader(args, dataset, batch_size, collate_fn=None, is_train=False): + if is_train: + # distributed + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + sampler = torch.utils.data.RandomSampler(dataset) + + batch_sampler_train = torch.utils.data.BatchSampler(sampler, + batch_size, + drop_last=True) + # train dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler_train, + collate_fn=collate_fn, + num_workers=args.num_workers, + pin_memory=True + ) + else: + # test dataloader + dataloader = torch.utils.data.DataLoader( + dataset=dataset, + shuffle=False, + collate_fn=collate_fn, + num_workers=args.num_workers, + drop_last=False, + pin_memory=True + ) + + return dataloader + + +def load_weight(model, path_to_ckpt=None): + if path_to_ckpt is None: + print('No trained weight ..') + return model + + checkpoint = torch.load(path_to_ckpt, map_location='cpu') + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("model") + # model state dict + model_state_dict = model[0].state_dict() + # check + for k in list(checkpoint_state_dict.keys()): + if k in model_state_dict: + shape_model = tuple(model_state_dict[k].shape) + shape_checkpoint = tuple(checkpoint_state_dict[k].shape) + if shape_model != shape_checkpoint: + checkpoint_state_dict.pop(k) + else: + checkpoint_state_dict.pop(k) + print(k) + + model[0].load_state_dict(checkpoint_state_dict) + print('Finished loading model!') + + return model[0] + + +def is_parallel(model): + # Returns True if model is of type DP or DDP + return type(model) in (nn.parallel.DataParallel, 
nn.parallel.DistributedDataParallel) + + +class CollateFunc(object): + def __call__(self, batch): + batch_frame_id = [] + batch_key_target = [] + batch_video_clips = [] + + for sample in batch: + key_frame_id = sample[0] + video_clip = sample[1] + key_target = sample[2] + + batch_frame_id.append(key_frame_id) + batch_video_clips.append(video_clip) + batch_key_target.append(key_target) + + # List [B, 3, T, H, W] -> [B, 3, T, H, W] + batch_video_clips = torch.stack(batch_video_clips) + + return batch_frame_id, batch_video_clips, batch_key_target diff --git a/utils/nms.py b/utils/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..242f2d6750a737b1839c667d81218017dd5507a0 --- /dev/null +++ b/utils/nms.py @@ -0,0 +1,144 @@ +import numpy as np + + +def nms(bboxes, scores, nms_thresh): + """"Pure Python NMS.""" + x1 = bboxes[:, 0] #xmin + y1 = bboxes[:, 1] #ymin + x2 = bboxes[:, 2] #xmax + y2 = bboxes[:, 3] #ymax + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + # compute iou + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(1e-10, xx2 - xx1) + h = np.maximum(1e-10, yy2 - yy1) + inter = w * h + + iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) + #reserve all the boundingbox whose ovr less than thresh + inds = np.where(iou <= nms_thresh)[0] + order = order[inds + 1] + + return keep + + +def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh): + # nms + keep = nms(bboxes, scores, nms_thresh) + + scores = scores[keep] + labels = labels[keep] + bboxes = bboxes[keep] + + return scores, labels, bboxes + + +def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes): + # nms + keep = np.zeros(len(bboxes), dtype=np.int32) + for i in range(num_classes): + inds = np.where(labels == i)[0] + if 
len(inds) == 0: + continue + c_bboxes = bboxes[inds] + c_scores = scores[inds] + c_keep = nms(c_bboxes, c_scores, nms_thresh) + keep[inds[c_keep]] = 1 + + keep = np.where(keep > 0) + scores = scores[keep] + labels = labels[keep] + bboxes = bboxes[keep] + + return scores, labels, bboxes + + +def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False): + if class_agnostic: + return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh) + else: + return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes) + + +##--------------------------------------------torch--------------------------------------- +# import torch + +# def nms(bboxes, scores, nms_thresh): +# """"Pure PyTorch NMS.""" +# x1 = bboxes[:, 0] #xmin +# y1 = bboxes[:, 1] #ymin +# x2 = bboxes[:, 2] #xmax +# y2 = bboxes[:, 3] #ymax + +# areas = (x2 - x1) * (y2 - y1) +# _, order = scores.sort(descending=True) + +# keep = [] +# while order.numel() > 0: +# i = order[0] +# keep.append(i.item()) +# # compute iou +# xx1 = torch.max(x1[i], x1[order[1:]]) +# yy1 = torch.max(y1[i], y1[order[1:]]) +# xx2 = torch.min(x2[i], x2[order[1:]]) +# yy2 = torch.min(y2[i], y2[order[1:]]) + +# w = torch.clamp(xx2 - xx1, min=1e-10) +# h = torch.clamp(yy2 - yy1, min=1e-10) +# inter = w * h + +# iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-14) +# #reserve all the boundingbox whose ovr less than thresh +# inds = torch.where(iou <= nms_thresh)[0] +# order = order[inds + 1] + +# return keep + + +# def multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh): +# # nms +# keep = nms(bboxes, scores, nms_thresh) + +# scores = scores[keep] +# labels = labels[keep] +# bboxes = bboxes[keep] + +# return scores, labels, bboxes + + +# def multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes): +# # nms +# keep = torch.zeros(len(bboxes), dtype=torch.int32) +# for i in range(num_classes): +# inds = torch.where(labels == i)[0] +# if len(inds) == 
0: +# continue +# c_bboxes = bboxes[inds] +# c_scores = scores[inds] +# c_keep = nms(c_bboxes, c_scores, nms_thresh) +# keep[inds[c_keep]] = 1 + +# keep = torch.where(keep > 0) +# scores = scores[keep] +# labels = labels[keep] +# bboxes = bboxes[keep] + +# return scores, labels, bboxes + + +# def multiclass_nms(scores, labels, bboxes, nms_thresh, num_classes, class_agnostic=False): +# if class_agnostic: +# return multiclass_nms_class_agnostic(scores, labels, bboxes, nms_thresh) +# else: +# return multiclass_nms_class_aware(scores, labels, bboxes, nms_thresh, num_classes) diff --git a/utils/solver/__init__.py b/utils/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/utils/solver/__pycache__/__init__.cpython-310.pyc b/utils/solver/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ce213fab408458d1a41198f346575986d410cd9 Binary files /dev/null and b/utils/solver/__pycache__/__init__.cpython-310.pyc differ diff --git a/utils/solver/__pycache__/optimizer.cpython-310.pyc b/utils/solver/__pycache__/optimizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0a43a663ccad03098f2b8907757a0518bb14f2f Binary files /dev/null and b/utils/solver/__pycache__/optimizer.cpython-310.pyc differ diff --git a/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc b/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2e0cef77cbed1a295c2654967ac719e56107fda4 Binary files /dev/null and b/utils/solver/__pycache__/warmup_schedule.cpython-310.pyc differ diff --git a/utils/solver/optimizer.py b/utils/solver/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..22c26001188c6fee0b9157cb0b8afa7c6b729073 --- /dev/null +++ b/utils/solver/optimizer.py @@ -0,0 +1,40 @@ +import torch +from torch import optim + + +def 
build_optimizer(cfg, model, base_lr=0.0, resume=None): + print('==============================') + print('Optimizer: {}'.format(cfg['optimizer'])) + print('--momentum: {}'.format(cfg['momentum'])) + print('--weight_decay: {}'.format(cfg['weight_decay'])) + + if cfg['optimizer'] == 'sgd': + optimizer = optim.SGD( + model.parameters(), + lr=base_lr, + momentum=cfg['momentum'], + weight_decay=cfg['weight_decay']) + + elif cfg['optimizer'] == 'adam': + optimizer = optim.Adam( + model.parameters(), + lr=base_lr, + eight_decay=cfg['weight_decay']) + + elif cfg['optimizer'] == 'adamw': + optimizer = optim.AdamW( + model.parameters(), + lr=base_lr, + weight_decay=cfg['weight_decay']) + + start_epoch = 0 + if resume is not None: + print('keep training: ', resume) + checkpoint = torch.load(resume) + # checkpoint state dict + checkpoint_state_dict = checkpoint.pop("optimizer") + optimizer.load_state_dict(checkpoint_state_dict) + start_epoch = checkpoint.pop("epoch") + + + return optimizer, start_epoch diff --git a/utils/solver/warmup_schedule.py b/utils/solver/warmup_schedule.py new file mode 100644 index 0000000000000000000000000000000000000000..c88398a840fda64c85be014c1531b2599316257b --- /dev/null +++ b/utils/solver/warmup_schedule.py @@ -0,0 +1,58 @@ + +# Build warmup scheduler + + +def build_warmup(cfg, base_lr=0.01): + print('==============================') + print('WarmUpScheduler: {}'.format(cfg['warmup'])) + print('--base_lr: {}'.format(base_lr)) + print('--warmup_factor: {}'.format(cfg['warmup_factor'])) + print('--wp_iter: {}'.format(cfg['wp_iter'])) + + warmup_scheduler = WarmUpScheduler( + name=cfg['warmup'], + base_lr=base_lr, + wp_iter=cfg['wp_iter'], + warmup_factor=cfg['warmup_factor'] + ) + + return warmup_scheduler + + +# Basic Warmup Scheduler +class WarmUpScheduler(object): + def __init__(self, + name='linear', + base_lr=0.01, + wp_iter=500, + warmup_factor=0.00066667): + self.name = name + self.base_lr = base_lr + self.wp_iter = wp_iter + 
self.warmup_factor = warmup_factor + + + def set_lr(self, optimizer, lr, base_lr): + for param_group in optimizer.param_groups: + init_lr = param_group['initial_lr'] + ratio = init_lr / base_lr + param_group['lr'] = lr * ratio + + + def warmup(self, iter, optimizer): + # warmup + assert iter < self.wp_iter + if self.name == 'exp': + tmp_lr = self.base_lr * pow(iter / self.wp_iter, 4) + self.set_lr(optimizer, tmp_lr, self.base_lr) + + elif self.name == 'linear': + alpha = iter / self.wp_iter + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + tmp_lr = self.base_lr * warmup_factor + self.set_lr(optimizer, tmp_lr, self.base_lr) + + + def __call__(self, iter, optimizer): + self.warmup(iter, optimizer) + \ No newline at end of file diff --git a/utils/vis_tools.py b/utils/vis_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..f5c48b614125ae081dde0f6c1804825f60d5478e --- /dev/null +++ b/utils/vis_tools.py @@ -0,0 +1,87 @@ +import cv2 +import numpy as np + + +def vis_targets(video_clips, targets): + """ + video_clips: (Tensor) -> [B, C, T, H, W] + targets: List[Dict] -> [{'boxes': (Tensor) [N, 4], + 'labels': (Tensor) [N,]}, + ...], + """ + batch_size = len(video_clips) + + for batch_index in range(batch_size): + video_clip = video_clips[batch_index] + target = targets[batch_index] + + key_frame = video_clip[:, :, -1, :, :] + tgt_bboxes = target['boxes'] + tgt_labels = target['labels'] + + key_frame = convert_tensor_to_cv2img(key_frame) + width, height = key_frame.shape[:-1] + + for box, label in zip(tgt_bboxes, tgt_labels): + x1, y1, x2, y2 = box + label = int(label) + + x1 *= width + y1 *= height + x2 *= width + y2 *= height + + # draw bbox + cv2.rectangle(key_frame, + (int(x1), int(y1)), + (int(x2), int(y2)), + (255, 0, 0), 2) + cv2.imshow('groundtruth', key_frame) + cv2.waitKey(0) + + +def convert_tensor_to_cv2img(img_tensor): + """ convert torch.Tensor to cv2 image """ + # to numpy + img_tensor = img_tensor.permute(1, 2, 
0).cpu().numpy() + # to cv2 img Mat + cv2_img = img_tensor.astype(np.uint8) + # to BGR + cv2_img = cv2_img.copy()[..., (2, 1, 0)] + + return cv2_img + + +def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): + x1, y1, x2, y2 = bbox + x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) + t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] + # plot bbox + cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) + + if label is not None: + # plot title bbox + cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) + # put the test on the title bbox + cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) + + return img + + +def vis_detection(frame, scores, labels, bboxes, vis_thresh, class_names, class_colors): + ts = 0.4 + for i, bbox in enumerate(bboxes): + if scores[i] > vis_thresh: + label = int(labels[i]) + cls_color = class_colors[label] + + if len(class_names) > 1: + mess = '%s: %.2f' % (class_names[label], scores[i]) + else: + cls_color = [255, 0, 0] + mess = None + # visualize bbox + frame = plot_bbox_labels(frame, bbox, mess, cls_color, text_scale=ts) + + return frame + \ No newline at end of file diff --git a/utils/weight_init.py b/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..5fe88bee46036f3de10a4b3427e3095751ec6ab9 --- /dev/null +++ b/utils/weight_init.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+import math + +import torch.nn as nn + + +def constant_init(module, val, bias=0): + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def uniform_init(module, a=0, b=1, bias=0): + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.kaiming_uniform_(module.weight, + a=a, + mode=mode, + nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_(module.weight, + a=a, + mode=mode, + nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init(module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def c2_xavier_fill(module: nn.Module): + """ + Initialize `module.weight` using the "XavierFill" implemented in Caffe2. + Also initializes `module.bias` to 0. + + Args: + module (torch.nn.Module): module to initialize. 
+ """ + # Caffe2 implementation of XavierFill in fact + # corresponds to kaiming_uniform_ in PyTorch + nn.init.kaiming_uniform_(module.weight, a=1) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + +def c2_msra_fill(module: nn.Module): + """ + Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. + Also initializes `module.bias` to 0. + + Args: + module (torch.nn.Module): module to initialize. + """ + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + if module.bias is not None: + nn.init.constant_(module.bias, 0) + + +def init_weights(m: nn.Module, zero_init_final_gamma=False): + """Performs ResNet-style weight initialization.""" + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + zero_init_gamma = ( + hasattr(m, "final_bn") and m.final_bn and zero_init_final_gamma + ) + m.weight.data.fill_(0.0 if zero_init_gamma else 1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_()