Haisong Liu committed on
Commit
102ac67
·
unverified ·
1 Parent(s): e610aaf

Release model: vit_eva02_1600x640_trainval_future (#46)

Browse files
README.md CHANGED
@@ -28,6 +28,7 @@ This is the official PyTorch implementation for our ICCV 2023 paper:
28
  | [r50_nuimg_704x256_400q_36ep](configs/r50_nuimg_704x256_400q_36ep.py) | [nuImg](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth) | 28h (8x2080Ti) | 55.8 | - | 23.5 | [gdrive](https://drive.google.com/file/d/1C_Vn3iiSnSW1Dw1r0DkjJMwvHC5Y3zTN/view) |
29
  | [r101_nuimg_1408x512](configs/r101_nuimg_1408x512.py) | [nuImg](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth) | 2d8h (8xV100) | 59.2 | - | 6.5 | [gdrive](https://drive.google.com/file/d/1dKu5cR1fuo-O0ynyBh-RCPtHrgut29mN/view) |
30
  | [vov99_dd3d_1600x640_trainval_future](configs/vov99_dd3d_1600x640_trainval_future.py) | [DD3D](https://drive.google.com/file/d/1gQkhWERCzAosBwG5bh2BKkt1k0TJZt-A/view) | 4d1h (8xA100) | 84.9 | 67.5 | - | [gdrive](https://drive.google.com/file/d/1TL0QoCiWD5uq8PCAWWE3A-g73ibK1R0S/view) |
 
31
 
32
  * We use `r50_nuimg_704x256` for ablation studies and `r50_nuimg_704x256_400q_36ep` for comparison with others.
33
  * We recommend using `r50_nuimg_704x256` to validate new ideas since it trains faster and the result is more stable.
 
28
  | [r50_nuimg_704x256_400q_36ep](configs/r50_nuimg_704x256_400q_36ep.py) | [nuImg](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth) | 28h (8x2080Ti) | 55.8 | - | 23.5 | [gdrive](https://drive.google.com/file/d/1C_Vn3iiSnSW1Dw1r0DkjJMwvHC5Y3zTN/view) |
29
  | [r101_nuimg_1408x512](configs/r101_nuimg_1408x512.py) | [nuImg](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth) | 2d8h (8xV100) | 59.2 | - | 6.5 | [gdrive](https://drive.google.com/file/d/1dKu5cR1fuo-O0ynyBh-RCPtHrgut29mN/view) |
30
  | [vov99_dd3d_1600x640_trainval_future](configs/vov99_dd3d_1600x640_trainval_future.py) | [DD3D](https://drive.google.com/file/d/1gQkhWERCzAosBwG5bh2BKkt1k0TJZt-A/view) | 4d1h (8xA100) | 84.9 | 67.5 | - | [gdrive](https://drive.google.com/file/d/1TL0QoCiWD5uq8PCAWWE3A-g73ibK1R0S/view) |
31
+ | [vit_eva02_1600x640_trainval_future](configs/vit_eva02_1600x640_trainval_future.py) | [EVA02](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_seg_sys_o365.pth) | 11d (8xA100) | 85.3 | 70.2 | - | [gdrive](https://drive.google.com/file/d/1cx7h6PUqiaVWPixpcuB9AhsX3Sx4n0q_/view) |
32
 
33
  * We use `r50_nuimg_704x256` for ablation studies and `r50_nuimg_704x256_400q_36ep` for comparison with others.
34
  * We recommend using `r50_nuimg_704x256` to validate new ideas since it trains faster and the result is more stable.
configs/r50_nuimg_704x256.py CHANGED
@@ -54,7 +54,7 @@ model = dict(
54
  img_color_aug=True, # Move some augmentations to GPU
55
  img_norm_cfg=img_norm_cfg,
56
  img_pad_cfg=dict(size_divisor=32)),
57
- stop_prev_grad=False,
58
  img_backbone=img_backbone,
59
  img_neck=img_neck,
60
  pts_bbox_head=dict(
 
54
  img_color_aug=True, # Move some augmentations to GPU
55
  img_norm_cfg=img_norm_cfg,
56
  img_pad_cfg=dict(size_divisor=32)),
57
+ stop_prev_grad=0,
58
  img_backbone=img_backbone,
59
  img_neck=img_neck,
60
  pts_bbox_head=dict(
configs/vit_eva02_1600x640_trainval_future.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
_base_ = ['./r50_nuimg_704x256.py']

# Standard 10-class nuScenes detection setup.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# If the point cloud range changes, the model's range must change with it.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
voxel_size = [0.2, 0.2, 8]

# EVA-02 ViT-L backbone with a ViTDet-style simple feature pyramid.
img_backbone = dict(
    _delete_=True,  # discard the backbone inherited from the base config
    type='EVA02',
    img_size=1536,
    real_img_size=(640, 1600),
    patch_size=16,
    in_chans=3,
    embed_dim=1024,
    depth=24,
    num_heads=16,
    mlp_ratio=4 * 2 / 3,
    qkv_bias=True,
    drop_path_rate=0.3,
    use_abs_pos=True,
    window_size=16,
    # Windowed attention in every block except every third one
    # (indexes 2, 5, 8, ...), which attends globally.
    window_block_indexes=[i for i in range(24) if i % 3 != 2],
    residual_block_indexes=(),
    use_act_checkpoint=True,
    # args for simple FPN
    fpn_out_channels=256,
    fpn_scale_factors=(4.0, 2.0, 1.0, 0.5),
    fpn_top_block=True,
    fpn_norm="LN",
    fpn_square_pad=1600,
    pretrained='pretrain/eva02_L_coco_seg_sys_o365.pth',
    frozen_blocks=3,
)
img_norm_cfg = dict(
    mean=[123.675, 116.280, 103.530],
    std=[58.395, 57.120, 57.375],
    to_rgb=True
)

model = dict(
    img_backbone=img_backbone,
    img_neck=None,  # the backbone already emits a multi-scale pyramid
    stop_prev_grad=4,
    pts_bbox_head=dict(
        num_query=1600,
        transformer=dict(
            num_levels=5,
            num_points=8,
            num_frames=15))
)

ida_aug_conf = {
    'resize_lim': (0.94, 1.25),
    'final_dim': (640, 1600),
    'bot_pct_lim': (0.0, 0.0),
    'rot_lim': (0.0, 0.0),
    'H': 900, 'W': 1600,
    'rand_flip': True,
}

train_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
    dict(type='LoadMultiViewImageFromMultiSweepsFutureInterleave', prev_sweeps_num=7, next_sweeps_num=7),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False),
    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
    dict(type='ObjectNameFilter', classes=class_names),
    dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=True),
    dict(type='GlobalRotScaleTransImage', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05]),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], meta_keys=(
        'filename', 'ori_shape', 'img_shape', 'pad_shape', 'lidar2img', 'img_timestamp'))
]

test_pipeline = [
    dict(type='LoadMultiViewImageFromFiles', to_float32=False, color_type='color'),
    dict(type='LoadMultiViewImageFromMultiSweepsFutureInterleave', prev_sweeps_num=7, next_sweeps_num=7, test_mode=True),
    dict(type='RandomTransformImage', ida_aug_conf=ida_aug_conf, training=False),
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(type='DefaultFormatBundle3D', class_names=class_names, with_label=False),
            dict(type='Collect3D', keys=['img'], meta_keys=(
                'filename', 'box_type_3d', 'ori_shape', 'img_shape', 'pad_shape',
                'lidar2img', 'img_timestamp'))
        ])
]

data = dict(
    # "trainval" model: trained on both the train and val annotation files.
    train=dict(
        ann_file=['data/nuscenes/nuscenes_infos_train_sweep.pkl',
                  'data/nuscenes/nuscenes_infos_val_sweep.pkl'],
        pipeline=train_pipeline),
    val=dict(
        ann_file='data/nuscenes/nuscenes_infos_val_sweep.pkl',  # use nuscenes_infos_test_sweep.pkl for submission
        pipeline=test_pipeline),
    test=dict(pipeline=test_pipeline)
)

load_from = None
revise_keys = None
loaders/pipelines/loading.py CHANGED
@@ -255,3 +255,138 @@ class LoadMultiViewImageFromMultiSweepsFuture(object):
255
  ))
256
 
257
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  ))
256
 
257
  return results
258
+
259
+
260
@PIPELINES.register_module()
class LoadMultiViewImageFromMultiSweepsFutureInterleave(object):
    """Load previous and future camera sweeps in interleaved temporal order.

    The 6-camera groups are appended to ``results`` as:
    curr, prev1, next1, prev2, next2, prev3, next3, ...

    Args:
        prev_sweeps_num (int): number of past sweeps to load.
        next_sweeps_num (int): number of future sweeps to load
            (must equal ``prev_sweeps_num`` for interleaving).
        color_type (str): color flag passed to ``mmcv.imread``.
        test_mode (bool): if True, use the fixed test-time sampling interval.
    """

    def __init__(self,
                 prev_sweeps_num=5,
                 next_sweeps_num=5,
                 color_type='color',
                 test_mode=False):
        self.prev_sweeps_num = prev_sweeps_num
        self.next_sweeps_num = next_sweeps_num
        self.color_type = color_type
        self.test_mode = test_mode

        # Interleaving pairs one past group with one future group per step.
        assert prev_sweeps_num == next_sweeps_num

        # Sweep sampling stride: randomized per sample during training,
        # fixed during testing for determinism.
        self.train_interval = [4, 8]
        self.test_interval = 6

        try:
            mmcv.use_backend('turbojpeg')
        except ImportError:
            mmcv.use_backend('cv2')

    def _load_sweeps(self, results, sweeps, sweeps_num, interval, cam_types):
        """Collect images/timestamps/filenames/lidar2img for ``sweeps_num``
        sweeps taken from one temporal direction.

        If no sweeps were recorded, the current frame is repeated as padding.
        """
        collected = dict(img=[], img_timestamp=[], filename=[], lidar2img=[])

        if len(sweeps) == 0:
            # Pad with copies of the current frame.
            for _ in range(sweeps_num):
                for j in range(len(cam_types)):
                    collected['img'].append(results['img'][j])
                    collected['img_timestamp'].append(results['img_timestamp'][j])
                    collected['filename'].append(results['filename'][j])
                    collected['lidar2img'].append(np.copy(results['lidar2img'][j]))
            return collected

        # Sample sweeps at the given stride; clamp to the last available one.
        choices = [(k + 1) * interval - 1 for k in range(sweeps_num)]
        for idx in sorted(choices):
            sweep_idx = min(idx, len(sweeps) - 1)
            sweep = sweeps[sweep_idx]

            # Fall back to the neighboring sweep if this one misses cameras.
            if len(sweep.keys()) < len(cam_types):
                sweep = sweeps[sweep_idx - 1]

            for sensor in cam_types:
                collected['img'].append(mmcv.imread(sweep[sensor]['data_path'], self.color_type))
                collected['img_timestamp'].append(sweep[sensor]['timestamp'] / 1e6)
                collected['filename'].append(os.path.relpath(sweep[sensor]['data_path']))
                collected['lidar2img'].append(compose_lidar2img(
                    results['ego2global_translation'],
                    results['ego2global_rotation'],
                    results['lidar2ego_translation'],
                    results['lidar2ego_rotation'],
                    sweep[sensor]['sensor2global_translation'],
                    sweep[sensor]['sensor2global_rotation'],
                    sweep[sensor]['cam_intrinsic'],
                ))
        return collected

    def __call__(self, results):
        if self.prev_sweeps_num == 0 and self.next_sweeps_num == 0:
            return results

        cam_types = [
            'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT',
            'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'
        ]

        if self.test_mode:
            interval = self.test_interval
        else:
            interval = np.random.randint(self.train_interval[0], self.train_interval[1] + 1)

        results_prev = self._load_sweeps(
            results, results['sweeps']['prev'], self.prev_sweeps_num, interval, cam_types)
        results_next = self._load_sweeps(
            results, results['sweeps']['next'], self.next_sweeps_num, interval, cam_types)

        assert len(results_prev['img']) % 6 == 0
        assert len(results_next['img']) % 6 == 0

        # Interleave: one 6-camera group from the past, then one from the future.
        for i in range(len(results_prev['img']) // 6):
            for j in range(6):
                results['img'].append(results_prev['img'][i * 6 + j])
                results['img_timestamp'].append(results_prev['img_timestamp'][i * 6 + j])
                results['filename'].append(results_prev['filename'][i * 6 + j])
                results['lidar2img'].append(results_prev['lidar2img'][i * 6 + j])

            for j in range(6):
                results['img'].append(results_next['img'][i * 6 + j])
                results['img_timestamp'].append(results_next['img_timestamp'][i * 6 + j])
                results['filename'].append(results_next['filename'][i * 6 + j])
                results['lidar2img'].append(results_next['lidar2img'][i * 6 + j])

        return results
models/backbones/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
  from .vovnet import VoVNet
 
2
 
3
- __all__ = ['VoVNet']
 
1
  from .vovnet import VoVNet
2
+ from .eva02 import EVA02
3
 
4
+ __all__ = ['VoVNet', 'EVA02']
models/backbones/eva02/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .main import EVA02
models/backbones/eva02/backbone.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ from dataclasses import dataclass
5
+ from typing import Optional
6
+
7
+
8
@dataclass
class ShapeSpec:
    """
    A simple structure that contains basic shape specification about a tensor.
    It is often used as the auxiliary inputs/outputs of models,
    to complement the lack of shape inference ability among pytorch modules.
    """

    # All fields are optional; a producer fills in only what it knows.
    channels: Optional[int] = None  # number of channels (C)
    height: Optional[int] = None    # spatial height (H)
    width: Optional[int] = None     # spatial width (W)
    stride: Optional[int] = None    # feature stride relative to the network input (see Backbone.output_shape)
20
+
21
+
22
+ # Copyright (c) Facebook, Inc. and its affiliates.
23
+ from abc import ABCMeta, abstractmethod
24
+ from typing import Dict
25
+ import torch.nn as nn
26
+
27
+
28
+ __all__ = ["Backbone"]
29
+
30
+
31
class Backbone(nn.Module, metaclass=ABCMeta):
    """Abstract base class for network backbones."""

    def __init__(self):
        """A subclass's `__init__` may declare whatever arguments it needs."""
        super().__init__()

    @abstractmethod
    def forward(self):
        """
        Subclasses must override this method, but adhere to the same return type.

        Returns:
            dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor
        """
        pass

    @property
    def size_divisibility(self) -> int:
        """
        Integer by which input height/width must be divisible, e.g. for
        encoder/decoder networks with lateral connections (FPN) whose
        "bottom up" and "top down" feature maps must match. 0 means no
        divisibility requirement.
        """
        return 0

    @property
    def padding_constraints(self) -> Dict[str, int]:
        """
        Generalization of `size_divisibility`: optional padding requirements
        such as {"size_divisibility": int, "square_size": int}. If present,
        `size_divisibility` is read from here, and `square_size` > 0 requests
        square padding of that size (e.g. ViTDet with large-scale jitter).

        TODO: use type of Dict[str, int] to avoid torchscipt issues. The type
        could be generalized as TypedDict (Python 3.8+) to support more types.
        """
        return {}

    def output_shape(self):
        """
        Returns:
            dict[str->ShapeSpec]: backward-compatible default built from the
            `_out_feature_channels` / `_out_feature_strides` attributes.
        """
        return {
            feat: ShapeSpec(
                channels=self._out_feature_channels[feat],
                stride=self._out_feature_strides[feat],
            )
            for feat in self._out_features
        }
models/backbones/eva02/batch_norm.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import torch
3
+ import torch.distributed as dist
4
+ from fvcore.nn.distributed import differentiable_all_reduce
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from .wrappers import BatchNorm2d
9
+
10
+
11
class FrozenBatchNorm2d(nn.Module):
    """
    BatchNorm2d where the batch statistics and the affine parameters are fixed.

    It contains non-trainable buffers called
    "weight" and "bias", "running_mean", "running_var",
    initialized to perform identity transformation.

    The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
    which are computed from the original four parameters of BN.
    The affine transform `x * weight + bias` will perform the equivalent
    computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
    When loading a backbone model from Caffe2, "running_mean" and "running_var"
    will be left unchanged as identity transformation.

    Other pre-trained backbone models may contain all 4 parameters.

    The forward is implemented by `F.batch_norm(..., training=False)`.
    """

    # Bumped when the buffer layout changes; checked in _load_from_state_dict.
    _version = 3

    def __init__(self, num_features, eps=1e-5):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        # Buffers (not parameters): never updated by the optimizer.
        self.register_buffer("weight", torch.ones(num_features))
        self.register_buffer("bias", torch.zeros(num_features))
        self.register_buffer("running_mean", torch.zeros(num_features))
        # Initialized to 1 - eps so the initial transform is exactly identity.
        self.register_buffer("running_var", torch.ones(num_features) - eps)

    def forward(self, x):
        if x.requires_grad:
            # When gradients are needed, F.batch_norm will use extra memory
            # because its backward op computes gradients for weight/bias as well.
            scale = self.weight * (self.running_var + self.eps).rsqrt()
            bias = self.bias - self.running_mean * scale
            scale = scale.reshape(1, -1, 1, 1)
            bias = bias.reshape(1, -1, 1, 1)
            out_dtype = x.dtype  # may be half
            return x * scale.to(out_dtype) + bias.to(out_dtype)
        else:
            # When gradients are not needed, F.batch_norm is a single fused op
            # and provide more optimization opportunities.
            return F.batch_norm(
                x,
                self.running_mean,
                self.running_var,
                self.weight,
                self.bias,
                training=False,
                eps=self.eps,
            )

    def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
    ):
        # Backfill running stats missing from old (pre-version-2) checkpoints.
        version = local_metadata.get("version", None)

        if version is None or version < 2:
            # No running_mean/var in early versions
            # This will silent the warnings
            if prefix + "running_mean" not in state_dict:
                state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
            if prefix + "running_var" not in state_dict:
                state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)

        super()._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
        )

    def __repr__(self):
        return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)

    @classmethod
    def convert_frozen_batchnorm(cls, module):
        """
        Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.

        Args:
            module (torch.nn.Module):

        Returns:
            If module is BatchNorm/SyncBatchNorm, returns a new module.
            Otherwise, in-place convert module and return it.

        Similar to convert_sync_batchnorm in
        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
        """
        bn_module = nn.modules.batchnorm
        bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
        res = module
        if isinstance(module, bn_module):
            res = cls(module.num_features)
            if module.affine:
                res.weight.data = module.weight.data.clone().detach()
                res.bias.data = module.bias.data.clone().detach()
            res.running_mean.data = module.running_mean.data
            res.running_var.data = module.running_var.data
            res.eps = module.eps
        else:
            # Recurse into children, replacing any converted submodule in place.
            for name, child in module.named_children():
                new_child = cls.convert_frozen_batchnorm(child)
                if new_child is not child:
                    res.add_module(name, new_child)
        return res
117
+
118
+
119
def get_norm(norm, out_channels):
    """
    Build a normalization layer by name.

    Args:
        norm (str or callable): either one of "BN", "SyncBN", "FrozenBN",
            "GN", "nnSyncBN", "LN", or a callable that takes a channel count
            and returns the normalization layer as an nn.Module. None or ""
            disables normalization.
        out_channels (int): number of channels the layer normalizes.

    Returns:
        nn.Module or None: the normalization layer
    """
    if norm is None:
        return None
    if isinstance(norm, str):
        if not norm:
            return None
        factories = {
            "BN": BatchNorm2d,
            # Fixed in https://github.com/pytorch/pytorch/pull/36382
            "SyncBN": nn.SyncBatchNorm,
            "FrozenBN": FrozenBatchNorm2d,
            "GN": lambda channels: nn.GroupNorm(32, channels),
            # for debugging:
            "nnSyncBN": nn.SyncBatchNorm,
            "LN": lambda channels: LayerNorm(channels),
        }
        norm = factories[norm]
    return norm(out_channels)
145
+
146
+
147
class CycleBatchNormList(nn.ModuleList):
    """
    Domain-specific BatchNorm implemented by cycling through N BN layers.

    When one BatchNorm position serves multiple input domains/features, it may
    need separate test-time statistics per domain (Sec 5.2 in
    :paper:`rethinking-batchnorm`). This module keeps N non-affine BN layers
    and advances to the next one on every forward() call, applying a single
    shared affine transform afterwards.

    NOTE: The caller of this module MUST guarantee to always call
    this module by multiple of N times. Otherwise its test-time statistics
    will be incorrect.
    """

    def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs):
        """
        Args:
            length: number of BatchNorm layers to cycle.
            bn_class: the BatchNorm class to use
            kwargs: arguments of the BatchNorm class, such as num_features.
        """
        # Affine is handled here (shared), so the member BNs are non-affine.
        self._affine = kwargs.pop("affine", True)
        super().__init__([bn_class(**kwargs, affine=False) for _ in range(length)])
        if self._affine:
            # shared affine, domain-specific BN
            num_channels = self[0].num_features
            self.weight = nn.Parameter(torch.ones(num_channels))
            self.bias = nn.Parameter(torch.zeros(num_channels))
        self._pos = 0

    def forward(self, x):
        out = self[self._pos](x)
        self._pos = (self._pos + 1) % len(self)

        if not self._affine:
            return out
        w = self.weight.reshape(1, -1, 1, 1)
        b = self.bias.reshape(1, -1, 1, 1)
        return out * w + b

    def extra_repr(self):
        return f"affine={self._affine}"
192
+
193
+
194
class LayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
    variance normalization over the channel dimension for inputs that have shape
    (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        # Normalize each spatial position across the channel dim (dim 1).
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normed = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * normed + self.bias[:, None, None]
models/backbones/eva02/blocks.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import fvcore.nn.weight_init as weight_init
5
+ from torch import nn
6
+
7
+ from .batch_norm import FrozenBatchNorm2d, get_norm
8
+ from .wrappers import Conv2d
9
+
10
+
11
+ """
12
+ CNN building blocks.
13
+ """
14
+
15
+
16
class CNNBlockBase(nn.Module):
    """
    A CNN block is assumed to have input channels, output channels and a stride.
    The input and output of `forward()` method must be NCHW tensors.
    The method can perform arbitrary computation but must match the given
    channels and stride specification.

    Attribute:
        in_channels (int):
        out_channels (int):
        stride (int):
    """

    def __init__(self, in_channels, out_channels, stride):
        """
        The `__init__` method of any subclass should also contain these arguments.

        Args:
            in_channels (int):
            out_channels (int):
            stride (int):
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride

    def freeze(self):
        """
        Make this block not trainable: set every parameter to
        `requires_grad=False` and convert all BatchNorm layers to
        FrozenBatchNorm.

        Returns:
            the block itself
        """
        for param in self.parameters():
            param.requires_grad = False
        FrozenBatchNorm2d.convert_frozen_batchnorm(self)
        return self
56
+
57
+
58
class DepthwiseSeparableConv2d(nn.Module):
    """
    A kxk depthwise convolution + a 1x1 convolution.

    In :paper:`xception`, norm & activation are applied on the second conv.
    :paper:`mobilenet` uses norm & activation on both convs.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        padding=1,
        dilation=1,
        *,
        norm1=None,
        activation1=None,
        norm2=None,
        activation2=None,
    ):
        """
        Args:
            norm1, norm2 (str or callable): normalization for the two conv layers.
            activation1, activation2 (callable(Tensor) -> Tensor): activation
                function for the two conv layers.
        """
        super().__init__()
        # kxk conv applied per channel (groups == in_channels).
        self.depthwise = Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=not norm1,  # bias is redundant when followed by a norm
            norm=get_norm(norm1, in_channels),
            activation=activation1,
        )
        # 1x1 conv mixing channels.
        self.pointwise = Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            bias=not norm2,
            norm=get_norm(norm2, out_channels),
            activation=activation2,
        )

        # default initialization
        for conv in (self.depthwise, self.pointwise):
            weight_init.c2_msra_fill(conv)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))
models/backbones/eva02/drop.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
2
+ import torch.nn as nn
3
+
4
+
5
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    """
    # Identity at eval time or when dropping is disabled.
    if not training or drop_prob == 0.:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        # Rescale so the expected activation magnitude is unchanged.
        mask.div_(keep_prob)
    return x * mask
23
+
24
+
25
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        super().__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        # Delegate to the functional form; active only in training mode.
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        return f'drop_prob={round(self.drop_prob,3):0.3f}'
models/backbones/eva02/fpn.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import fvcore.nn.weight_init as weight_init
3
+ import torch.nn.functional as F
4
+ from torch import nn
5
+
6
+
7
+ def _assert_strides_are_log2_contiguous(strides):
8
+ """
9
+ Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2".
10
+ """
11
+ for i, stride in enumerate(strides[1:], 1):
12
+ assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(
13
+ stride, strides[i - 1]
14
+ )
15
+
16
+
17
class LastLevelMaxPool(nn.Module):
    """
    Produce the extra P6 feature of the original FPN by downsampling P5
    with a stride-2 max pool.
    """

    def __init__(self):
        super().__init__()
        self.num_levels = 1
        self.in_feature = "p5"

    def forward(self, x):
        pooled = F.max_pool2d(x, kernel_size=1, stride=2, padding=0)
        return [pooled]
30
+
31
+
32
class LastLevelP6P7(nn.Module):
    """
    Produce the extra P6 and P7 feature maps from the C5 feature, as used
    by RetinaNet.
    """

    def __init__(self, in_channels, out_channels, in_feature="res5"):
        super().__init__()
        self.num_levels = 2
        self.in_feature = in_feature
        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
        for conv in (self.p6, self.p7):
            weight_init.c2_xavier_fill(conv)

    def forward(self, c5):
        p6 = self.p6(c5)
        p7 = self.p7(F.relu(p6))
        return [p6, p7]
models/backbones/eva02/main.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import torch
3
+ import torch.nn as nn
4
+ from mmcv.runner.checkpoint import load_state_dict
5
+ from mmdet.models.builder import BACKBONES
6
+ from .vit import ViT, SimpleFeaturePyramid, partial
7
+ from .fpn import LastLevelMaxPool
8
+
9
+
10
@BACKBONES.register_module()
class EVA02(nn.Module):
    """EVA-02 ViT backbone wrapped with a ViTDet-style SimpleFeaturePyramid.

    All constructor arguments of `ViT` and `SimpleFeaturePyramid` are exposed
    here so the backbone is fully configurable from an mmdet config dict.
    `forward` returns the pyramid feature maps as a plain list.
    """

    def __init__(
        self,
        # args for ViT
        img_size=1024,
        real_img_size=(256, 704),
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        use_abs_pos=True,
        pt_hw_seq_len=16,
        intp_freq=True,
        window_size=0,
        window_block_indexes=(),
        residual_block_indexes=(),
        use_act_checkpoint=False,
        pretrain_img_size=224,
        pretrain_use_cls_token=True,
        out_feature="last_feat",
        xattn=False,
        frozen_blocks=-1,
        # args for simple FPN
        fpn_in_feature="last_feat",
        fpn_out_channels=256,
        fpn_scale_factors=(4.0, 2.0, 1.0, 0.5),
        fpn_top_block=False,
        fpn_norm="LN",
        fpn_square_pad=0,
        pretrained=None
    ):
        super().__init__()

        # The ViT trunk produces a single feature map; SimpleFeaturePyramid
        # rescales it into a multi-scale pyramid.
        self.backbone = SimpleFeaturePyramid(
            ViT(
                img_size=img_size,
                real_img_size=real_img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
                depth=depth,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop_path_rate=drop_path_rate,
                norm_layer=norm_layer,
                use_abs_pos=use_abs_pos,
                pt_hw_seq_len=pt_hw_seq_len,
                intp_freq=intp_freq,
                window_size=window_size,
                window_block_indexes=window_block_indexes,
                residual_block_indexes=residual_block_indexes,
                use_act_checkpoint=use_act_checkpoint,
                pretrain_img_size=pretrain_img_size,
                pretrain_use_cls_token=pretrain_use_cls_token,
                out_feature=out_feature,
                xattn=xattn,
                frozen_blocks=frozen_blocks,
            ),
            in_feature=fpn_in_feature,
            out_channels=fpn_out_channels,
            scale_factors=fpn_scale_factors,
            top_block=LastLevelMaxPool() if fpn_top_block else None,
            norm=fpn_norm,
            square_pad=fpn_square_pad,
        )
        self.init_weights(pretrained)

    def init_weights(self, pretrained=None):
        """Optionally load a checkpoint whose weights live under the 'model' key.

        Loading is non-strict, so keys that only exist on one side (e.g. the
        FPN neck) are tolerated.

        Args:
            pretrained (str or None): checkpoint path; no-op when None.
        """
        if pretrained is None:
            return
        # Lazy %-style args instead of eager interpolation (logging best practice).
        logging.info('Loading pretrained weights from %s', pretrained)
        # Fix: map_location='cpu' makes GPU-saved checkpoints loadable on
        # CPU-only hosts; parameters are moved to the target device later.
        state_dict = torch.load(pretrained, map_location='cpu')['model']
        load_state_dict(self, state_dict, strict=False)

    def forward(self, x):
        """Run the backbone and return the pyramid feature maps as a list."""
        outs = self.backbone(x)
        return list(outs.values())
models/backbones/eva02/utils.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import math
3
+ import numpy as np
4
+ from scipy import interpolate
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ __all__ = [
10
+ "window_partition",
11
+ "window_unpartition",
12
+ "add_decomposed_rel_pos",
13
+ "get_abs_pos",
14
+ "PatchEmbed",
15
+ "VisionRotaryEmbeddingFast",
16
+ ]
17
+
18
+
19
def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    # Zero-pad bottom/right so both spatial dims divide evenly by window_size.
    pad_h = -H % window_size
    pad_w = -W % window_size
    if pad_h or pad_w:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp = H + pad_h
    Wp = W + pad_w

    num_h = Hp // window_size
    num_w = Wp // window_size
    # Split each spatial dim into (num_windows, window_size), then gather the
    # two window axes next to each other before flattening into the batch dim.
    x = x.view(B, num_h, window_size, num_w, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5)
        .contiguous()
        .view(-1, window_size, window_size, C)
    )
    return windows, (Hp, Wp)
41
+
42
+
43
def window_unpartition(windows, window_size, pad_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    H, W = hw
    num_h = Hp // window_size
    num_w = Wp // window_size
    # Recover the batch size from how many windows each padded image yields.
    B = windows.shape[0] // (num_h * num_w)

    x = windows.view(B, num_h, num_w, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)

    # Strip the zero padding that window_partition added on the bottom/right.
    if (Hp, Wp) != (H, W):
        x = x[:, :H, :W, :].contiguous()
    return x
64
+
65
+
66
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    # A (q, k) pair spans 2*max(q, k) - 1 distinct relative offsets.
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # NOTE(review): hard-coded True, so the linear-interpolation branch below
    # is dead code kept only for reference.
    use_log_interpolation = True

    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        if not use_log_interpolation:
            # Interpolate rel pos.
            rel_pos_resized = F.interpolate(
                rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
                size=max_rel_dist,
                mode="linear",
            )
            rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
        else:
            src_size = rel_pos.shape[0]
            dst_size = max_rel_dist

            # Geometric ratio for the log-spaced source sample grid
            # (BEiT-style relative-position interpolation).
            # q = 1.13492
            q = 1.0903078
            dis = []

            # Source offsets grow geometrically away from 0.
            cur = 1
            for i in range(src_size // 2):
                dis.append(cur)
                cur += q ** (i + 1)

            # Mirror to negative offsets so the grid is symmetric around 0.
            r_ids = [-_ for _ in reversed(dis)]
            x = r_ids + [0] + dis
            # Target offsets are uniformly spaced over the destination range.
            t = dst_size // 2.0
            dx = np.arange(-t, t + 0.1, 1.0)
            # print("x = %s" % str(x))
            # print("dx = %s" % str(dx))
            all_rel_pos_bias = []
            # Cubic interpolation per channel; runs on CPU via numpy and is
            # moved back to rel_pos's device afterwards.
            for i in range(rel_pos.shape[1]):
                z = rel_pos[:, i].view(src_size).cpu().float().numpy()
                f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate")
                all_rel_pos_bias.append(
                    torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device))
            rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    # Gather one embedding row per (q, k) relative offset -> (q_size, k_size, C).
    return rel_pos_resized[relative_coords.long()]
126
+
127
+
128
def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        attn (Tensor): attention map.
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    # Per-axis relative embeddings: (q_h, k_h, C) and (q_w, k_w, C).
    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
    Rw = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    r_q = q.reshape(B, q_h, q_w, dim)
    # Contract the channel dim against each axis's embeddings: the relative
    # bias decomposes into a height term plus a width term.
    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)

    # Broadcast-add the two axis terms over the full (k_h, k_w) grid, then
    # flatten back to the (q_h*q_w, k_h*k_w) attention layout.
    attn = (
        attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
    ).view(B, q_h * q_w, k_h * k_w)

    return attn
158
+
159
+
160
def get_abs_pos(abs_pos, has_cls_token, hw):
    """
    Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
    dimension for the original embeddings.
    Args:
        abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
        hw (Tuple): size of input image tokens.

    Returns:
        Absolute positional embeddings after processing with shape (1, H, W, C)
    """
    h, w = hw
    if has_cls_token:
        # Drop the leading class-token embedding; only patch tokens remain.
        abs_pos = abs_pos[:, 1:]
    xy_num = abs_pos.shape[1]
    # Pretrained embeddings must form a square token grid.
    size = int(math.sqrt(xy_num))
    assert size * size == xy_num

    if size == h and size == w:
        # Already the right resolution: just reshape to the spatial layout.
        return abs_pos.reshape(1, h, w, -1)

    # Resize the square grid to (h, w) with bicubic interpolation,
    # channels-first for F.interpolate, channels-last on the way out.
    grid = abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2)
    resized = F.interpolate(grid, size=(h, w), mode="bicubic", align_corners=False)
    return resized.permute(0, 2, 3, 1)
190
+
191
+
192
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding.
    """

    def __init__(
        self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
    ):
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()

        # A strided convolution implements the patch split + linear projection
        # in a single op.
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x):
        # Project, then move channels last: (B, C, H, W) -> (B, H, W, C).
        return self.proj(x).permute(0, 2, 3, 1)
219
+
220
+
221
+
222
+
223
+ from math import pi
224
+
225
+ import torch
226
+ from torch import nn
227
+
228
+ from einops import rearrange, repeat
229
+
230
+
231
def broadcat(tensors, dim = -1):
    """Broadcast-concatenate: expand every tensor along all non-`dim` axes to
    the common broadcast size, then concatenate along `dim`."""
    count = len(tensors)
    ndims = set(map(lambda t: len(t.shape), tensors))
    assert len(ndims) == 1, 'tensors must all have the same number of dimensions'
    ndim = list(ndims)[0]
    if dim < 0:
        dim = dim + ndim

    # Per-axis sizes across tensors: axis i -> (size in t0, size in t1, ...).
    per_axis = list(zip(*map(lambda t: list(t.shape), tensors)))
    target = []
    for axis, sizes in enumerate(per_axis):
        if axis == dim:
            # The concat axis keeps each tensor's own size.
            target.append(sizes)
            continue
        # A broadcastable axis mixes at most two distinct sizes.
        assert len(set(sizes)) <= 2, 'invalid dimensions for broadcastable concatentation'
        target.append((max(sizes),) * count)

    # Transpose back to one full shape per tensor and expand in place.
    expanded = [t.expand(*shape) for t, shape in zip(tensors, zip(*target))]
    return torch.cat(expanded, dim = dim)
246
+
247
+
248
def rotate_half(x):
    """Rotate each adjacent pair (x1, x2) of the last dim into (-x2, x1)."""
    # View the last dim as pairs, swap-and-negate, then flatten back.
    pairs = x.reshape(*x.shape[:-1], -1, 2)
    x1 = pairs[..., 0]
    x2 = pairs[..., 1]
    rotated = torch.stack((-x2, x1), dim = -1)
    return rotated.flatten(start_dim=-2)
253
+
254
+
255
class VisionRotaryEmbedding(nn.Module):
    """2D axial rotary position embedding (RoPE) for vision.

    Builds cos/sin tables for an ft_seq_len x ft_seq_len token grid (height
    and width frequencies concatenated per cell) and rotates an explicit
    slice of the feature dimension in `forward`.
    """

    def __init__(
        self,
        dim,
        pt_seq_len,
        ft_seq_len=None,
        custom_freqs = None,
        freqs_for = 'lang',
        theta = 10000,
        max_freq = 10,
        num_freqs = 1,
    ):
        """
        Args:
            dim (int): number of rotary frequencies per axis (pre-duplication).
            pt_seq_len (int): per-axis sequence length used at pre-training.
            ft_seq_len (int or None): fine-tune per-axis length; defaults to pt_seq_len.
            custom_freqs (Tensor or None): user-provided frequency table.
            freqs_for (str): 'lang' | 'pixel' | 'constant' frequency schedule.
            theta (int): base of the 'lang' inverse-frequency schedule.
            max_freq (int): top frequency for the 'pixel' schedule.
            num_freqs (int): number of frequencies for the 'constant' schedule.
        """
        super().__init__()
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None: ft_seq_len = pt_seq_len
        # Positions rescaled so the fine-tune grid spans the pre-train range.
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        # Duplicate each frequency so consecutive feature pairs share it.
        freqs_h = torch.einsum('..., f -> ... f', t, freqs)
        freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)

        freqs_w = torch.einsum('..., f -> ... f', t, freqs)
        freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)

        # Axial 2D table: height and width frequencies concatenated per cell.
        freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1)

        # Buffers (not parameters): move with the module, never trained.
        self.register_buffer("freqs_cos", freqs.cos())
        self.register_buffer("freqs_sin", freqs.sin())

        print('======== shape of rope freq', self.freqs_cos.shape, '========')

    def forward(self, t, start_index = 0):
        # Rotate only features [start_index, start_index + rot_dim); everything
        # outside that slice passes through untouched.
        rot_dim = self.freqs_cos.shape[-1]
        end_index = start_index + rot_dim
        assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
        t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
        # Standard RoPE rotation: t*cos + rotate_half(t)*sin.
        t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
        return torch.cat((t_left, t, t_right), dim = -1)
302
+
303
+
304
class VisionRotaryEmbeddingFast(nn.Module):
    """Precomputed 2D rotary embedding applied to the whole feature dim.

    Unlike `VisionRotaryEmbedding`, the cos/sin tables are flattened over the
    token grid up front, so `forward` reduces to one fused multiply-add.
    """

    def __init__(
        self,
        dim,
        pt_seq_len=16,
        ft_seq_len=None,
        custom_freqs = None,
        freqs_for = 'lang',
        theta = 10000,
        max_freq = 10,
        num_freqs = 1,
        real_img_size = None
    ):
        """
        Args:
            dim (int): number of rotary frequencies per axis (pre-duplication).
            pt_seq_len (int): per-axis sequence length used at pre-training.
            ft_seq_len (int or None): fine-tune per-axis length; defaults to pt_seq_len.
            custom_freqs / freqs_for / theta / max_freq / num_freqs: frequency
                schedule selection, as in VisionRotaryEmbedding.
            real_img_size (tuple or None): actual (H, W) token-grid size. When
                given, the square ft_seq_len table is bicubically resized to it
                so non-square inputs get matching tables.
        """
        super().__init__()
        if custom_freqs:
            freqs = custom_freqs
        elif freqs_for == 'lang':
            freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
        elif freqs_for == 'pixel':
            freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
        elif freqs_for == 'constant':
            freqs = torch.ones(num_freqs).float()
        else:
            raise ValueError(f'unknown modality {freqs_for}')

        if ft_seq_len is None: ft_seq_len = pt_seq_len
        # Positions rescaled so the fine-tune grid spans the pre-train range.
        t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len

        # Duplicate each frequency for feature pairs, then build the axial 2D
        # table (height/width frequencies concatenated per cell).
        freqs = torch.einsum('..., f -> ... f', t, freqs)
        freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
        freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)

        # Flatten the (H, W) grid into one token axis for fast broadcasting.
        freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
        freqs_sin = freqs.sin().view(-1, freqs.shape[-1])

        if real_img_size is not None:
            # Resize the square table to the actual (H, W) token grid.
            new_freqs_cos = F.interpolate(
                freqs_cos.reshape(1, ft_seq_len, ft_seq_len, -1).permute(0, 3, 1, 2),
                size=real_img_size,
                mode="bicubic",
                align_corners=False,
            ).permute(0, 2, 3, 1)

            new_freqs_sin = F.interpolate(
                freqs_sin.reshape(1, ft_seq_len, ft_seq_len, -1).permute(0, 3, 1, 2),
                size=real_img_size,
                mode="bicubic",
                align_corners=False,
            ).permute(0, 2, 3, 1)

            # Buffers (not parameters): move with the module, never trained.
            self.register_buffer("freqs_cos", new_freqs_cos.view(-1, freqs.shape[-1]))
            self.register_buffer("freqs_sin", new_freqs_sin.view(-1, freqs.shape[-1]))
        else:
            self.register_buffer("freqs_cos", freqs_cos)
            self.register_buffer("freqs_sin", freqs_sin)

    def forward(self, t):
        # The [:, None, :] adds a broadcast axis between tokens and features.
        # NOTE(review): assumes t's token axis matches the flattened grid size
        # of the tables -- confirm against the caller (Attention.forward).
        return t * self.freqs_cos[:, None, :] + rotate_half(t) * self.freqs_sin[:, None, :]
models/backbones/eva02/vit.py ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ from functools import partial
4
+
5
+ import fvcore.nn.weight_init as weight_init
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torch.utils.checkpoint as cp
10
+
11
+ from .wrappers import Conv2d
12
+ from .batch_norm import get_norm
13
+ from .blocks import CNNBlockBase
14
+ from .fpn import _assert_strides_are_log2_contiguous
15
+
16
+ from .backbone import Backbone
17
+ from .utils import (
18
+ PatchEmbed,
19
+ add_decomposed_rel_pos,
20
+ get_abs_pos,
21
+ window_partition,
22
+ window_unpartition,
23
+ VisionRotaryEmbeddingFast,
24
+ )
25
+
26
+ try:
27
+ from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
28
+ except ImportError:
29
+ flash_attn_func = None
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class SwiGLU(nn.Module):
    """SwiGLU feed-forward block: w3(norm(act(w1(x)) * w2(x))), with dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
                 norm_layer=nn.LayerNorm, subln=False
                 ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features

        # Two parallel projections: one is gated by the activation of the other.
        self.w1 = nn.Linear(in_features, hidden_features)
        self.w2 = nn.Linear(in_features, hidden_features)

        self.act = act_layer()
        # Sub-LayerNorm (EVA-02): normalize the hidden state before the output
        # projection; identity when subln is off.
        self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
        self.w3 = nn.Linear(hidden_features, out_features)

        self.drop = nn.Dropout(drop)

    def forward(self, x):
        gated = self.act(self.w1(x)) * self.w2(x)
        return self.drop(self.w3(self.ffn_ln(gated)))
59
+
60
+
61
class Attention(nn.Module):
    """Multi-head self-attention over a (B, H, W, C) feature map with optional
    rotary position embedding and optional flash-attention kernels."""

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        qk_scale=None,
        attn_head_dim=None,
        rope=None,
        xattn=True,
    ):
        """
        Args:
            dim (int): input channel dimension.
            num_heads (int): number of attention heads.
            qkv_bias (bool): add learnable bias to q and v. k stays bias-free,
                the EVA/BEiT convention.
            qk_scale (float or None): override for the attention scale.
                NOTE(review): self.scale is not applied in either attention
                path below (both use their own default scaling) -- confirm
                whether qk_scale is meant to have an effect.
            attn_head_dim (int or None): override per-head dimension.
            rope (nn.Module or None): rotary embedding applied to q and k.
            xattn (bool): use flash-attention instead of PyTorch SDPA.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # Separate projections (not a fused qkv) so k can skip its bias.
        self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
        self.v_proj = nn.Linear(dim, all_head_dim, bias=False)

        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        self.rope = rope
        self.xattn = xattn
        self.proj = nn.Linear(all_head_dim, dim)

    def forward(self, x):
        B, H, W, C = x.shape
        # Flatten the spatial grid into a token axis.
        x = x.view(B, -1, C)
        N = H * W

        q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
        k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
        v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)

        # (B, N, heads, head_dim) layout, as expected by rope and flash_attn.
        q = q.reshape(B, N, self.num_heads, -1)
        k = k.reshape(B, N, self.num_heads, -1)
        v = v.reshape(B, N, self.num_heads, -1)

        # Rotary position embedding on q/k. Fix: tolerate rope=None (the
        # documented default) instead of crashing on a NoneType call.
        if self.rope is not None:
            q = self.rope(q).type_as(v)
            k = self.rope(k).type_as(v)

        if self.xattn:
            # Fix: fail with a clear message when flash_attn is not installed
            # (the import at the top of the file silently sets it to None).
            assert flash_attn_func is not None, \
                "xattn=True requires the flash_attn package to be installed"
            x = flash_attn_func(q, k, v).reshape(B, N, -1)
        else:
            q = q.permute(0, 2, 1, 3)  # B, num_heads, N, C
            k = k.permute(0, 2, 1, 3)  # B, num_heads, N, C
            v = v.permute(0, 2, 1, 3)  # B, num_heads, N, C
            x = F.scaled_dot_product_attention(q, k, v)
            x = x.transpose(1, 2).reshape(B, N, -1)

        x = self.proj(x)
        # Restore the spatial layout.
        x = x.view(B, H, W, C)

        return x
125
+
126
+
127
class ResBottleneckBlock(CNNBlockBase):
    """
    The standard bottleneck residual block without the last activation layer.
    It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        bottleneck_channels,
        norm="LN",
        act_layer=nn.GELU,
    ):
        """
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bottleneck_channels (int): number of output channels for the 3x3
                "bottleneck" conv layers.
            norm (str or callable): normalization for all conv layers.
                See :func:`layers.get_norm` for supported format.
            act_layer (callable): activation for all conv layers.
        """
        super().__init__(in_channels, out_channels, 1)

        # 1x1 reduce -> 3x3 -> 1x1 expand; each conv followed by norm (+ act).
        self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = get_norm(norm, bottleneck_channels)
        self.act1 = act_layer()

        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            3,
            padding=1,
            bias=False,
        )
        self.norm2 = get_norm(norm, bottleneck_channels)
        self.act2 = act_layer()

        self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = get_norm(norm, out_channels)

        for layer in [self.conv1, self.conv2, self.conv3]:
            weight_init.c2_msra_fill(layer)
        for layer in [self.norm1, self.norm2]:
            layer.weight.data.fill_(1.0)
            layer.bias.data.zero_()
        # zero init last norm layer: the residual branch contributes nothing
        # at initialization, so the block starts as an identity mapping.
        self.norm3.weight.data.zero_()
        self.norm3.bias.data.zero_()

    def forward(self, x):
        out = x
        # Runs conv1->norm1->act1->conv2->norm2->act2->conv3->norm3 by relying
        # on attribute registration order in __init__; do not reorder fields.
        for layer in self.children():
            out = layer(out)

        # Residual connection; note there is no final activation by design.
        out = x + out
        return out
186
+
187
+
188
class Block(nn.Module):
    """Transformer blocks with support of window attention and residual propagation blocks"""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        window_size=0,
        use_residual_block=False,
        rope=None,
        xattn=True,
        use_act_checkpoint=True,
    ):
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            window_size (int): Window size for window attention blocks. If it equals 0, then not
                use window attention.
            use_residual_block (bool): If True, use a residual block after the MLP block.
            rope (nn.Module or None): rotary position embedding forwarded to
                the Attention layer.
            xattn (bool): use flash-attention inside the Attention layer.
            use_act_checkpoint (bool): If True, recompute activations in the
                backward pass to save memory (only active while training with
                gradients enabled; see `forward`).
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            rope=rope,
            xattn=xattn,
        )

        # Imported here (not at module top) -- keeps .drop out of the import
        # graph until a Block is actually constructed.
        from .drop import DropPath

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        # EVA-02 uses a SwiGLU MLP with sub-LayerNorm instead of a vanilla MLP.
        self.mlp = SwiGLU(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            subln=True,
            norm_layer=norm_layer,
        )

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = ResBottleneckBlock(
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
                norm="LN",
            )

        self.use_act_checkpoint = use_act_checkpoint

    def inner_forward(self, x):
        # Pre-norm transformer layout: x + Attn(LN(x)), then x + MLP(LN(x)).
        shortcut = x
        x = self.norm1(x)

        # Window partition
        if self.window_size > 0:
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        # Reverse window partition
        if self.window_size > 0:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        if self.use_residual_block:
            # The conv residual block is channels-first; permute in and out
            # of the (B, H, W, C) layout used by the transformer.
            x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)

        return x

    def forward(self, x):
        # Checkpointing only helps (and only works) when gradients are flowing,
        # hence the training/requires_grad guard.
        if self.training and x.requires_grad and self.use_act_checkpoint:
            return cp.checkpoint(self.inner_forward, x)
        else:
            return self.inner_forward(x)
285
+
286
+
287
class ViT(Backbone):
    """
    This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
    "Exploring Plain Vision Transformer Backbones for Object Detection",
    https://arxiv.org/abs/2203.16527
    """

    def __init__(
        self,
        img_size=1024,
        real_img_size=(256, 704),
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4*2/3,
        qkv_bias=True,
        drop_path_rate=0.0,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        use_abs_pos=True,
        pt_hw_seq_len=16,
        intp_freq=True,
        window_size=0,
        window_block_indexes=(),
        residual_block_indexes=(),
        use_act_checkpoint=False,
        pretrain_img_size=224,
        pretrain_use_cls_token=True,
        out_feature="last_feat",
        xattn=True,
        frozen_blocks=-1,
    ):
        """
        Args:
            img_size (int): Input image size (per axis) used to size the
                global-attention rotary tables.
            real_img_size (tuple): actual (H, W) input size; the global rope
                table is resized to this token grid.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            drop_path_rate (float): Stochastic depth rate.
            norm_layer (nn.Module): Normalization layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            pt_hw_seq_len (int): per-axis sequence length at pre-training,
                used by the rotary embeddings.
            intp_freq (bool): interpolate rotary frequencies to the fine-tune
                sequence length.
            window_size (int): Window size for window attention blocks.
            window_block_indexes (list): Indexes for blocks using window attention.
            residual_block_indexes (list): Indexes for blocks using conv propagation.
            use_act_checkpoint (bool): If True, use activation checkpointing.
            pretrain_img_size (int): input image size for pretraining models.
            pretrain_use_cls_token (bool): If True, pretraining models use class token.
            out_feature (str): name of the feature from the last block.
            xattn (bool): use flash-attention in the attention blocks.
            frozen_blocks (int): when >= 0, freeze the patch embedding, the
                absolute position embedding (if any) and the first
                `frozen_blocks` transformer blocks.
        """
        super().__init__()
        self.pretrain_use_cls_token = pretrain_use_cls_token
        self.frozen_blocks = frozen_blocks

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
            num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
            self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
        else:
            self.pos_embed = None

        # Rotary embeddings rotate half of each head's dimension.
        half_head_dim = embed_dim // num_heads // 2
        hw_seq_len = img_size // patch_size
        # Convert the pixel size to a token-grid size.
        real_img_size = (real_img_size[0] // patch_size, real_img_size[1] // patch_size)

        # Separate rope tables for windowed blocks (window-sized grid) and
        # global blocks (full, possibly non-square grid).
        self.rope_win = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=window_size if intp_freq else None,
        )
        self.rope_glb = VisionRotaryEmbeddingFast(
            dim=half_head_dim,
            pt_seq_len=pt_hw_seq_len,
            ft_seq_len=hw_seq_len if intp_freq else None,
            real_img_size=real_img_size
        )

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]

        self.blocks = nn.ModuleList()
        for i in range(depth):
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                window_size=window_size if i in window_block_indexes else 0,
                use_residual_block=i in residual_block_indexes,
                rope=self.rope_win if i in window_block_indexes else self.rope_glb,
                xattn=xattn,
                use_act_checkpoint=use_act_checkpoint
            )
            self.blocks.append(block)

        self._out_feature_channels = {out_feature: embed_dim}
        self._out_feature_strides = {out_feature: patch_size}
        self._out_features = [out_feature]

        if self.pos_embed is not None:
            nn.init.trunc_normal_(self.pos_embed, std=0.02)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        # Truncated-normal linear weights, zero biases; unit LayerNorm.
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, x):
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # Resize the pretrain pos-embed grid to the current token grid.
            x = x + get_abs_pos(
                self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
            )

        for blk in self.blocks:
            x = blk(x)

        # Return channels-first, keyed by the configured feature name.
        outputs = {self._out_features[0]: x.permute(0, 3, 1, 2)}
        return outputs

    def _freeze_stages(self):
        def freeze_module(m):
            # eval() also freezes any normalization statistics.
            m.eval()
            for param in m.parameters():
                param.requires_grad = False

        if self.frozen_blocks >= 0:
            freeze_module(self.patch_embed)
            # Fix: pos_embed is None when use_abs_pos=False; guard before
            # touching requires_grad to avoid an AttributeError.
            if self.pos_embed is not None:
                self.pos_embed.requires_grad = False

            for i in range(0, self.frozen_blocks):
                freeze_module(self.blocks[i])

    def train(self, mode=True):
        # Re-apply freezing on every mode switch, since train() would
        # otherwise flip frozen submodules back to training mode.
        super().train(mode)
        self._freeze_stages()
446
+
447
+
448
class SimpleFeaturePyramid(Backbone):
    """
    This module implements SimpleFeaturePyramid in :paper:`vitdet`.
    It creates pyramid features built on top of the input feature map.
    """

    def __init__(
        self,
        net,
        in_feature,
        out_channels,
        scale_factors,
        top_block=None,
        norm="LN",
        square_pad=0,
    ):
        """
        Args:
            net (Backbone): module representing the subnetwork backbone.
                Must be a subclass of :class:`Backbone`.
            in_feature (str): names of the input feature maps coming
                from the net.
            out_channels (int): number of channels in the output feature maps.
            scale_factors (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features.
            top_block (nn.Module or None): if provided, an extra operation will
                be performed on the output of the last (smallest resolution)
                pyramid output, and the result will extend the result list. The top_block
                further downsamples the feature map. It must have an attribute
                "num_levels", meaning the number of extra pyramid levels added by
                this block, and "in_feature", which is a string representing
                its input feature (e.g., p5).
            norm (str): the normalization to use.
            square_pad (int): If > 0, require input images to be padded to specific square size.
        """
        super(SimpleFeaturePyramid, self).__init__()
        assert isinstance(net, Backbone)

        self.scale_factors = scale_factors

        # A scale > 1 upsamples the backbone feature, so the effective output
        # stride shrinks by that factor (e.g. stride 16 / scale 4 -> stride 4).
        input_shapes = net.output_shape()
        strides = [int(input_shapes[in_feature].stride / scale) for scale in scale_factors]
        _assert_strides_are_log2_contiguous(strides)

        dim = input_shapes[in_feature].channels
        self.stages = []
        # Conv bias is only useful when no norm layer follows it.
        use_bias = norm == ""
        for idx, scale in enumerate(scale_factors):
            out_dim = dim
            if scale == 4.0:
                # Two stride-2 deconvs: 4x upsampling, channels cut to dim // 4.
                layers = [
                    nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2),
                    get_norm(norm, dim // 2),
                    nn.GELU(),
                    nn.ConvTranspose2d(dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [nn.ConvTranspose2d(dim, dim // 2, kernel_size=2, stride=2)]
                out_dim = dim // 2
            elif scale == 1.0:
                # Identity resolution: only the 1x1/3x3 output convs below.
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                raise NotImplementedError(f"scale_factor={scale} is not supported yet.")

            # Every pyramid level ends with a 1x1 channel projection followed
            # by a 3x3 smoothing conv, both norm-wrapped (standard FPN output head).
            layers.extend(
                [
                    Conv2d(
                        out_dim,
                        out_channels,
                        kernel_size=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                    Conv2d(
                        out_channels,
                        out_channels,
                        kernel_size=3,
                        padding=1,
                        bias=use_bias,
                        norm=get_norm(norm, out_channels),
                    ),
                ]
            )
            layers = nn.Sequential(*layers)

            # NOTE: `stage` is intentionally read after this loop ends — the
            # top_block levels are numbered starting above the last stage.
            stage = int(math.log2(strides[idx]))
            self.add_module(f"simfp_{stage}", layers)
            self.stages.append(layers)

        self.net = net
        self.in_feature = in_feature
        self.top_block = top_block
        # Return feature names are "p<stage>", like ["p2", "p3", ..., "p6"]
        self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
        # top block output feature maps.
        if self.top_block is not None:
            for s in range(stage, stage + self.top_block.num_levels):
                self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1)

        self._out_features = list(self._out_feature_strides.keys())
        self._out_feature_channels = {k: out_channels for k in self._out_features}
        self._size_divisibility = strides[-1]
        self._square_pad = square_pad

    @property
    def padding_constraints(self):
        return {
            # NOTE(review): "size_divisiblity" (sic) matches the upstream
            # detectron2 key spelling; verify what consumers look up before
            # fixing the typo — renaming the key is a behavior change.
            "size_divisiblity": self._size_divisibility,
            "square_size": self._square_pad,
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]:
                mapping from feature map name to pyramid feature map tensor
                in high to low resolution order. Returned feature names follow the FPN
                convention: "p<stage>", where stage has stride = 2 ** stage e.g.,
                ["p2", "p3", ..., "p6"].
        """
        bottom_up_features = self.net(x)
        features = bottom_up_features[self.in_feature]
        results = []

        # Every pyramid stage consumes the same single backbone feature map.
        for stage in self.stages:
            results.append(stage(features))

        if self.top_block is not None:
            # The top block may read either a raw backbone output or one of
            # the pyramid outputs produced above.
            if self.top_block.in_feature in bottom_up_features:
                top_block_in_feature = bottom_up_features[self.top_block.in_feature]
            else:
                top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)]
            results.extend(self.top_block(top_block_in_feature))
        assert len(self._out_features) == len(results)
        return {f: res for f, res in zip(self._out_features, results)}
589
+
590
+
591
def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.

    Returns:
        lr decay rate for the given parameter.
    """
    # Default: treat the parameter as the deepest layer, i.e. no decay.
    depth = num_layers + 1
    if name.startswith("backbone"):
        if ".pos_embed" in name or ".patch_embed" in name:
            # Embeddings sit below block 0 and decay the most.
            depth = 0
        elif ".blocks." in name and ".residual." not in name:
            # Parameter names look like "...blocks.<idx>...."; extract <idx>.
            suffix = name[name.find(".blocks."):]
            block_index = int(suffix.split(".")[2])
            depth = block_index + 1
    return lr_decay_rate ** (num_layers + 1 - depth)
models/backbones/eva02/wrappers.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ """
3
+ Wrappers around on some nn functions, mainly to support empty tensors.
4
+
5
+ Ideally, add support directly in PyTorch to empty tensors in those functions.
6
+
7
+ These can be removed once https://github.com/pytorch/pytorch/issues/12013
8
+ is implemented
9
+ """
10
+
11
+ import warnings
12
+ from typing import List, Optional
13
+ import torch
14
+ from torch.nn import functional as F
15
+
16
+
17
def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
    """
    Convert a list of integer scalars (or scalar Tensors) into a 1-D tensor,
    in a way that is both traceable and scriptable.

    In tracing, `x` should be a list of scalar Tensors so the output stays
    connected to the inputs; in scripting or eager mode, `x` should be a list
    of plain ints.
    """
    if torch.jit.is_scripting():
        return torch.as_tensor(x, device=device)
    if torch.jit.is_tracing():
        assert all(
            [isinstance(t, torch.Tensor) for t in x]
        ), "Shape should be tensor during tracing!"
        # `as_tensor` would record the values as constants in the trace,
        # so stack the scalar tensors instead.
        stacked = torch.stack(x)
        # Move devices only when needed, to avoid baking a device into the trace.
        if stacked.device != device:
            stacked = stacked.to(device=device)
        return stacked
    return torch.as_tensor(x, device=device)
37
+
38
+
39
def cat(tensors: List[torch.Tensor], dim: int = 0):
    """
    Same as `torch.cat`, but skips the copy when the list holds a single tensor.
    """
    assert isinstance(tensors, (list, tuple))
    return tensors[0] if len(tensors) == 1 else torch.cat(tensors, dim)
47
+
48
+
49
def empty_input_loss_func_wrapper(loss_func):
    """Wrap a reduction-style loss so empty batches yield 0 instead of NaN."""

    def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
        """
        Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
        """
        if reduction == "mean" and target.numel() == 0:
            # Multiply by 0 rather than returning a constant, so the result
            # stays connected to the autograd graph.
            return input.sum() * 0.0
        return loss_func(input, target, reduction=reduction, **kwargs)

    return wrapped_loss_func
59
+
60
+
61
# Drop-in replacement for `F.cross_entropy` that returns a zero (but
# gradient-connected) loss on empty targets instead of NaN.
cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)
62
+
63
+
64
class _NewEmptyTensorOp(torch.autograd.Function):
    """Autograd op producing an uninitialized tensor of `new_shape` from `x`,
    while keeping the graph connected so backward yields a gradient with
    the original input's shape."""

    @staticmethod
    def forward(ctx, x, new_shape):
        # Remember the input shape so backward can emit a matching empty grad.
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        # Second return is None: `new_shape` is a non-tensor argument.
        return _NewEmptyTensorOp.apply(grad, shape), None
74
+
75
+
76
class Conv2d(torch.nn.Conv2d):
    """
    A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
    """

    def __init__(self, *args, **kwargs):
        """
        Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:

        Args:
            norm (nn.Module, optional): a normalization layer
            activation (callable(Tensor) -> Tensor): a callable activation function

        It assumes that norm layer is used before activation.
        """
        # Pop the extra kwargs first so torch.nn.Conv2d.__init__ never sees them.
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
        super().__init__(*args, **kwargs)

        self.norm = norm
        self.activation = activation

    def forward(self, x):
        # torchscript does not support SyncBatchNorm yet
        # https://github.com/pytorch/pytorch/issues/40507
        # and we skip these codes in torchscript since:
        # 1. currently we only support torchscript in evaluation mode
        # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
        # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
        if not torch.jit.is_scripting():
            # record=True suppresses warnings raised on the empty-input path.
            with warnings.catch_warnings(record=True):
                if x.numel() == 0 and self.training:
                    # https://github.com/pytorch/pytorch/issues/12013
                    assert not isinstance(
                        self.norm, torch.nn.SyncBatchNorm
                    ), "SyncBatchNorm does not support empty inputs!"

        x = F.conv2d(
            x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
        )
        # Norm before activation, per the class docstring's assumption.
        if self.norm is not None:
            x = self.norm(x)
        if self.activation is not None:
            x = self.activation(x)
        return x
121
+
122
+
123
# Plain re-exports of the torch equivalents.
# NOTE(review): presumably kept so call sites can import every layer type
# from this module uniformly — verify against callers before removing.
ConvTranspose2d = torch.nn.ConvTranspose2d
BatchNorm2d = torch.nn.BatchNorm2d
interpolate = F.interpolate
Linear = torch.nn.Linear
127
+
128
+
129
def nonzero_tuple(x):
    """
    A 'as_tuple=True' version of torch.nonzero to support torchscript.
    because of https://github.com/pytorch/pytorch/issues/38718
    """
    if not torch.jit.is_scripting():
        return x.nonzero(as_tuple=True)
    # Scripting path: emulate as_tuple=True by unbinding index columns.
    if x.dim() == 0:
        x = x.unsqueeze(0)
    return x.nonzero().unbind(1)
140
+
141
+
142
# Scripted-if-tracing so the device move is not frozen into a trace;
# the body must therefore stay TorchScript-compatible.
@torch.jit.script_if_tracing
def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
    """
    Tracing friendly way to cast tensor to another tensor's device. Device will be treated
    as constant during tracing, scripting the casting process as whole can workaround this issue.
    """
    return src.to(dst.device)
models/sparsebev.py CHANGED
@@ -14,7 +14,7 @@ from .utils import GridMask, pad_multiple, GpuPhotoMetricDistortion
14
  class SparseBEV(MVXTwoStageDetector):
15
  def __init__(self,
16
  data_aug=None,
17
- stop_prev_grad=False,
18
  pts_voxel_layer=None,
19
  pts_voxel_encoder=None,
20
  pts_middle_encoder=None,
@@ -99,12 +99,12 @@ class SparseBEV(MVXTwoStageDetector):
99
  for img_meta in img_metas:
100
  img_meta.update(input_shape=input_shape)
101
 
102
- if self.training and self.stop_prev_grad:
103
  H, W = input_shape
104
  img = img.reshape(B, -1, 6, C, H, W)
105
 
106
- img_grad = img[:, :1]
107
- img_nograd = img[:, 1:]
108
 
109
  all_img_feats = [self.extract_img_feat(img_grad.reshape(-1, C, H, W))]
110
 
 
14
  class SparseBEV(MVXTwoStageDetector):
15
  def __init__(self,
16
  data_aug=None,
17
+ stop_prev_grad=0,
18
  pts_voxel_layer=None,
19
  pts_voxel_encoder=None,
20
  pts_middle_encoder=None,
 
99
  for img_meta in img_metas:
100
  img_meta.update(input_shape=input_shape)
101
 
102
+ if self.training and self.stop_prev_grad > 0:
103
  H, W = input_shape
104
  img = img.reshape(B, -1, 6, C, H, W)
105
 
106
+ img_grad = img[:, :self.stop_prev_grad]
107
+ img_nograd = img[:, self.stop_prev_grad:]
108
 
109
  all_img_feats = [self.extract_img_feat(img_grad.reshape(-1, C, H, W))]
110
 
val.py CHANGED
@@ -112,6 +112,7 @@ def main():
112
  logging.info('Creating model: %s' % cfgs.model.type)
113
  model = build_model(cfgs.model)
114
  model.cuda()
 
115
 
116
  if world_size > 1:
117
  model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False)
 
112
  logging.info('Creating model: %s' % cfgs.model.type)
113
  model = build_model(cfgs.model)
114
  model.cuda()
115
+ model.fp16_enabled = True
116
 
117
  if world_size > 1:
118
  model = MMDistributedDataParallel(model, [local_rank], broadcast_buffers=False)