chore: vendor third_party (remove submodules, ignore artifacts)
- .gitignore +16 -0
- MaskClustering/third_party/Entity +0 -1
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml +43 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml +43 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml +42 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml +51 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml +50 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml +51 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py +172 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py +5 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py +298 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py +464 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py +723 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py +102 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py +112 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py +4 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py +52 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py +111 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py +54 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py +26 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py +489 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py +100 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py +52 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py +191 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py +523 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py +303 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py +71 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py +53 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py +59 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py +190 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py +123 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py +119 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh +8 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile +9 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py +1 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx +308 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py +453 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py +534 -0
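Note: this commit replaces the MaskClustering/third_party/Entity git submodule with a vendored copy of its tree. The exact steps are not recorded in the diff, but a typical conversion sequence (paths here mirror this repo; the checkout location is illustrative) is roughly:

    git submodule deinit -f MaskClustering/third_party/Entity
    git rm -f MaskClustering/third_party/Entity      # drops the gitlink and its .gitmodules entry
    rm -rf .git/modules/MaskClustering/third_party/Entity
    cp -r /path/to/Entity-checkout MaskClustering/third_party/Entity
    rm -rf MaskClustering/third_party/Entity/.git    # the copy must not look like a nested repo
    git add MaskClustering/third_party/Entity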
.gitignore CHANGED
@@ -154,3 +154,19 @@ temp/
 **/*.bin
 data/
 **/*.pth
+
+# macOS junk
+.DS_Store
+**/.DS_Store
+
+# Don't commit build artifacts / compiled binaries from third_party
+MaskClustering/third_party/**/__pycache__/
+MaskClustering/third_party/**/*.pyc
+MaskClustering/third_party/**/*.pyo
+MaskClustering/third_party/**/build/
+MaskClustering/third_party/**/dist/
+MaskClustering/third_party/**/*.o
+MaskClustering/third_party/**/*.so
+
+# HF Hub limit: keep large docs assets out of git
+MaskClustering/third_party/Entity/Entityv2/figures/teaser_mosaic_low.png
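The new rules can be sanity-checked with git check-ignore, which prints the file, line number, and pattern matching a given path (the .pyc path below is hypothetical):

    git check-ignore -v MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__pycache__/arch.cpython-38.pyc
    # -> .gitignore:163:MaskClustering/third_party/**/__pycache__/    MaskClustering/third_party/.../arch.cpython-38.pyc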
MaskClustering/third_party/Entity DELETED
@@ -1 +0,0 @@
-Subproject commit 6e7e13ac91ef508088e1b848167c01f19b00b512
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.0
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 10000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
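These YAMLs are detectron2-style configs and are not self-contained: keys such as CONDINST and MASK_HEAD only exist after add_entity_config() extends the default config, which is exactly what setup_cfg() in demo_result_and_vis.py (later in this diff) does. A minimal loading sketch following that same pattern:

    from detectron2.config import get_cfg
    from entityseg import add_entity_config  # exported by entityseg/__init__.py below

    cfg = get_cfg()
    add_entity_config(cfg)  # registers the extra keys (CONDINST, SWINT, ...) these files set
    cfg.merge_from_file("configs/entity_baseline.yaml")
    cfg.freeze()
    assert cfg.MODEL.META_ARCHITECTURE == "EntityFPN"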
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b0"
+  WEIGHTS: "pretrained_model/mit_b0_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b5"
+  WEIGHTS: "pretrained_model/mit_b5_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 80
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 8
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (120000, 160000)
+  MAX_ITER: 180000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml ADDED
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+    DEFORM_MODULATED: True
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml ADDED
@@ -0,0 +1,50 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 96
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 6, 2]
+    NUM_HEADS: [3, 6, 12, 24]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_tiny_patch4_window7_224_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py ADDED
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import multiprocessing as mp
+import os
+import time
+import cv2
+import tqdm
+import numpy as np
+import copy
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+from detectron2.engine import default_setup
+
+from entityseg import *
+
+from predictor import VisualizationDemo
+import pdb
+
+# constants
+WINDOW_NAME = "Image Segmentation"
+
+def make_colors():
+    from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+    colors = []
+    for cate in COCO_CATEGORIES:
+        colors.append(cate["color"])
+    return colors
+
+def mask_to_boundary(mask, dilation_ratio=0.0008):
+    """
+    Convert binary mask to boundary mask.
+    :param mask (numpy array, uint8): binary mask
+    :param dilation_ratio (float): ratio to calculate dilation = dilation_ratio * image_diagonal
+    :return: boundary mask (numpy array)
+    """
+    h, w = mask.shape
+    img_diag = np.sqrt(h ** 2 + w ** 2)
+    dilation = int(round(dilation_ratio * img_diag))
+    if dilation < 1:
+        dilation = 1
+    # Pad image so mask truncated by the image border is also considered as boundary.
+    new_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
+    kernel = np.ones((3, 3), dtype=np.uint8)
+    new_mask_erode = cv2.erode(new_mask, kernel, iterations=dilation)
+    mask_erode = new_mask_erode[1 : h + 1, 1 : w + 1]
+    # G_d intersects G in the paper.
+    return mask - mask_erode
+
+
+def setup_cfg(args):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    add_entity_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    default_setup(cfg, args)
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
+    cfg.freeze()
+    return cfg
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
+    parser.add_argument(
+        "--config-file",
+        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    parser.add_argument(
+        "--input",
+        nargs="+",
+        help="A list of space separated input images; "
+        "or a single glob pattern such as 'directory/*.jpg'",
+    )
+    parser.add_argument(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+
+    parser.add_argument(
+        "--confidence-threshold",
+        type=float,
+        default=0.2,
+        help="Minimum score for instance predictions to be shown",
+    )
+
+    parser.add_argument(
+        "opts",
+        help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. "
+        "See config references at "
+        "https://detectron2.readthedocs.io/modules/config.html#config-references",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    setup_logger(name="fvcore")
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    cfg = setup_cfg(args)
+
+    demo = VisualizationDemo(cfg)
+    colors = make_colors()
+
+    if args.input:
+        if len(args.input) == 1:
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            data = demo.run_on_image_wo_vis(img)
+            logger.info(
+                "{}: {} in {:.2f}s".format(
+                    path,
+                    "detected {} instances".format(len(data[0])),
+                    time.time() - start_time,
+                )
+            )
+
+            if os.path.isdir(args.output):
+                assert os.path.isdir(args.output), args.output
+                out_filename = os.path.join(args.output, os.path.basename(path))
+            else:
+                assert len(args.input) == 1, "Please specify a directory with args.output"
+                out_filename = args.output
+            ## save inference result, [0] original score by detection head, [1] mask rescoring score, [2] mask_id
+            ori_scores = data[0]
+            scores = data[1]
+            mask_id = data[2]
+            np.savez(out_filename.split(".")[0]+".npz", ori_scores=ori_scores, scores=scores, mask_id=mask_id)
+
+            ## save visualization
+            img_for_paste = copy.deepcopy(img)
+            color_mask = copy.deepcopy(img)
+            masks_edge = np.zeros(img.shape[:2], dtype=np.uint8)
+            alpha = 0.4
+            count = 0
+            for index, score in enumerate(scores):
+                if score <= args.confidence_threshold:
+                    break
+                color_mask[mask_id==count] = colors[count]
+                boundary = mask_to_boundary((mask_id==count).astype(np.uint8))
+                masks_edge[boundary>0] = 1
+                count += 1
+            img_wm = cv2.addWeighted(img_for_paste, alpha, color_mask, 1-alpha, 0)
+            img_wm[masks_edge==1] = 0
+            fvis = np.concatenate((img, img_wm))
+            cv2.imwrite(out_filename.split(".")[0]+".jpg",fvis)
+
+
+
+
+
+
+
+
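Given the parser above, a typical invocation looks like the following (config and weight paths are illustrative; MODEL.WEIGHTS is passed through the trailing opts):

    python demo_result_and_vis.py \
        --config-file configs/entity_swin_lw7_1x.yaml \
        --input "images/*.jpg" \
        --output results/ \
        --confidence-threshold 0.3 \
        MODEL.WEIGHTS pretrained_model/entity_swin_lw7_1x.pth

Per the main loop, each input image then yields an .npz holding the detection-head scores, the rescored mask scores, and the flattened mask-id map, plus a .jpg stacking the original image over the colored, edge-outlined overlay.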
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .arch import EntityFPN
+from .data import *
+from .config import add_entity_config
+from .evaluator.entity_evaluation import COCOEvaluator_ClassAgnostic
+from .backbone import build_retinanet_swin_fpn_backbone, build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py ADDED
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+import logging
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from detectron2.structures import ImageList
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
+from detectron2.modeling.proposal_generator import build_proposal_generator
+from detectron2.modeling.roi_heads import build_roi_heads
+from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+
+from .det_head.detection import build_det_head
+from .det_head.utils.comm import aligned_bilinear
+
+from .mask_head.dynamic_mask_head import build_dynamic_mask_head
+from .mask_head.mask_branch import build_mask_branch
+
+from .panopticfcn_tools.panopticfcn_head import build_kernel_head
+
+from detectron2.structures import Instances, Boxes
+import random
+import pdb
+import copy
+logger = logging.getLogger(__name__)
+
+__all__ = ["ItemFPN"]
+@META_ARCH_REGISTRY.register()
+class EntityFPN(nn.Module):
+    """
+    Implement the paper :paper:`PanopticFPN`.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.backbone = build_backbone(cfg)
+        backbone_shape = self.backbone.output_shape()
+        self.det_head = build_det_head(cfg, backbone_shape)
+
+        ## mask
+        self.mask_head = build_dynamic_mask_head(cfg)
+        self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
+        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
+        self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS
+        self.only_class_agnostic = cfg.MODEL.CONDINST.CLASS_AGNOSTIC
+
+        in_channels = self.det_head.in_channels_to_top_module
+
+        self.controller = build_kernel_head(cfg, self.mask_head.num_gen_params)
+        self.train_max_proposals_per_image = cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE
+
+        self.use_mask_rescore_infer = cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE
+
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+
+        self.pixel_mean = pixel_mean
+        self.pixel_std = pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                * "image": Tensor, image in (C, H, W) format.
+                * "instances": Instances
+                * "sem_seg": semantic segmentation ground truth.
+                * Other information that's included in the original dicts, such as:
+                  "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                each dict is the results for one image. The dict contains the following keys:
+
+                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
+                  See the return value of
+                  :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+
+        # for x in batched_inputs:
+        #     print(x["file_name"])
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0] and self.training:
+            B = len(batched_inputs)
+            for i in range(B):
+                if self.only_class_agnostic:
+                    batched_inputs[i]["instances"].gt_classes[:] = 0
+
+                instance_map = batched_inputs[i]["instance_map"]
+                num_instances = int(torch.max(instance_map)+1)
+                instanceid = batched_inputs[i]["instances"].instanceid
+                gt_bitmasks_pad = F.one_hot(instance_map.long(), num_instances)[...,instanceid].permute((2,0,1))
+
+                pad_h, pad_w = images.tensor.size(-2), images.tensor.size(-1)
+                no_pad_h, no_pad_w = gt_bitmasks_pad.shape[1:]
+
+                padding_size = [0, pad_w - no_pad_w, 0, pad_h-no_pad_h]
+                gt_bitmasks_pad = F.pad(gt_bitmasks_pad, padding_size, value=0)
+
+                start = int(self.mask_out_stride // 2)
+                bitmask_full = gt_bitmasks_pad.clone()
+                bitmask = gt_bitmasks_pad[:,start::self.mask_out_stride, start::self.mask_out_stride]
+
+                N = bitmask.shape[0]
+                batched_inputs[i]["instances"].gt_bitmasks = bitmask.int()
+                batched_inputs[i]["instances"].gt_bitmasks_full = bitmask_full.int()
+
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        mask_feats = self.mask_branch(features, gt_instances)
+        proposals, proposal_losses = self.det_head(images, features, gt_instances, self.controller)
+
+        if self.training:
+            max_num_proposals = self.train_max_proposals_per_image * len(batched_inputs)
+            actual_num_proposals = len(proposals["instances"])
+            if actual_num_proposals >= max_num_proposals:
+                select = random.sample(list(range(actual_num_proposals)), max_num_proposals)
+                proposals["instances"] = proposals["instances"][select]
+
+            loss_masks = self._forward_mask_heads_train(proposals, mask_feats, gt_instances)
+            losses = {}
+            losses.update(proposal_losses)
+            losses.update(loss_masks)
+            return losses
+        else:
+            pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats)
+            padded_im_h, padded_im_w = images.tensor.size()[-2:]
+            processed_results = []
+            for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images.image_sizes)):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+
+                instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id]
+                instances_per_im = self.postprocess(
+                    instances_per_im, height, width,
+                    padded_im_h, padded_im_w
+                )
+
+                processed_results.append({
+                    "instances": instances_per_im
+                })
+
+            return processed_results
+
+    def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances):
+        # prepare the inputs for mask heads
+        pred_instances = proposals["instances"]
+
+        if 0 <= self.max_proposals < len(pred_instances):
+            inds = torch.randperm(len(pred_instances), device=mask_feats.device).long()
+            logger.info("clipping proposals from {} to {}".format(
+                len(pred_instances), self.max_proposals
+            ))
+            pred_instances = pred_instances[inds[:self.max_proposals]]
+
+        pred_instances.mask_head_params = pred_instances.top_feats
+
+        loss_masks = self.mask_head(
+            mask_feats, self.mask_branch.out_stride,
+            pred_instances, gt_instances
+        )
+        return loss_masks
+
+    def _forward_mask_heads_test(self, proposals, mask_feats):
+        # prepare the inputs for mask heads
+        for im_id, per_im in enumerate(proposals):
+            per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id
+        pred_instances = Instances.cat(proposals)
+        pred_instances.mask_head_params = pred_instances.top_feat
+
+        pred_instances_w_masks = self.mask_head(mask_feats, self.mask_branch.out_stride, pred_instances)
+
+        return pred_instances_w_masks
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+    def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5):
+        """
+        Resize the output instances.
+        The input images are often resized when entering an object detector.
+        As a result, we often need the outputs of the detector in a different
+        resolution from its inputs.
+        This function will resize the raw outputs of an R-CNN detector
+        to produce outputs according to the desired output resolution.
+        Args:
+            results (Instances): the raw outputs from the detector.
+                `results.image_size` contains the input image resolution the detector sees.
+                This object might be modified in-place.
+            output_height, output_width: the desired output resolution.
+        Returns:
+            Instances: the resized output from the model, based on the output resolution
+        """
+        scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+        resized_im_h, resized_im_w = results.image_size
+        results = Instances((output_height, output_width), **results.get_fields())
+
+        if results.has("pred_boxes"):
+            output_boxes = results.pred_boxes
+        elif results.has("proposal_boxes"):
+            output_boxes = results.proposal_boxes
+
+        output_boxes.scale(scale_x, scale_y)
+        output_boxes.clip(results.image_size)
+        results = results[output_boxes.nonempty()]
+
+        if results.has("pred_global_masks"):
+            mask_h, mask_w = results.pred_global_masks.size()[-2:]
+            factor_h = padded_im_h // mask_h
+            factor_w = padded_im_w // mask_w
+            assert factor_h == factor_w
+            factor = factor_h
+            pred_global_masks = aligned_bilinear(
+                results.pred_global_masks, factor
+            )
+            pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w]
+            pred_global_masks = F.interpolate(
+                pred_global_masks,
+                size=(output_height, output_width),
+                mode="bilinear", align_corners=False
+            )
+            pred_global_masks = pred_global_masks[:, 0, :, :]
+            results.pred_masks = (pred_global_masks > mask_threshold).float()
+            results.pred_masks_score = pred_global_masks
+
+        # from high score to low score
+        origin_masks = results.pred_masks
+        num_instances, H, W = origin_masks.shape
+        filter_masks = []
+
+        # initialize background
+        mask_0 = torch.zeros((H, W)).cuda() + 0.001
+        filter_masks.insert(0, mask_0)
+        score = 0.002
+        for index in range(num_instances):
+            mask = origin_masks[num_instances-index-1]
+            mask[mask==1] = score
+            filter_masks.insert(0, mask)
+            score = score + 0.001
+
+        filter_masks = torch.stack(filter_masks, dim=0)
+        _, instance_ids = torch.max(filter_masks, dim=0)
+        unique_instance_ids = torch.unique(instance_ids)
+
+        ori_scores = results.scores.clone()
+        has_mask_valid = []
+        for instance_id in unique_instance_ids:
+            if instance_id == num_instances:
+                continue
+            mask = (instance_ids==instance_id).float()
+            finds_y, finds_x = torch.nonzero(mask==1, as_tuple=True)
+            if len(finds_y) == 0:
+                continue
+            x1 = torch.min(finds_x)
+            x2 = torch.max(finds_x)
+            y1 = torch.min(finds_y)
+            y2 = torch.max(finds_y)
+
+            if x2-x1==0 or y2-y1==0:
+                continue
+            has_mask_valid.append(int(instance_id))
+
+            ## mask rescoring would obtain higher performance
+            if self.use_mask_rescore_infer:
+                mask_score = results.pred_masks_score[instance_id]
+                seg_scores = (mask_score * mask).sum() / mask.sum()
+                results.scores[instance_id] = results.scores[instance_id] * seg_scores
+
+            results.pred_masks[instance_id] = mask
+            results.pred_boxes.tensor[instance_id][0] = x1
+            results.pred_boxes.tensor[instance_id][1] = y1
+            results.pred_boxes.tensor[instance_id][2] = x2
+            results.pred_boxes.tensor[instance_id][3] = y2
+
+        results.ori_scores = ori_scores
+        results = results[has_mask_valid]
+        return results
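The tail of postprocess above is worth flagging: it turns possibly-overlapping per-instance masks into a single non-overlapping assignment by stacking score-coded mask planes plus a low-valued background plane and taking a pixelwise argmax, so each pixel belongs to at most one instance. A minimal standalone sketch of the same idea (the function name and integer priority coding are ours, not from the repo):

    import torch

    def flatten_masks(masks: torch.Tensor) -> torch.Tensor:
        # masks: (N, H, W) binary masks, sorted from highest to lowest score.
        n, h, w = masks.shape
        # Re-code each mask with a rank-based priority so higher-scoring
        # instances win overlaps, mirroring the 0.001-spaced score planes
        # in postprocess(); 0.5 plays the role of the background plane.
        priorities = torch.arange(n, 0, -1, dtype=torch.float32).view(n, 1, 1)
        stack = torch.cat([masks.float() * priorities,
                           torch.full((1, h, w), 0.5)], dim=0)
        # argmax assigns every pixel one id; id == n means background.
        return stack.argmax(dim=0)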
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .swin import build_retinanet_swin_fpn_backbone
+from .mixvision import build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from functools import partial
|
| 5 |
+
|
| 6 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
| 7 |
+
from timm.models.registry import register_model
|
| 8 |
+
from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
|
| 9 |
+
import math
|
| 10 |
+
|
| 11 |
+
from detectron2.layers import ShapeSpec
|
| 12 |
+
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
|
| 13 |
+
|
| 14 |
+
class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        B, N, C = x.shape
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        else:
            kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).contiguous().reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

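
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# What sr_ratio buys in the Attention class above: with sr_ratio=8 the
# key/value grid is convolved down 8x per side, so the attention matrix
# shrinks by 64x while the query (and output) length stays H*W. The values
# dim=64, num_heads=1 and the 32x32 map are arbitrary assumptions for the demo.
def _demo_sr_attention():
    attn = Attention(dim=64, num_heads=1, sr_ratio=8)
    x = torch.rand(1, 32 * 32, 64)          # (B, N, C) with N = H*W = 1024
    out = attn(x, 32, 32)                   # K/V internally reduced to 4*4 = 16 tokens
    assert out.shape == (1, 32 * 32, 64)    # output keeps the full token length
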
class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))

        return x

class OverlapPatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
                              padding=(patch_size[0] // 2, patch_size[1] // 2))
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W

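
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# "Overlap" comes from kernel > stride: a 7x7 kernel with stride 4 and
# padding 3 tiles the image on a stride-4 grid while neighboring patches
# share pixels. The sizes below are demo assumptions.
def _demo_overlap_patch_embed():
    pe = OverlapPatchEmbed(img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=32)
    x, H, W = pe(torch.rand(1, 3, 224, 224))
    assert (H, W) == (56, 56)               # (224 + 2*3 - 7) // 4 + 1
    assert x.shape == (1, 56 * 56, 32)      # flattened token sequence
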
class MixVisionTransformer(Backbone):
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
                                              embed_dim=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
                                              embed_dim=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
                                              embed_dim=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
                                              embed_dim=embed_dims[3])

        # transformer encoder
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0
        self.block1 = nn.ModuleList([Block(
            dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[0])
            for i in range(depths[0])])
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = nn.ModuleList([Block(
            dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[1])
            for i in range(depths[1])])
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = nn.ModuleList([Block(
            dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[2])
            for i in range(depths[2])])
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = nn.ModuleList([Block(
            dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[3])
            for i in range(depths[3])])
        self.norm4 = norm_layer(embed_dims[3])

        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

        # freeze
        for p in self.patch_embed1.parameters():
            p.requires_grad = False
        for p in self.block1.parameters():
            p.requires_grad = False
        for p in self.norm1.parameters():
            p.requires_grad = False

        outs = self.forward(torch.rand(1, 3, 224, 224).float())
        self.output_shapes = dict()
        self._size_divisibility = 0
        for i, f in enumerate(outs):
            self.output_shapes[f] = ShapeSpec(
                channels=outs[f].shape[1], stride=224 // outs[f].shape[2]
            )
            if i == (len(outs) - 1):
                self._size_divisibility = 224 // outs[f].shape[2]

        self.train()

    def output_shape(self):
        return self.output_shapes

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def reset_drop_path(self, drop_path_rate):
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for i in range(self.depths[0]):
            self.block1[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[0]
        for i in range(self.depths[1]):
            self.block2[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[1]
        for i in range(self.depths[2]):
            self.block3[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[2]
        for i in range(self.depths[3]):
            self.block4[i].drop_path.drop_prob = dpr[cur + i]

    def freeze_patch_emb(self):
        self.patch_embed1.requires_grad = False

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        B = x.shape[0]
        outs = dict()

        # stage 1
        x, H, W = self.patch_embed1(x)
        for i, blk in enumerate(self.block1):
            x = blk(x, H, W)
        x = self.norm1(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit1"] = x

        # stage 2
        x, H, W = self.patch_embed2(x)
        for i, blk in enumerate(self.block2):
            x = blk(x, H, W)
        x = self.norm2(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit2"] = x

        # stage 3
        x, H, W = self.patch_embed3(x)
        for i, blk in enumerate(self.block3):
            x = blk(x, H, W)
        x = self.norm3(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit3"] = x

        # stage 4
        x, H, W = self.patch_embed4(x)
        for i, blk in enumerate(self.block4):
            x = blk(x, H, W)
        x = self.norm4(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit4"] = x

        return outs

    def forward(self, x):
        x = self.forward_features(x)
        # x = self.head(x)

        return x

class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).contiguous().view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)

        return x

class mit_b0(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b0, self).__init__(
            patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b1(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b1, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b2(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b2, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b3(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b3, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b4(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b4, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b5(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b5, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)

@BACKBONE_REGISTRY.register()
def build_mit_backbone(cfg, input_shape):
    if cfg.MODEL.MIT_BACKBONE.NAME == "b0":
        return mit_b0()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b1":
        return mit_b1()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b2":
        return mit_b2()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b3":
        return mit_b3()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b4":
        return mit_b4()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b5":
        return mit_b5()
    # fail loudly instead of silently returning None on an unknown variant
    raise ValueError(f"Unknown MODEL.MIT_BACKBONE.NAME: {cfg.MODEL.MIT_BACKBONE.NAME}")

@BACKBONE_REGISTRY.register()
def build_retinanet_mit_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_mit_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    in_channels_top = out_channels
    top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
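
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# A minimal sanity check of the MiT backbone above. mit_b0() runs a dummy
# 224x224 forward pass in __init__, so construction alone populates
# output_shape(); the channel widths asserted here come from the
# embed_dims=[32, 64, 160, 256] in the mit_b0 definition.
def _demo_mit_backbone():
    m = mit_b0()
    feats = m(torch.rand(1, 3, 224, 224))
    # four pyramid levels at strides 4 / 8 / 16 / 32
    assert [feats[k].shape[1] for k in ("mit1", "mit2", "mit3", "mit4")] == [32, 64, 160, 256]
    assert m.output_shape()["mit4"].stride == 32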
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py
ADDED

@@ -0,0 +1,723 @@
# --------------------------------------------------------
# Swin Transformer
# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
# --------------------------------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
import numpy as np
import fvcore.nn.weight_init as weight_init  # needed by LastLevelP6 below; missing from the original imports
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from detectron2.modeling.backbone import Backbone
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
from detectron2.layers import ShapeSpec

class Mlp(nn.Module):
    """ Multilayer perceptron."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

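
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# window_partition / window_reverse are exact inverses when H and W are
# multiples of the window size; they only view/permute, so the roundtrip is
# bit-exact. The sizes below are arbitrary assumptions for the demo.
def _demo_window_roundtrip():
    x = torch.rand(2, 14, 14, 32)       # (B, H, W, C)
    wins = window_partition(x, 7)       # 2 images * 2*2 windows each
    assert wins.shape == (8, 7, 7, 32)
    assert torch.equal(window_reverse(wins, 7, 14, 14), x)
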
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

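
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# The bias table above has one row per possible (dy, dx) offset between two
# tokens in a window, i.e. (2*Wh-1)*(2*Ww-1) rows and one column per head;
# relative_position_index maps every token pair into that table.
def _demo_relative_bias_table():
    attn = WindowAttention(dim=32, window_size=to_2tuple(7), num_heads=4)
    assert attn.relative_position_bias_table.shape == (13 * 13, 4)  # (2*7-1)**2 = 169 offsets
    assert attn.relative_position_index.shape == (49, 49)           # 7*7 tokens, pairwise
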
class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            mask_matrix: Attention mask for cyclic shift.
        The spatial resolution H, W is read from self.H / self.W, which the
        enclosing BasicLayer sets before calling each block.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

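
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# The shifted-window path relies on torch.roll being lossless: rolling by
# -shift and then +shift restores the tensor exactly, so SW-MSA pays only
# for the attention mask, never for any resampling.
def _demo_cyclic_shift():
    x = torch.arange(16.).view(1, 4, 4, 1)
    shifted = torch.roll(x, shifts=(-2, -2), dims=(1, 2))
    assert torch.equal(torch.roll(shifted, shifts=(2, 2), dims=(1, 2)), x)
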
class PatchMerging(nn.Module):
    """ Patch Merging Layer
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.view(B, H, W, C)

        # padding
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

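
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# PatchMerging concatenates each 2x2 neighborhood (4*C channels) and linearly
# projects it down to 2*C, halving the spatial grid between stages.
def _demo_patch_merging():
    merge = PatchMerging(dim=32)
    y = merge(torch.rand(1, 8 * 8, 32), 8, 8)
    assert y.shape == (1, 4 * 4, 64)    # spatial /2 per axis, channels x2
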
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.
    Args:
        dim (int): Number of feature channels
        depth (int): Depth of this stage.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """

        # calculate attention mask for SW-MSA
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W

class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Forward function."""
        # padding
        _, _, H, W = x.size()
        if W % self.patch_size[1] != 0:
            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
        if H % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is not None:
            Wh, Ww = x.size(2), x.size(3)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

        return x

class SwinTransformer(Backbone):
    """ Swin Transformer backbone.
    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
    https://arxiv.org/pdf/2103.14030
    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention heads of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_features (Sequence[str]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 frozen_stages=-1,
                 use_checkpoint=False,
                 out_features=None):
        super(SwinTransformer, self).__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.frozen_stages = frozen_stages

        self.out_features = out_features

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        self._out_feature_strides = {}
        self._out_feature_channels = {}

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

            stage = f'stage{i_layer+2}'
            if stage in self.out_features:
                self._out_feature_channels[stage] = embed_dim * 2 ** i_layer
                self._out_feature_strides[stage] = 4 * 2 ** i_layer

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in range(self.num_layers):
            stage = f'stage{i_layer+2}'
            if stage in self.out_features:
                layer = norm_layer(num_features[i_layer])
                layer_name = f'norm{i_layer}'
                self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.
        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if isinstance(m, nn.Linear) and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        self.apply(_init_weights)

    def forward(self, x):
        """Forward function."""
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = {}
        for i in range(self.num_layers):
            layer = self.layers[i]
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            name = f'stage{i+2}'
            if name in self.out_features:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs[name] = out

        return outs  # {"stage%d" % (i+2,): out for i, out in enumerate(outs)}  # tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen stages frozen."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self.out_features
        }

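
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# Builds a Swin-T sized backbone directly with the defaults above and checks
# the detectron2-style multi-scale output dict. At 224x224 every stage grid
# is a multiple of the window size, so no padding path is exercised.
def _demo_swin_backbone():
    swin = SwinTransformer(out_features=["stage2", "stage3", "stage4", "stage5"])
    feats = swin(torch.rand(1, 3, 224, 224))
    assert feats["stage2"].shape == (1, 96, 56, 56)    # stride 4, embed_dim 96
    assert feats["stage5"].shape == (1, 768, 7, 7)     # stride 32, 8 * embed_dim
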
@BACKBONE_REGISTRY.register()
def build_swin_backbone(cfg, input_shape):
    """
    Create a SwinTransformer instance from config.
    Returns:
        SwinTransformer: a :class:`SwinTransformer` instance.
    """
    out_features = cfg.MODEL.SWINT.OUT_FEATURES

    return SwinTransformer(
        patch_size=cfg.MODEL.SWINT.PATCH_SIZE,
        in_chans=input_shape.channels,
        embed_dim=cfg.MODEL.SWINT.EMBED_DIM,
        depths=cfg.MODEL.SWINT.DEPTHS,
        num_heads=cfg.MODEL.SWINT.NUM_HEADS,
        window_size=cfg.MODEL.SWINT.WINDOW_SIZE,
        mlp_ratio=cfg.MODEL.SWINT.MLP_RATIO,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=cfg.MODEL.SWINT.DROP_PATH_RATE,
        norm_layer=nn.LayerNorm,
        ape=cfg.MODEL.SWINT.APE,
        patch_norm=True,
        frozen_stages=cfg.MODEL.BACKBONE.FREEZE_AT,
        out_features=out_features
    )

@BACKBONE_REGISTRY.register()
def build_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_swin_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone

class LastLevelP6(nn.Module):
    """
    This module is used in FCOS to generate extra layers
    """

    def __init__(self, in_channels, out_channels, in_features="res5"):
        super().__init__()
        self.num_levels = 1
        self.in_feature = in_features
        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
        for module in [self.p6]:
            weight_init.c2_xavier_fill(module)

    def forward(self, x):
        p6 = self.p6(x)
        return [p6]

@BACKBONE_REGISTRY.register()
def build_retinanet_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_swin_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    top_levels = cfg.MODEL.FPN.TOP_LEVELS
    in_channels_top = out_channels
    if top_levels == 2:
        top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    elif top_levels == 1:  # was a bare `if`, leaving top_block unbound for unexpected values
        top_block = LastLevelP6(in_channels_top, out_channels, "p5")
    elif top_levels == 0:
        top_block = None
    else:
        raise ValueError(f"Unsupported MODEL.FPN.TOP_LEVELS: {top_levels}")
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
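
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# One plausible way to wire these registered builders up through detectron2's
# config system; add_entity_config lives in entityseg/config.py (next file),
# and the stage names match the cfg.MODEL.SWINT.OUT_FEATURES defined there.
# The import path assumes the vendored layout.
def _demo_build_swin_fpn():
    from detectron2.config import get_cfg
    from detectron2.modeling import build_backbone
    from entityseg.config import add_entity_config  # assumed import path

    cfg = get_cfg()
    add_entity_config(cfg)
    cfg.MODEL.BACKBONE.NAME = "build_swin_fpn_backbone"
    cfg.MODEL.FPN.IN_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
    backbone = build_backbone(cfg)      # FPN wrapped around the SwinTransformer
    return backbone.output_shape()      # p2..p6 ShapeSpec dict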
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py
ADDED

@@ -0,0 +1,102 @@
from detectron2.config import CfgNode as CN

def add_entity_config(cfg):
    """
    Add config for EntitySeg.
    """
    ## FCOS Hyper-Parameters
    cfg.MODEL.FCOS = CN()

    # Anchor parameters
    cfg.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
    cfg.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
    cfg.MODEL.FCOS.NUM_CLASSES = 1
    cfg.MODEL.FCOS.SIZES_OF_INTEREST = [[-1, 64], [64, 128], [128, 256], [256, 512], [512, 100000000]]

    # tower
    cfg.MODEL.FCOS.NUM_CLS_CONVS = 4
    cfg.MODEL.FCOS.NUM_BOX_CONVS = 4
    cfg.MODEL.FCOS.NUM_SHARE_CONVS = 0
    cfg.MODEL.FCOS.CENTER_SAMPLE = True
    cfg.MODEL.FCOS.POS_RADIUS = 1.5
    cfg.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
    cfg.MODEL.FCOS.USE_RELU = True
    cfg.MODEL.FCOS.USE_DEFORMABLE = False
    cfg.MODEL.FCOS.USE_SCALE = True
    cfg.MODEL.FCOS.TOP_LEVELS = 2
    cfg.MODEL.FCOS.NORM = "GN"

    # loss
    cfg.MODEL.FCOS.PRIOR_PROB = 0.01
    cfg.MODEL.FCOS.LOSS_ALPHA = 0.25
    cfg.MODEL.FCOS.LOSS_GAMMA = 2.0
    cfg.MODEL.FCOS.FB_RATIO = 4.0
    cfg.MODEL.FCOS.CENTER_SAMPLE = True
    cfg.MODEL.FCOS.YIELD_PROPOSAL = False

    # inference
    cfg.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
    cfg.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
    cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
    cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
    cfg.MODEL.FCOS.NMS_TH = 0.6
    cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
    cfg.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
    cfg.MODEL.FCOS.THRESH_WITH_CTR = False

    ## CONDINST Hyper-Parameters
    cfg.MODEL.CONDINST = CN()
    # the downsampling ratio of the final instance masks to the input image
    cfg.MODEL.CONDINST.MASK_OUT_STRIDE = 4
    cfg.MODEL.CONDINST.MAX_PROPOSALS = 500
    cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE = 120
    cfg.MODEL.CONDINST.LOW_LEVEL_DIMENSION = 16
    cfg.MODEL.CONDINST.CLASS_AGNOSTIC = False

    cfg.MODEL.CONDINST.MASK_HEAD = CN()
    cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS = 8
    cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS = 3
    cfg.MODEL.CONDINST.MASK_HEAD.USE_FP16 = False
    cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS = False
    cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT = 1.0
    cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC = ["111", "110"]
    cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT = [1.0, 1.0]

    cfg.MODEL.CONDINST.MASK_BRANCH = CN()
    cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS = 8
    cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES = ["p3", "p4", "p5"]
    cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS = 128
    cfg.MODEL.CONDINST.MASK_BRANCH.NORM = "BN"
    cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS = 4
    cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON = False
    cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE = False

    ## kernel head
    cfg.MODEL.KERNEL_HEAD = CN()
    cfg.MODEL.KERNEL_HEAD.NUM_CONVS = 3
    cfg.MODEL.KERNEL_HEAD.DEFORM = False
    cfg.MODEL.KERNEL_HEAD.COORD = True
    cfg.MODEL.KERNEL_HEAD.CONVS_DIM = 256
    cfg.MODEL.KERNEL_HEAD.NORM = "GN"

    ## swin transformer backbone
    cfg.MODEL.SWINT = CN()
    cfg.MODEL.SWINT.EMBED_DIM = 96
    cfg.MODEL.SWINT.PATCH_SIZE = 4
    cfg.MODEL.SWINT.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
    cfg.MODEL.SWINT.DEPTHS = [2, 2, 6, 2]
    cfg.MODEL.SWINT.NUM_HEADS = [3, 6, 12, 24]
    cfg.MODEL.SWINT.WINDOW_SIZE = 7
    cfg.MODEL.SWINT.MLP_RATIO = 4
    cfg.MODEL.SWINT.DROP_PATH_RATE = 0.2
    cfg.MODEL.SWINT.APE = False

    # addition: extra FPN top levels
    cfg.MODEL.FPN.TOP_LEVELS = 2

    ## MiT (mix transformer) backbone
    cfg.MODEL.MIT_BACKBONE = CN()
    cfg.MODEL.MIT_BACKBONE.NAME = "b0"

    cfg.SOLVER.OPTIMIZER = "sgd"
    cfg.TEST.CLASS_AGNOSTIC = True
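
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# add_entity_config only appends keys to detectron2's default config tree,
# so the usual CfgNode workflow still applies afterwards.
def _demo_entity_config():
    from detectron2.config import get_cfg
    cfg = get_cfg()
    add_entity_config(cfg)
    assert cfg.MODEL.FCOS.NUM_CLASSES == 1                     # entity segmentation is class-agnostic
    cfg.merge_from_list(["MODEL.MIT_BACKBONE.NAME", "b2"])     # overrides work as usual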
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py
ADDED

File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py
ADDED
@@ -0,0 +1,112 @@
import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.structures import ImageList
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.backbone import build_backbone
from detectron2.layers import ShapeSpec
from detectron2.modeling.postprocessing import detector_postprocess

from .layers import DFConv2d, IOULoss
# from .outputs_has_ignore import FCOSOutputs
from .outputs import FCOSOutputs
from .tower import FCOSHead

import pdb
import cv2

INF = 100000000

class FCOS(nn.Module):
    def __init__(self, cfg, backbone_shape):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)
        self.in_features = cfg.MODEL.FCOS.IN_FEATURES
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
        self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL

        feature_shapes = [backbone_shape[f] for f in self.in_features]
        self.fcos_head = FCOSHead(cfg, feature_shapes)
        self.in_channels_to_top_module = self.fcos_head.in_channels_to_top_module
        self.fcos_outputs = FCOSOutputs(cfg)
        self.to(self.device)

    def forward_head(self, features, top_module=None):
        features = [features[f] for f in self.in_features]
        # FCOSHead.forward (tower.py) only accepts (x, top_module), so yield_proposal is not passed through.
        pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats = self.fcos_head(features, top_module)
        return pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats

    def forward(self, images, backbone_features, gt_instances, top_module=None):
        """
        Arguments:
            images (list[Tensor] or ImageList): images to be processed
            targets (list[BoxList]): ground-truth boxes present in the image (optional)
        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).
        """
        features = [backbone_features[f] for f in self.in_features]
        locations = self.compute_locations(features)
        logits_pred, reg_pred, ctrness_pred, bbox_towers, top_feats = self.fcos_head(features, top_module)

        results = {}
        if self.yield_proposal:
            results["features"] = {
                f: b for f, b in zip(self.in_features, bbox_towers)
            }

        if self.training:
            results, losses = self.fcos_outputs.losses(
                logits_pred, reg_pred, ctrness_pred,
                locations, gt_instances, top_feats
            )

            if self.yield_proposal:
                with torch.no_grad():
                    results["proposals"] = self.fcos_outputs.predict_proposals(
                        logits_pred, reg_pred, ctrness_pred,
                        locations, images.image_sizes, top_feats
                    )
            return results, losses
        else:
            results = self.fcos_outputs.predict_proposals(
                logits_pred, reg_pred, ctrness_pred,
                locations, images.image_sizes, top_feats
            )

            return results, {}

    def compute_locations(self, features):
        locations = []
        for level, feature in enumerate(features):
            h, w = feature.size()[-2:]
            locations_per_level = self.compute_locations_per_level(
                h, w, self.fpn_strides[level],
                feature.device
            )
            locations.append(locations_per_level)
        return locations

    def compute_locations_per_level(self, h, w, stride, device):
        shifts_x = torch.arange(
            0, w * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shifts_y = torch.arange(
            0, h * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
        return locations

def build_det_head(cfg, backbone_shape):
    return FCOS(cfg, backbone_shape)
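The location grid built by compute_locations_per_level is the core of FCOS's anchor-free design: each feature-map cell maps back to the input-image coordinate at its center. A standalone sketch of the same arithmetic for a 2x3 map at stride 8 (torch only, no assumptions beyond the code above):

import torch

h, w, stride = 2, 3, 8
shifts_x = torch.arange(0, w * stride, step=stride, dtype=torch.float32)
shifts_y = torch.arange(0, h * stride, step=stride, dtype=torch.float32)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
locations = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1) + stride // 2
print(locations)
# tensor([[ 4.,  4.],
#         [12.,  4.],
#         [20.,  4.],
#         [ 4., 12.],
#         [12., 12.],
#         [20., 12.]])
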
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .deform_conv import DFConv2d
from .iou_loss import IOULoss
from .ml_nms import ml_nms
from .conv_with_kaiming_uniform import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py
ADDED
@@ -0,0 +1,52 @@
from torch import nn

from detectron2.layers import Conv2d
from .deform_conv import DFConv2d
from detectron2.layers.batch_norm import get_norm


def conv_with_kaiming_uniform(
        norm=None, activation=None,
        use_deformable=False, use_sep=False):
    def make_conv(
        in_channels, out_channels, kernel_size, stride=1, dilation=1
    ):
        if use_deformable:
            conv_func = DFConv2d
        else:
            conv_func = Conv2d
        if use_sep:
            assert in_channels == out_channels
            groups = in_channels
        else:
            groups = 1
        conv = conv_func(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            groups=groups,
            bias=(norm is None)
        )
        if not use_deformable:
            # Caffe2 implementation uses XavierFill, which in fact
            # corresponds to kaiming_uniform_ in PyTorch
            nn.init.kaiming_uniform_(conv.weight, a=1)
            if norm is None:
                nn.init.constant_(conv.bias, 0)
        module = [conv,]
        if norm is not None and len(norm) > 0:
            if norm == "GN":
                norm_module = nn.GroupNorm(32, out_channels)
            else:
                norm_module = get_norm(norm, out_channels)
            module.append(norm_module)
        if activation is not None:
            module.append(nn.ReLU(inplace=True))
        if len(module) > 1:
            return nn.Sequential(*module)
        return conv

    return make_conv
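A usage sketch for the factory above (requires torch and detectron2 at runtime): norm="GN" attaches a 32-group GroupNorm and any non-None activation appends a ReLU, so the returned callable yields a Conv-GN-ReLU block.

import torch

conv_block = conv_with_kaiming_uniform(norm="GN", activation=True)
layer = conv_block(128, 128, kernel_size=3)  # nn.Sequential(Conv2d, GroupNorm, ReLU)
out = layer(torch.randn(1, 128, 32, 32))
print(out.shape)  # torch.Size([1, 128, 32, 32]); padding=1 preserves the spatial size
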
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py
ADDED
@@ -0,0 +1,111 @@
import torch
from torch import nn

from detectron2.layers import Conv2d

class _NewEmptyTensorOp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, new_shape):
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return _NewEmptyTensorOp.apply(grad, shape), None


class DFConv2d(nn.Module):
    """Deformable convolutional layer"""
    def __init__(
        self,
        in_channels,
        out_channels,
        with_modulated_dcn=True,
        kernel_size=3,
        stride=1,
        groups=1,
        dilation=1,
        deformable_groups=1,
        bias=False,
        padding=None
    ):
        super(DFConv2d, self).__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size
        if with_modulated_dcn:
            # ModulatedDeformConv is provided by detectron2.layers; a relative
            # import from this module itself would not resolve.
            from detectron2.layers import ModulatedDeformConv
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers import DeformConv
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv
        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        for l in [self.offset, ]:
            nn.init.kaiming_uniform_(l.weight, a=1)
            torch.nn.init.constant_(l.bias, 0.)
        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.offset_split = offset_base_channels * deformable_groups * 2

    def forward(self, x, return_offset=False):
        if x.numel() > 0:
            if not self.with_modulated_dcn:
                offset_mask = self.offset(x)
                x = self.conv(x, offset_mask)
            else:
                offset_mask = self.offset(x)
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            if return_offset:
                return x, offset_mask
            return x
        # get output shape
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self.padding,
                self.dilation,
                self.kernel_size,
                self.stride
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
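The empty-input branch above computes the standard convolution output size, floor((i + 2p - (d*(k - 1) + 1)) / s) + 1 per spatial dim, without touching the deformable kernels. A quick check of that arithmetic in plain Python:

def conv_out_size(i, p, d, k, s):
    # floor((input + 2*padding - (dilation*(kernel-1)+1)) / stride) + 1
    return (i + 2 * p - (d * (k - 1) + 1)) // s + 1

print(conv_out_size(32, 1, 1, 3, 1))  # 32: a padded 3x3 conv at stride 1 preserves size
print(conv_out_size(32, 1, 1, 3, 2))  # 16: stride 2 halves it
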
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py
ADDED
@@ -0,0 +1,54 @@
import torch
from torch import nn


class IOULoss(nn.Module):
    def __init__(self, loc_loss_type='iou'):
        super(IOULoss, self).__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None):
        pred_left = pred[:, 0]
        pred_top = pred[:, 1]
        pred_right = pred[:, 2]
        pred_bottom = pred[:, 3]

        target_left = target[:, 0]
        target_top = target[:, 1]
        target_right = target[:, 2]
        target_bottom = target[:, 3]

        target_area = (target_left + target_right) * \
                      (target_top + target_bottom)
        pred_area = (pred_left + pred_right) * \
                    (pred_top + pred_bottom)

        w_intersect = torch.min(pred_left, target_left) + \
                      torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
                      torch.min(pred_top, target_top)

        g_w_intersect = torch.max(pred_left, target_left) + \
                        torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
                        torch.max(pred_top, target_top)
        ac_union = g_w_intersect * g_h_intersect

        area_intersect = w_intersect * h_intersect
        area_union = target_area + pred_area - area_intersect

        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_union - area_union) / ac_union
        if self.loc_loss_type == 'iou':
            losses = -torch.log(ious)
        elif self.loc_loss_type == 'linear_iou':
            losses = 1 - ious
        elif self.loc_loss_type == 'giou':
            losses = 1 - gious
        else:
            raise NotImplementedError

        if weight is not None:
            return (losses * weight).sum()
        else:
            return losses.sum()
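Note that pred and target here are FCOS-style (left, top, right, bottom) distances from a shared location, not corner coordinates, which is why intersection widths are sums such as min(pred_left, target_left) + min(pred_right, target_right). A toy check with one location and the IOULoss class above in scope:

import torch

loss_fn = IOULoss(loc_loss_type='giou')
target = torch.tensor([[4.0, 4.0, 4.0, 4.0]])  # an 8x8 box centered on the location
pred = torch.tensor([[2.0, 4.0, 4.0, 4.0]])    # left edge predicted 2px too tight
print(loss_fn(pred, target))  # tensor(0.2462): smoothed IoU (48+1)/(64+1), zero GIoU penalty
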
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py
ADDED
@@ -0,0 +1,26 @@
from detectron2.layers import batched_nms


def ml_nms(boxlist, nms_thresh, max_proposals=-1,
           score_field="scores", label_field="labels"):
    """
    Performs non-maximum suppression on a boxlist, with scores specified
    in a boxlist field via score_field.

    Args:
        boxlist (detectron2.structures.Instances):
        nms_thresh (float):
        max_proposals (int): if > 0, then only the top max_proposals are kept
            after non-maximum suppression
        score_field (str):
    """
    if nms_thresh <= 0:
        return boxlist
    boxes = boxlist.pred_boxes.tensor
    scores = boxlist.scores
    labels = boxlist.pred_classes
    keep = batched_nms(boxes, scores, labels, nms_thresh)
    if max_proposals > 0:
        keep = keep[: max_proposals]
    boxlist = boxlist[keep]
    return boxlist
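A usage sketch with ml_nms above in scope (requires detectron2): two same-class boxes with IoU around 0.82 against a 0.6 threshold, so only the higher-scoring one survives.

import torch
from detectron2.structures import Instances, Boxes

inst = Instances((100, 100))
inst.pred_boxes = Boxes(torch.tensor([[10., 10., 50., 50.],
                                      [12., 12., 52., 52.]]))
inst.scores = torch.tensor([0.9, 0.8])
inst.pred_classes = torch.tensor([0, 0])
kept = ml_nms(inst, nms_thresh=0.6)
print(len(kept), kept.scores)  # 1 tensor([0.9000])
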
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py
ADDED
@@ -0,0 +1,489 @@
import logging
import torch
from torch import nn
import torch.nn.functional as F

from detectron2.layers import cat
from detectron2.structures import Instances, Boxes
from detectron2.utils.comm import get_world_size
from fvcore.nn import sigmoid_focal_loss_jit

from .utils import reduce_sum
from .layers import ml_nms, IOULoss
import pdb

logger = logging.getLogger(__name__)

INF = 100000000

def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
              (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(ctrness)

class FCOSOutputs(nn.Module):
    def __init__(self, cfg):
        super(FCOSOutputs, self).__init__()

        self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
        self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA
        self.center_sample = cfg.MODEL.FCOS.CENTER_SAMPLE
        self.radius = cfg.MODEL.FCOS.POS_RADIUS
        self.pre_nms_thresh_train = cfg.MODEL.FCOS.INFERENCE_TH_TRAIN
        self.pre_nms_topk_train = cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN
        self.post_nms_topk_train = cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN
        self.loc_loss_func = IOULoss(cfg.MODEL.FCOS.LOC_LOSS_TYPE)

        self.pre_nms_thresh_test = cfg.MODEL.FCOS.INFERENCE_TH_TEST
        self.pre_nms_topk_test = cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST
        self.post_nms_topk_test = cfg.MODEL.FCOS.POST_NMS_TOPK_TEST
        self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
        self.thresh_with_ctr = cfg.MODEL.FCOS.THRESH_WITH_CTR

        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
        self.strides = cfg.MODEL.FCOS.FPN_STRIDES

        self.sizes_of_interest = cfg.MODEL.FCOS.SIZES_OF_INTEREST

    def _transpose(self, training_targets, num_loc_list):
        '''
        This function is used to transpose image first training targets to level first ones
        :return: level first training targets
        '''
        for im_i in range(len(training_targets)):
            training_targets[im_i] = torch.split(
                training_targets[im_i], num_loc_list, dim=0
            )

        targets_level_first = []
        for targets_per_level in zip(*training_targets):
            targets_level_first.append(
                torch.cat(targets_per_level, dim=0)
            )
        return targets_level_first

    def _get_ground_truth(self, locations, gt_instances):
        num_loc_list = [len(loc) for loc in locations]

        # compute locations to size ranges
        loc_to_size_range = []
        for l, loc_per_level in enumerate(locations):
            loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
            loc_to_size_range.append(
                loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)
            )

        loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
        locations = torch.cat(locations, dim=0)

        training_targets = self.compute_targets_for_locations(
            locations, gt_instances, loc_to_size_range, num_loc_list
        )

        training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
        training_targets["im_inds"] = [locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))]

        # transpose im first training_targets to level first ones
        training_targets = {
            k: self._transpose(v, num_loc_list) for k, v in training_targets.items()
        }

        training_targets["fpn_levels"] = [
            loc.new_ones(len(loc), dtype=torch.long) * level
            for level, loc in enumerate(training_targets["locations"])
        ]

        # we normalize reg_targets by FPN's strides here
        reg_targets = training_targets["reg_targets"]
        for l in range(len(reg_targets)):
            reg_targets[l] = reg_targets[l] / float(self.strides[l])

        return training_targets

    def get_sample_region(self, boxes, strides, num_loc_list, loc_xs, loc_ys, bitmasks=None, radius=1):
        # pdb.set_trace()
        if bitmasks is not None:
            _, h, w = bitmasks.size()

            ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device)
            xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device)

            m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6)
            m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1)
            m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1)
            center_x = m10 / m00
            center_y = m01 / m00
            center_x = center_x.float()
            center_y = center_y.float()
        else:
            center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5
            center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
        # pdb.set_trace()
        num_gts = boxes.shape[0]
        K = len(loc_xs)
        boxes = boxes[None].expand(K, num_gts, 4)
        center_x = center_x[None].expand(K, num_gts)
        center_y = center_y[None].expand(K, num_gts)
        center_gt = boxes.new_zeros(boxes.shape)
        # no gt
        if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
            return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
        beg = 0
        for level, num_loc in enumerate(num_loc_list):
            end = beg + num_loc
            stride = strides[level] * radius
            xmin = center_x[beg:end] - stride
            ymin = center_y[beg:end] - stride
            xmax = center_x[beg:end] + stride
            ymax = center_y[beg:end] + stride
            # limit sample region in gt
            center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
            center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
            center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
            center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
            beg = end
        left = loc_xs[:, None] - center_gt[..., 0]
        right = center_gt[..., 2] - loc_xs[:, None]
        top = loc_ys[:, None] - center_gt[..., 1]
        bottom = center_gt[..., 3] - loc_ys[:, None]
        center_bbox = torch.stack((left, top, right, bottom), -1)
        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
        return inside_gt_bbox_mask

    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
        labels = []
        reg_targets = []
        target_inds = []
        xs, ys = locations[:, 0], locations[:, 1]

        num_targets = 0
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            bboxes = targets_per_im.gt_boxes.tensor
            labels_per_im = targets_per_im.gt_classes

            # no gt
            if bboxes.numel() == 0:
                labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
                reg_targets.append(locations.new_zeros((locations.size(0), 4)))
                target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
                continue

            area = targets_per_im.gt_boxes.area()

            l = xs[:, None] - bboxes[:, 0][None]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None]
            b = bboxes[:, 3][None] - ys[:, None]
            reg_targets_per_im = torch.stack([l, t, r, b], dim=2)

            if self.center_sample:
                if targets_per_im.has("gt_bitmasks_full"):
                    bitmasks = targets_per_im.gt_bitmasks_full
                else:
                    bitmasks = None
                is_in_boxes = self.get_sample_region(
                    bboxes, self.strides, num_loc_list, xs, ys,
                    bitmasks=bitmasks, radius=self.radius
                )
            else:
                is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

            max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])

            locations_to_gt_area = area[None].repeat(len(locations), 1)
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            # if there are still more than one objects for a location,
            # we choose the one with minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

            reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
            target_inds_per_im = locations_to_gt_inds + num_targets
            num_targets += len(targets_per_im)

            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes

            labels.append(labels_per_im)
            reg_targets.append(reg_targets_per_im)
            target_inds.append(target_inds_per_im)

        return {
            "labels": labels,
            "reg_targets": reg_targets,
            "target_inds": target_inds
        }

    def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """

        training_targets = self._get_ground_truth(locations, gt_instances)

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.

        instances = Instances((0, 0))
        instances.labels = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["labels"]
        ], dim=0)
        instances.gt_inds = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["target_inds"]
        ], dim=0)
        instances.im_inds = cat([
            x.reshape(-1) for x in training_targets["im_inds"]
        ], dim=0)
        instances.reg_targets = cat([
            # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
            x.reshape(-1, 4) for x in training_targets["reg_targets"]
        ], dim=0,)
        instances.locations = cat([
            x.reshape(-1, 2) for x in training_targets["locations"]
        ], dim=0)
        instances.fpn_levels = cat([
            x.reshape(-1) for x in training_targets["fpn_levels"]
        ], dim=0)

        instances.logits_pred = cat([
            # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
            x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred
        ], dim=0,)
        instances.reg_pred = cat([
            # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
            x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
        ], dim=0,)
        instances.ctrness_pred = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
        ], dim=0,)

        if len(top_feats) > 0:
            instances.top_feats = cat([
                # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)
                x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
            ], dim=0,)

        return self.fcos_losses(instances)

    def fcos_losses(self, instances):
        num_classes = instances.logits_pred.size(1)
        assert num_classes == self.num_classes

        labels = instances.labels.flatten()

        pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
        num_pos_local = pos_inds.numel()
        num_gpus = get_world_size()
        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        # prepare one_hot
        class_target = torch.zeros_like(instances.logits_pred)
        class_target[pos_inds, labels[pos_inds]] = 1

        class_loss = sigmoid_focal_loss_jit(
            instances.logits_pred,
            class_target,
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / num_pos_avg

        instances = instances[pos_inds]
        instances.pos_inds = pos_inds

        ctrness_targets = compute_ctrness_targets(instances.reg_targets)
        ctrness_targets_sum = ctrness_targets.sum()
        loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
        instances.gt_ctrs = ctrness_targets

        if pos_inds.numel() > 0:
            reg_loss = self.loc_loss_func(
                instances.reg_pred,
                instances.reg_targets,
                ctrness_targets
            ) / loss_denorm

            ctrness_loss = F.binary_cross_entropy_with_logits(
                instances.ctrness_pred,
                ctrness_targets,
                reduction="sum"
            ) / num_pos_avg
        else:
            reg_loss = instances.reg_pred.sum() * 0
            ctrness_loss = instances.ctrness_pred.sum() * 0

        losses = {
            "loss_fcos_cls": class_loss,
            "loss_fcos_loc": reg_loss,
            "loss_fcos_ctr": ctrness_loss
        }
        extras = {
            "instances": instances,
            "loss_denorm": loss_denorm
        }
        return extras, losses

    def predict_proposals(
            self, logits_pred, reg_pred, ctrness_pred,
            locations, image_sizes, top_feats=None
    ):
        if self.training:
            self.pre_nms_thresh = self.pre_nms_thresh_train
            self.pre_nms_topk = self.pre_nms_topk_train
            self.post_nms_topk = self.post_nms_topk_train
        else:
            self.pre_nms_thresh = self.pre_nms_thresh_test
            self.pre_nms_topk = self.pre_nms_topk_test
            self.post_nms_topk = self.post_nms_topk_test

        sampled_boxes = []

        bundle = {
            "l": locations, "o": logits_pred,
            "r": reg_pred, "c": ctrness_pred,
            "s": self.strides,
        }

        if len(top_feats) > 0:
            bundle["t"] = top_feats

        for i, per_bundle in enumerate(zip(*bundle.values())):
            # get per-level bundle
            per_bundle = dict(zip(bundle.keys(), per_bundle))
            # recall that during training, we normalize regression targets with FPN's stride.
            # we denormalize them here.
            l = per_bundle["l"]
            o = per_bundle["o"]
            r = per_bundle["r"] * per_bundle["s"]
            c = per_bundle["c"]
            t = per_bundle["t"] if "t" in bundle else None

            sampled_boxes.append(
                self.forward_for_single_feature_map(
                    l, o, r, c, image_sizes, t
                )
            )

            for per_im_sampled_boxes in sampled_boxes[-1]:
                per_im_sampled_boxes.fpn_levels = l.new_ones(
                    len(per_im_sampled_boxes), dtype=torch.long
                ) * i

        boxlists = list(zip(*sampled_boxes))
        boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
        boxlists = self.select_over_all_levels(boxlists)

        return boxlists

    def forward_for_single_feature_map(
            self, locations, logits_pred, reg_pred,
            ctrness_pred, image_sizes, top_feat=None
    ):
        N, C, H, W = logits_pred.shape

        # put in the same format as locations
        logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
        logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
        if top_feat is not None:
            top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
            top_feat = top_feat.reshape(N, H * W, -1)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]
        candidate_inds = logits_pred > self.pre_nms_thresh
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

        if not self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = logits_pred[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]
            if top_feat is not None:
                per_top_feat = top_feat[i]
                per_top_feat = per_top_feat[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                if top_feat is not None:
                    per_top_feat = per_top_feat[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            if top_feat is not None:
                boxlist.top_feat = per_top_feat
            results.append(boxlist)

        return results

    def select_over_all_levels(self, boxlists):
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            # multiclass nms
            result = ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
            if number_of_detections > self.post_nms_topk > 0:
                cls_scores = result.scores
                image_thresh, _ = torch.kthvalue(
                    cls_scores.cpu(),
                    number_of_detections - self.post_nms_topk + 1
                )
                keep = cls_scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                result = result[keep]
            results.append(result)
        return results
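compute_ctrness_targets at the top of this file implements FCOS centerness, sqrt(min(l,r)/max(l,r) * min(t,b)/max(t,b)), which is 1.0 when the location sits at the box center and decays toward the edges. A quick numeric check with that function in scope:

import torch

reg_targets = torch.tensor([
    [4.0, 4.0, 4.0, 4.0],  # location at the box center
    [1.0, 4.0, 7.0, 4.0],  # off-center horizontally
])
print(compute_ctrness_targets(reg_targets))  # tensor([1.0000, 0.3780]); sqrt(1/7) ~= 0.378
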
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py
ADDED
@@ -0,0 +1,100 @@
import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.layers import ShapeSpec

from .layers import DFConv2d, IOULoss

class Scale(nn.Module):
    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input * self.scale

class FCOSHead(nn.Module):
    def __init__(self, cfg, input_shape: List[ShapeSpec]):
        """
        Arguments:
            in_channels (int): number of channels of the input feature
        """
        super().__init__()
        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
        head_configs = {"cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS, False),
                        "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE),
                        "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE)}
        norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM

        in_channels = [s.channels for s in input_shape]
        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
        in_channels = in_channels[0]

        self.in_channels_to_top_module = in_channels

        for head in head_configs:
            tower = []
            num_convs, use_deformable = head_configs[head]
            if use_deformable:
                conv_func = DFConv2d
            else:
                conv_func = nn.Conv2d
            for i in range(num_convs):
                tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
                if norm == "GN":
                    tower.append(nn.GroupNorm(32, in_channels))
                tower.append(nn.ReLU())
            self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

        self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
        self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1, bias=False)
        self.ctrness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=False)

        if cfg.MODEL.FCOS.USE_SCALE:
            self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in self.fpn_strides])
        else:
            self.scales = None

        for modules in [self.cls_tower, self.bbox_tower, self.share_tower, self.cls_logits]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)

        for modules in [self.bbox_pred, self.ctrness]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)

        # initialize the bias for focal loss
        prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        torch.nn.init.constant_(self.cls_logits.bias, bias_value)

    def forward(self, x, top_module=None):
        logits = []
        bbox_reg = []
        ctrness = []
        top_feats = []
        bbox_towers = []
        for l, feature in enumerate(x):
            feature = self.share_tower(feature)
            cls_tower = self.cls_tower(feature)
            bbox_tower = self.bbox_tower(feature)

            logits.append(self.cls_logits(cls_tower))
            ctrness.append(self.ctrness(bbox_tower))
            reg = self.bbox_pred(bbox_tower)
            if self.scales is not None:
                reg = self.scales[l](reg)
            # Note that we use relu, as in the improved FCOS, instead of exp.
            bbox_reg.append(F.relu(reg))

            if top_module is not None:
                top_feats.append(top_module(bbox_tower))

        return logits, bbox_reg, ctrness, bbox_towers, top_feats
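The focal-loss bias initialization at the end of __init__ sets the classification logits so that sigmoid(bias) equals PRIOR_PROB at the start of training, keeping the early loss from being dominated by the overwhelming number of negative locations. With the default PRIOR_PROB = 0.01:

import math

prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(bias_value)                       # -4.595...
print(1 / (1 + math.exp(-bias_value)))  # 0.01, i.e. sigmoid(bias) == prior_prob
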
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .comm import reduce_sum
from .measures import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py
ADDED
@@ -0,0 +1,52 @@
import torch
import torch.nn.functional as F
import torch.distributed as dist
from detectron2.utils.comm import get_world_size


def reduce_sum(tensor):
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    return tensor

def aligned_bilinear(tensor, factor):
    assert tensor.dim() == 4
    assert factor >= 1
    assert int(factor) == factor

    if factor == 1:
        return tensor

    h, w = tensor.size()[2:]
    tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
    oh = factor * h + 1
    ow = factor * w + 1
    tensor = F.interpolate(
        tensor, size=(oh, ow),
        mode='bilinear',
        align_corners=True
    )
    tensor = F.pad(
        tensor, pad=(factor // 2, 0, factor // 2, 0),
        mode="replicate"
    )
    return tensor[:, :, :oh - 1, :ow - 1]


def compute_locations(h, w, stride, device):
    shifts_x = torch.arange(
        0, w * stride, step=stride,
        dtype=torch.float32, device=device
    )
    shifts_y = torch.arange(
        0, h * stride, step=stride,
        dtype=torch.float32, device=device
    )
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shift_x = shift_x.reshape(-1)
    shift_y = shift_y.reshape(-1)
    locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
    return locations
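aligned_bilinear upsamples by an integer factor while keeping samples aligned to the stride grid (the pad / interpolate / pad / crop sequence), so the output is exactly factor times the input size. A shape check with the function above in scope:

import torch

x = torch.randn(1, 8, 16, 16)
y = aligned_bilinear(x, factor=2)
print(y.shape)  # torch.Size([1, 8, 32, 32])
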
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding: utf-8
|
| 2 |
+
# Adapted from https://github.com/ShichenLiu/CondenseNet/blob/master/utils.py
|
| 3 |
+
from __future__ import absolute_import
|
| 4 |
+
from __future__ import unicode_literals
|
| 5 |
+
from __future__ import print_function
|
| 6 |
+
from __future__ import division
|
| 7 |
+
|
| 8 |
+
import operator
|
| 9 |
+
|
| 10 |
+
from functools import reduce
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_num_gen(gen):
|
| 14 |
+
return sum(1 for x in gen)
|
| 15 |
+
|
| 16 |
+
def is_pruned(layer):
|
| 17 |
+
try:
|
| 18 |
+
layer.mask
|
| 19 |
+
return True
|
| 20 |
+
except AttributeError:
|
| 21 |
+
return False
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def is_leaf(model):
|
| 25 |
+
return get_num_gen(model.children()) == 0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_layer_info(layer):
|
| 29 |
+
layer_str = str(layer)
|
| 30 |
+
type_name = layer_str[:layer_str.find('(')].strip()
|
| 31 |
+
return type_name
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_layer_param(model):
|
| 35 |
+
return sum([reduce(operator.mul, i.size(), 1) for i in model.parameters()])
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
### The input batch size should be 1 to call this function
|
| 39 |
+
def measure_layer(layer, *args):
|
| 40 |
+
global count_ops, count_params
|
| 41 |
+
|
| 42 |
+
for x in args:
|
| 43 |
+
delta_ops = 0
|
| 44 |
+
delta_params = 0
|
| 45 |
+
multi_add = 1
|
| 46 |
+
type_name = get_layer_info(layer)
|
| 47 |
+
|
| 48 |
+
### ops_conv
|
| 49 |
+
if type_name in ['Conv2d']:
|
| 50 |
+
out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) /
|
| 51 |
+
layer.stride[0] + 1)
|
| 52 |
+
out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) /
|
| 53 |
+
layer.stride[1] + 1)
|
| 54 |
+
delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
|
| 55 |
+
delta_params = get_layer_param(layer)
|
| 56 |
+
|
| 57 |
+
elif type_name in ['ConvTranspose2d']:
|
| 58 |
+
_, _, in_h, in_w = x.size()
|
| 59 |
+
out_h = int((in_h-1)*layer.stride[0] - 2 * layer.padding[0] + layer.kernel_size[0] + layer.output_padding[0])
|
| 60 |
+
out_w = int((in_w-1)*layer.stride[1] - 2 * layer.padding[1] + layer.kernel_size[1] + layer.output_padding[1])
|
| 61 |
+
delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
|
| 62 |
+
layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
|
| 63 |
+
delta_params = get_layer_param(layer)
|
| 64 |
+
|
| 65 |
+
### ops_learned_conv
|
| 66 |
+
elif type_name in ['LearnedGroupConv']:
|
| 67 |
+
measure_layer(layer.relu, x)
|
| 68 |
+
measure_layer(layer.norm, x)
|
| 69 |
+
conv = layer.conv
|
| 70 |
+
out_h = int((x.size()[2] + 2 * conv.padding[0] - conv.kernel_size[0]) /
|
| 71 |
+
conv.stride[0] + 1)
|
| 72 |
+
out_w = int((x.size()[3] + 2 * conv.padding[1] - conv.kernel_size[1]) /
|
| 73 |
+
conv.stride[1] + 1)
|
| 74 |
+
delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add
|
| 75 |
+
delta_params = get_layer_param(conv) / layer.condense_factor
|
| 76 |
+
|
| 77 |
+
### ops_nonlinearity
|
| 78 |
+
elif type_name in ['ReLU', 'ReLU6']:
|
| 79 |
+
delta_ops = x.numel()
|
| 80 |
+
delta_params = get_layer_param(layer)
|
| 81 |
+
|
| 82 |
+
### ops_pooling
|
| 83 |
+
elif type_name in ['AvgPool2d', 'MaxPool2d']:
|
| 84 |
+
in_w = x.size()[2]
|
| 85 |
+
kernel_ops = layer.kernel_size * layer.kernel_size
|
| 86 |
+
out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
|
| 87 |
+
out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
|
| 88 |
+
delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops
|
| 89 |
+
delta_params = get_layer_param(layer)
|
| 90 |
+
|
| 91 |
+
elif type_name in ['LastLevelMaxPool']:
|
| 92 |
+
pass
|
| 93 |
+
|
| 94 |
+
elif type_name in ['AdaptiveAvgPool2d']:
|
| 95 |
+
        delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
        delta_params = get_layer_param(layer)

    elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']:
        pass
        #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
        #delta_params = get_layer_param(layer)

    ### ops_linear
    elif type_name in ['Linear']:
        weight_ops = layer.weight.numel() * multi_add
        bias_ops = layer.bias.numel()
        delta_ops = x.size()[0] * (weight_ops + bias_ops)
        delta_params = get_layer_param(layer)

    ### ops_nothing
    elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']:
        delta_params = get_layer_param(layer)

    elif type_name in ['SumTwo']:
        delta_ops = x.numel()

    elif type_name in ['AggregateCell']:
        if not layer.pre_transform:
            delta_ops = 2 * x.numel()  # twice for each input
        else:
            measure_layer(layer.branch_1, x)
            measure_layer(layer.branch_2, x)
        delta_params = get_layer_param(layer)

    elif type_name in ['Identity', 'Zero']:
        pass

    elif type_name in ['Scale']:
        delta_params = get_layer_param(layer)
        delta_ops = x.numel()

    elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor',
                       'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor',
                       'NaiveSyncBatchNorm', 'Upsample', 'Sequential']:
        pass

    elif type_name in ['DeformConv']:
        # don't count bilinear
        offset_conv = list(layer.parameters())[0]
        delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3])
        out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0]
                     - layer.kernel_size[0]) / layer.stride[0] + 1)
        out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1]
                     - layer.kernel_size[1]) / layer.stride[1] + 1)
        delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
        delta_params = get_layer_param(layer)

    ### unknown layer type
    else:
        raise TypeError('unknown layer type: %s' % type_name)

    count_ops += delta_ops
    count_params += delta_params
    return


def measure_model(model, x):
    global count_ops, count_params
    count_ops = 0
    count_params = 0

    def should_measure(x):
        return is_leaf(x) or is_pruned(x)

    def modify_forward(model):
        for child in model.children():
            if should_measure(child):
                def new_forward(m):
                    def lambda_forward(*args):
                        measure_layer(m, *args)
                        return m.old_forward(*args)
                    return lambda_forward
                child.old_forward = child.forward
                child.forward = new_forward(child)
            else:
                modify_forward(child)

    def restore_forward(model):
        for child in model.children():
            # leaf node
            if is_leaf(child) and hasattr(child, 'old_forward'):
                child.forward = child.old_forward
                child.old_forward = None
            else:
                restore_forward(child)

    modify_forward(model)
    out = model.forward(x)
    restore_forward(model)

    return out, count_ops, count_params
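For orientation, a minimal usage sketch of measure_model (my own illustration, not part of this diff). It assumes the earlier branches of measure_layer (not shown in this hunk) cover plain Conv2d/ReLU modules, as counters of this style usually do, and that is_leaf/is_pruned are defined earlier in the same file:

import torch
import torch.nn as nn

# Hypothetical example model built only from layer types the counter knows about.
net = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1),
    nn.BatchNorm2d(8),
    nn.ReLU(),
    nn.Conv2d(8, 1, 1),
)
out, ops, params = measure_model(net, torch.randn(1, 3, 64, 64))
print('%.2f MFLOPs, %d params' % (ops / 1e6, params))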
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py
ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py
ADDED
@@ -0,0 +1,523 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from fvcore.common.file_io import PathManager
from pycocotools.coco import COCO
from tabulate import tabulate

import detectron2.utils.comm as comm
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.evaluation.evaluator import DatasetEvaluator
from detectron2.evaluation.fast_eval_api import COCOeval_opt as COCOeval
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.logger import create_small_table
import pdb

class COCOEvaluator_ClassAgnostic(DatasetEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.

    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.
    """

    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated.
                It must have either the following corresponding metadata:

                    "json_file": the path to the COCO format annotation

                Or it must be in detectron2's standard dataset format
                so it can be converted to COCO format automatically.
            cfg (CfgNode): config instance
            distributed (True): if True, will collect results from all ranks and run evaluation
                in the main process.
                Otherwise, will evaluate the results in the current process.
            output_dir (str): optional, an output directory to dump all
                results predicted on the dataset. The dump contains two files:

                1. "instances_predictions.pth" a file in torch serialization
                   format that contains all the raw original predictions.
                2. "coco_instances_results.json" a json file in COCO's result
                   format.
        """
        self._tasks = self._tasks_from_config(cfg)
        self._distributed = distributed
        self._output_dir = output_dir

        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)

        self._metadata = MetadataCatalog.get(dataset_name)
        if not hasattr(self._metadata, "json_file"):
            self._logger.info(
                f"'{dataset_name}' is not registered by `register_coco_instances`."
                " Therefore trying to convert it to COCO format ..."
            )

            cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
            self._metadata.json_file = cache_path
            convert_to_coco_json(dataset_name, cache_path)

        # pdb.set_trace()
        # if self._metadata.select:
        #     self._metadata.json_file = os.path.join("individual", self._metadata.json_file.split(".")[0]+"_{}.json".format(self._metadata.select))
        json_file = PathManager.get_local_path(self._metadata.json_file)
        with contextlib.redirect_stdout(io.StringIO()):
            self._coco_api = COCO(json_file, cfg.TEST.CLASS_AGNOSTIC)

        self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS
        # Test set json files do not contain annotations (evaluation must be
        # performed using the COCO evaluation server).
        self._do_evaluation = "annotations" in self._coco_api.dataset

    def reset(self):
        self._predictions = []

    def _tasks_from_config(self, cfg):
        """
        Returns:
            tuple[str]: tasks that can be evaluated under the given configuration.
        """
        tasks = ("bbox",)
        if cfg.MODEL.MASK_ON:
            tasks = tasks + ("segm",)
        if cfg.MODEL.KEYPOINT_ON:
            tasks = tasks + ("keypoints",)
        return tasks

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
                It is a list of dict. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a COCO model. It is a list of dicts with key
                "instances" that contains :class:`Instances`.
        """
        for input, output in zip(inputs, outputs):
            prediction = {"image_id": input["image_id"]}

            # TODO this is ugly
            if "instances" in output:
                instances = output["instances"].to(self._cpu_device)
                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
            if "proposals" in output:
                prediction["proposals"] = output["proposals"].to(self._cpu_device)
            self._predictions.append(prediction)

    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        if "proposals" in predictions[0]:
            self._eval_box_proposals(predictions)
        if "instances" in predictions[0]:
            self._eval_predictions(set(self._tasks), predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)

    def _eval_predictions(self, tasks, predictions):
        """
        Evaluate predictions on the given tasks.
        Fill self._results with the metrics of the tasks.
        """
        self._logger.info("Preparing results for COCO format ...")
        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))

        # unmap the category ids for COCO
        # if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
        #     reverse_id_mapping = {
        #         v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
        #     }
        for result in coco_results:
            result["category_id"] = 1

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(coco_results))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info("Evaluating predictions ...")
        if "segmentation" in coco_results[0]:
            tasks = ["bbox", "segm"]
        else:
            tasks = ["bbox"]
        for task in sorted(tasks):
            coco_eval = (
                _evaluate_predictions_on_coco(
                    self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas
                )
                if len(coco_results) > 0
                else None  # cocoapi does not handle empty results very well
            )

            res = self._derive_coco_results(
                coco_eval, task
            )
            self._results[task] = res

    def _eval_box_proposals(self, predictions):
        """
        Evaluate the box proposals in predictions.
        Fill self._results with the metrics for "box_proposals" task.
        """
        if self._output_dir:
            # Saving generated box proposals to file.
            # Predicted box_proposals are in XYXY_ABS mode.
            bbox_mode = BoxMode.XYXY_ABS.value
            ids, boxes, objectness_logits = [], [], []
            for prediction in predictions:
                ids.append(prediction["image_id"])
                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())

            proposal_data = {
                "boxes": boxes,
                "objectness_logits": objectness_logits,
                "ids": ids,
                "bbox_mode": bbox_mode,
            }
            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
                pickle.dump(proposal_data, f)

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info("Evaluating bbox proposals ...")
        res = {}
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
                key = "AR{}@{:d}".format(suffix, limit)
                res[key] = float(stats["ar"].item() * 100)
        self._logger.info("Proposal metrics: \n" + create_small_table(res))
        self._results["box_proposals"] = res

    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
        """
        Derive the desired score numbers from summarized COCOeval.

        Args:
            coco_eval (None or COCOEval): None represents no predictions from model.
            iou_type (str):
            class_names (None or list[str]): if provided, will use it to predict
                per-category AP.

        Returns:
            a dict of {metric name: score}
        """

        metrics = {
            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
        }[iou_type]

        if coco_eval is None:
            self._logger.warn("No predictions from the model!")
            return {metric: float("nan") for metric in metrics}

        # the standard metrics
        results = {
            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
            for idx, metric in enumerate(metrics)
        }
        self._logger.info(
            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
        )
        if not np.isfinite(sum(results.values())):
            self._logger.info("Some metrics cannot be computed and is shown as NaN.")

        if class_names is None or len(class_names) <= 1:
            return results
        # Compute per-category AP
        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
        precisions = coco_eval.eval["precision"]
        # precision has dims (iou, recall, cls, area range, max dets)
        assert len(class_names) == precisions.shape[2]

        results_per_category = []
        for idx, name in enumerate(class_names):
            # area range index 0: all area ranges
            # max dets index -1: typically 100 per image
            precision = precisions[:, :, idx, 0, -1]
            precision = precision[precision > -1]
            ap = np.mean(precision) if precision.size else float("nan")
            results_per_category.append(("{}".format(name), float(ap * 100)))

        # tabulate it
        N_COLS = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
        table = tabulate(
            results_2d,
            tablefmt="pipe",
            floatfmt=".3f",
            headers=["category", "AP"] * (N_COLS // 2),
            numalign="left",
        )
        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)

        results.update({"AP-" + name: ap for name, ap in results_per_category})
        return results


def instances_to_coco_json(instances, img_id):
    """
    Dump an "Instances" object to a COCO-format json that's used for evaluation.

    Args:
        instances (Instances):
        img_id (int): the image id

    Returns:
        list[dict]: list of json annotations in COCO format.
    """
    num_instance = len(instances)
    if num_instance == 0:
        return []

    boxes = instances.pred_boxes.tensor.numpy()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    boxes = boxes.tolist()
    scores = instances.scores.tolist()
    classes = instances.pred_classes.tolist()

    has_mask = instances.has("pred_masks")
    if has_mask:
        # use RLE to encode the masks, because they are too large and takes memory
        # since this evaluator stores outputs of the entire dataset
        rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_masks
        ]
        for rle in rles:
            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
            # json writer which always produces strings cannot serialize a bytestream
            # unless you decode it. Thankfully, utf-8 works out (which is also what
            # the pycocotools/_mask.pyx does).
            rle["counts"] = rle["counts"].decode("utf-8")

    has_keypoints = instances.has("pred_keypoints")
    if has_keypoints:
        keypoints = instances.pred_keypoints

    results = []
    for k in range(num_instance):
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
        }
        if has_mask:
            result["segmentation"] = rles[k]
        if has_keypoints:
            # In COCO annotations,
            # keypoints coordinates are pixel indices.
            # However our predictions are floating point coordinates.
            # Therefore we subtract 0.5 to be consistent with the annotation format.
            # This is the inverse of data loading logic in `datasets/coco.py`.
            keypoints[k][:, :2] -= 0.5
            result["keypoints"] = keypoints[k].flatten().tolist()
        results.append(result)
    return results


# inspired from Detectron:
# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
    """
    Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code. However,
    it produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],
    ]  # 512-inf
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for prediction_dict in dataset_predictions:
        predictions = prediction_dict["proposals"]

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = predictions.objectness_logits.sort(descending=True)[1]
        predictions = predictions[inds]

        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
        anno = coco_api.loadAnns(ann_ids)
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for obj in anno
            if obj["iscrowd"] == 0
        ]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = Boxes(gt_boxes)
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if limit is not None and len(predictions) > limit:
            predictions = predictions[:limit]

        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(predictions), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = (
        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
    )
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }


def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None):
    """
    Evaluate the coco results using COCOEval API.
    """
    assert len(coco_results) > 0

    if iou_type == "segm":
        coco_results = copy.deepcopy(coco_results)
        # When evaluating mask AP, if the results contain bbox, cocoapi will
        # use the box area as the area of the instance, instead of the mask area.
        # This leads to a different definition of small/medium/large.
        # We remove the bbox field to let mask AP use mask area.
        for c in coco_results:
            c.pop("bbox", None)

    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)

    if iou_type == "keypoints":
        # Use the COCO default keypoint OKS sigmas unless overrides are specified
        if kpt_oks_sigmas:
            assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
            coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
        # COCOAPI requires every detection and every gt to have keypoints, so
        # we just take the first entry from both
        num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
        num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
        num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
        assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
            f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
            f"Ground truth contains {num_keypoints_gt} keypoints. "
            f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
            "They have to agree with each other. For meaning of OKS, please refer to "
            "http://cocodataset.org/#keypoints-eval."
        )

    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    return coco_eval
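As a quick numeric check of the AR computation that closes _evaluate_box_proposals (my own sketch, not part of the file): recall is evaluated at IoU thresholds 0.50:0.05:0.95 and averaged.

import torch

gt_overlaps = torch.tensor([0.42, 0.58, 0.73, 0.91])  # best IoU per matched gt box
num_pos = 4
thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05)
recalls = torch.stack([(gt_overlaps >= t).float().sum() / num_pos for t in thresholds])
ar = recalls.mean()  # tensor(0.4000): e.g. 3/4 boxes clear IoU 0.5, none clear 0.95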
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .mask_branch import build_mask_branch
from .dynamic_mask_head import build_dynamic_mask_head
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py
ADDED
@@ -0,0 +1,303 @@
import torch
from torch.nn import functional as F
from torch import nn

from ..det_head.utils.comm import compute_locations, aligned_bilinear
from fvcore.nn import sigmoid_focal_loss_jit
from .utils import sigmoid_focal_loss_boundary, sigmoid_focal_loss_boundary_jit
import pdb

def dice_coefficient(x, target):
    eps = 1e-5
    n_inst = x.size(0)
    x = x.reshape(n_inst, -1)
    target = target.reshape(n_inst, -1)
    intersection = (x * target).sum(dim=1)
    union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps
    loss = 1. - (2 * intersection / union)
    return loss

def parse_dynamic_params(params, channels, weight_nums, bias_nums):
    assert params.dim() == 2
    assert len(weight_nums) == len(bias_nums)
    assert params.size(1) == sum(weight_nums) + sum(bias_nums)

    num_insts = params.size(0)
    num_layers = len(weight_nums)

    params_splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1))

    weight_splits = params_splits[:num_layers]
    bias_splits = params_splits[num_layers:]

    for l in range(num_layers):
        if l < num_layers - 1:
            # out_channels x in_channels x 1 x 1
            weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1)
            bias_splits[l] = bias_splits[l].reshape(num_insts * channels)
        else:
            # out_channels x in_channels x 1 x 1
            weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1)
            bias_splits[l] = bias_splits[l].reshape(num_insts)

    return weight_splits, bias_splits

def build_dynamic_mask_head(cfg):
    return DynamicMaskHead(cfg)

class DynamicMaskHead(nn.Module):
    def __init__(self, cfg):
        super(DynamicMaskHead, self).__init__()
        self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS
        self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS
        self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS
        self.cluster_weight = cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT

        soi = [64, 128, 256, 512, 1024]
        # self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
        self.register_buffer("sizes_of_interest", torch.tensor(soi))

        weight_nums, bias_nums = [], []
        for l in range(self.num_layers):
            if l == 0:
                if not self.disable_rel_coords:
                    weight_nums.append((self.in_channels + 2) * self.channels)
                else:
                    weight_nums.append(self.in_channels * self.channels)
                bias_nums.append(self.channels)
            elif l == self.num_layers - 1:
                weight_nums.append(self.channels * 1)
                bias_nums.append(1)
            else:
                weight_nums.append(self.channels * self.channels)
                bias_nums.append(self.channels)

        self.weight_nums = weight_nums
        self.bias_nums = bias_nums
        self.num_gen_params = sum(weight_nums) + sum(bias_nums)

        stable_conv_1 = nn.Sequential(nn.Conv2d(10, 8, kernel_size=3, stride=1, padding=1), nn.ReLU())
        torch.nn.init.normal_(stable_conv_1[0].weight, std=0.01)
        torch.nn.init.constant_(stable_conv_1[0].bias, 0)

        stable_conv_2 = nn.Sequential(nn.Conv2d(8, 8, kernel_size=3, stride=1, padding=1), nn.ReLU())
        torch.nn.init.normal_(stable_conv_2[0].weight, std=0.01)
        torch.nn.init.constant_(stable_conv_2[0].bias, 0)

        stable_conv_3 = nn.Conv2d(8, 1, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(stable_conv_3.weight, std=0.01)
        torch.nn.init.constant_(stable_conv_3.bias, 0)
        self.stable = nn.ModuleList([stable_conv_1, stable_conv_2, stable_conv_3])

        self.general_choose = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC
        self.general_choose_weight = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT
        self.key_weight = dict()
        for key, value in zip(self.general_choose, self.general_choose_weight):
            self.key_weight[key] = value


    def mask_heads_forward(self, features, weights, biases, num_insts):
        '''
        :param features
        :param weights: [w0, w1, ...]
        :param bias: [b0, b1, ...]
        :return:
        '''
        assert features.dim() == 4
        n_layers = len(weights)
        x = features
        mid_features = []
        for i, (w, b) in enumerate(zip(weights, biases)):
            x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
            if i < n_layers - 1:
                x = F.relu(x)
                mid_features.append(x)
        return x, mid_features

    def mask_heads_forward_split(self, features, weight, bias, num_insts, has_relu=True):
        '''
        :param features
        :param weights: [w0, w1, ...]
        :param bias: [b0, b1, ...]
        :return:
        '''
        assert features.dim() == 4
        # n_layers = len(weights)
        x = features
        x = F.conv2d(x, weight, bias=bias, stride=1, padding=0, groups=num_insts)
        if has_relu:
            x = F.relu(x)
        return x

    def mask_heads_forward_with_coords_test(self, mask_feats, mask_feat_stride, instances):
        locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
        n_inst = len(instances)

        im_inds = instances.im_inds
        mask_head_params = instances.mask_head_params

        N, _, H, W = mask_feats.size()

        if not self.disable_rel_coords:
            instance_locations = instances.locations
            relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
            relative_coords = relative_coords.permute(0, 2, 1).float()
            soi = self.sizes_of_interest.float()[instances.fpn_levels]
            relative_coords = relative_coords / soi.reshape(-1, 1, 1)
            relative_coords = relative_coords.to(dtype=mask_feats.dtype)

            mask_head_inputs = torch.cat([
                relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)
            ], dim=1)
        else:
            mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)

        mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)

        weights, biases = parse_dynamic_params(
            mask_head_params, self.channels,
            self.weight_nums, self.bias_nums
        )

        mask_logits, mid_features = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst)

        mask_logits = mask_logits.reshape(-1, 1, H, W)

        assert mask_feat_stride >= self.mask_out_stride
        assert mask_feat_stride % self.mask_out_stride == 0
        mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride))

        return mask_logits.sigmoid()

    def mask_heads_forward_with_coords(self, mask_feats, mask_feat_stride, instances, gt_bitmasks, ignore_maps):
        locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
        n_inst = len(instances)

        im_inds = instances.im_inds
        mask_head_params = instances.mask_head_params

        # clusters
        gt_inds = instances.gt_inds
        instance_locations = instances.locations
        fpn_levels = instances.fpn_levels

        clusters_ids = []
        clusters_imgids = []
        clusters_gt_masks = []
        gt_unique_inds = torch.unique(gt_inds)
        for gt_ind in gt_unique_inds:
            gt_ind = int(gt_ind)
            clusters_gt_masks.append(gt_bitmasks[gt_ind])
            im_ind = int(torch.unique(im_inds[(gt_inds == gt_ind)]))
            clusters_ids.append(gt_ind)
            clusters_imgids.append(im_ind)

        clusters_ids = torch.tensor(clusters_ids).cuda()
        clusters_imgids = torch.tensor(clusters_imgids)
        clusters_gt_masks = torch.stack(clusters_gt_masks, dim=0)
        n_clusters = len(clusters_ids)

        N, _, H, W = mask_feats.size()

        if not self.disable_rel_coords:
            instance_locations = instances.locations
            relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
            relative_coords = relative_coords.permute(0, 2, 1).float()
            soi = self.sizes_of_interest.float()[instances.fpn_levels]
            relative_coords = relative_coords / soi.reshape(-1, 1, 1)
            relative_coords = relative_coords.to(dtype=mask_feats.dtype)
            mask_head_inputs = torch.cat([relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)], dim=1)
        else:
            mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)

        # mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)
        mask_head_inputs = mask_head_inputs.reshape(n_inst, self.in_channels + 2, H, W)
        weights, biases = parse_dynamic_params(mask_head_params, self.channels, self.weight_nums, self.bias_nums)

        feature0 = self.stable[0](mask_head_inputs)
        feature1 = self.mask_heads_forward_split(mask_head_inputs.reshape(1, -1, H, W), weights[0], biases[0], n_inst).reshape(n_inst, -1, H, W)

        feature00 = self.stable[1](feature0)
        feature01 = self.mask_heads_forward_split(feature0.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)
        feature10 = self.stable[1](feature1)
        feature11 = self.mask_heads_forward_split(feature1.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)

        feature001 = self.mask_heads_forward_split(feature00.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
        feature010 = self.stable[2](feature01)
        feature011 = self.mask_heads_forward_split(feature01.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)

        feature100 = self.stable[2](feature10)
        feature101 = self.mask_heads_forward_split(feature10.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
        feature110 = self.stable[2](feature11)
        feature111 = self.mask_heads_forward_split(feature11.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)

        mask_logits_clusters = []
        for gt_ind in clusters_ids:
            gt_ind = int(gt_ind)
            mask_logits_clusters.append(torch.mean(feature111[gt_inds == gt_ind], dim=0))
        mask_logits_clusters = torch.stack(mask_logits_clusters, dim=0)
        mask_logits_clusters = mask_logits_clusters.reshape(-1, 1, H, W)
        mask_logits_clusters = aligned_bilinear(mask_logits_clusters, int(mask_feat_stride / self.mask_out_stride))
        # clusters
        unique_img_inds = torch.unique(clusters_imgids)
        mask_logits_clusters_imgs = []
        mask_gt_clusters_imgs = []
        for img_ind in unique_img_inds:
            img_ind = int(img_ind)
            mask_logits_clusters_per_img = mask_logits_clusters[clusters_imgids == img_ind]
            mask_logits_clusters_per_img = F.softmax(mask_logits_clusters_per_img.squeeze(1), dim=0).unsqueeze(1)

            ignore_map = ignore_maps[img_ind].detach()
            finds_y, finds_x = torch.nonzero(ignore_map, as_tuple=True)

            mask_logits_clusters_per_img = mask_logits_clusters_per_img.clone()
            mask_logits_clusters_per_img[..., finds_y, finds_x] = 0

            mask_logits_clusters_imgs.append(mask_logits_clusters_per_img)
            mask_gt_clusters_imgs.append(clusters_gt_masks[clusters_imgids == img_ind])
        mask_logits_clusters_imgs = torch.cat(mask_logits_clusters_imgs, dim=0)
        mask_gt_clusters_imgs = torch.cat(mask_gt_clusters_imgs, dim=0)

        select_features = {}
        for cid in self.general_choose:
            select_feature = locals()["feature{}".format(cid)]
            select_feature = aligned_bilinear(select_feature, int(mask_feat_stride / self.mask_out_stride))
            select_features[cid] = select_feature.sigmoid()

        return select_features, mask_logits_clusters_imgs, mask_gt_clusters_imgs.unsqueeze(1)

    def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None):
        if self.training:
            gt_inds = pred_instances.gt_inds
            gt_bitmasks_s = torch.cat([per_im.gt_bitmasks for per_im in gt_instances])
            gt_bitmasks = gt_bitmasks_s[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype)

            bitmasks_full = []
            for gt_instance in gt_instances:
                bitmasks_full.append(gt_instance.gt_bitmasks.sum(dim=0))
            bitmasks_full = torch.stack(bitmasks_full)
            ignore_map = 1 - bitmasks_full

            losses = {}
            if len(pred_instances) == 0:
                loss_mask = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0
                for key, value in self.key_weight.items():
                    losses["loss_mask_bank_{}".format(key)] = loss_mask
                losses["loss_mask_cluster"] = loss_mask
            else:
                select_scores, mask_logits_clusters, mask_gts_clusters = self.mask_heads_forward_with_coords(mask_feats, mask_feat_stride, pred_instances, gt_bitmasks_s, ignore_map)
                for key, value in select_scores.items():
                    losses["loss_mask_bank_{}".format(key)] = dice_coefficient(value, gt_bitmasks).mean() * self.key_weight[key]

                mask_clusters_losses = dice_coefficient(mask_logits_clusters, mask_gts_clusters)
                mask_clusters_losses = mask_clusters_losses.mean()
                losses["loss_mask_cluster"] = mask_clusters_losses * self.cluster_weight
            return losses
        else:
            if len(pred_instances) > 0:
                mask_scores = self.mask_heads_forward_with_coords_test(mask_feats, mask_feat_stride, pred_instances)
                pred_instances.pred_global_masks = mask_scores.float()

            return pred_instances
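The dynamic head above runs one tiny per-instance conv net as a single grouped 1x1 convolution. A self-contained sketch of that mechanism (my own, meant to run alongside parse_dynamic_params from this file; the sizes mirror the num_layers == 3 case with relative coordinates enabled):

import torch
import torch.nn.functional as F

num_insts, channels, in_channels, H, W = 3, 8, 8, 16, 16
weight_nums = [(in_channels + 2) * channels, channels * channels, channels * 1]
bias_nums = [channels, channels, 1]
params = torch.randn(num_insts, sum(weight_nums) + sum(bias_nums))  # from the controller
weights, biases = parse_dynamic_params(params, channels, weight_nums, bias_nums)

# All instances share one tensor; groups=num_insts keeps their filters separate.
x = torch.randn(1, num_insts * (in_channels + 2), H, W)
for i, (w, b) in enumerate(zip(weights, biases)):
    x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
    if i < len(weights) - 1:
        x = F.relu(x)
print(x.shape)  # torch.Size([1, 3, 16, 16]): one mask logit map per instance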
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py
ADDED
@@ -0,0 +1,71 @@
from typing import Dict
import math

import torch
from torch import nn
import pdb
from fvcore.nn import sigmoid_focal_loss_jit
from detectron2.layers import ShapeSpec

from ..det_head.layers import conv_with_kaiming_uniform
from ..det_head.utils.comm import aligned_bilinear

INF = 100000000

def build_mask_branch(cfg, input_shape):
    return MaskBranch(cfg, input_shape)

class MaskBranch(nn.Module):
    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
        channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
        self.out_stride = input_shape[self.in_features[0]].stride

        feature_channels = {k: v.channels for k, v in input_shape.items()}

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        self.refine = nn.ModuleList()
        for in_feature in self.in_features:
            self.refine.append(conv_block(
                feature_channels[in_feature],
                channels, 3, 1
            ))

        tower = []
        for i in range(num_convs):
            tower.append(conv_block(
                channels, channels, 3, 1
            ))
        tower.append(nn.Conv2d(
            channels, max(self.num_outputs, 1), 1
        ))
        self.add_module('tower', nn.Sequential(*tower))

    def forward(self, features, gt_instances=None):
        for i, f in enumerate(self.in_features):
            if i == 0:
                x = self.refine[i](features[f])
            else:
                x_p = self.refine[i](features[f])

                target_h, target_w = x.size()[2:]
                h, w = x_p.size()[2:]
                assert target_h % h == 0
                assert target_w % w == 0
                factor_h, factor_w = target_h // h, target_w // w
                assert factor_h == factor_w
                x_p = aligned_bilinear(x_p, factor_h)
                x = x + x_p

        mask_feats = self.tower(x)

        if self.num_outputs == 0:
            mask_feats = mask_feats[:, :self.num_outputs]

        return mask_feats
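The forward pass fuses FPN levels coarse-to-fine: each deeper level is upsampled by an integer factor and summed onto the finest one. A sketch of that fusion (my own, with F.interpolate standing in for aligned_bilinear, which is defined in det_head/utils/comm.py):

import torch
import torch.nn.functional as F

p3 = torch.randn(1, 128, 100, 136)  # stride-8 feature, first entry of in_features
p4 = torch.randn(1, 128, 50, 68)    # stride-16 feature
x = p3 + F.interpolate(p4, scale_factor=2, mode="bilinear", align_corners=False)
print(x.shape)  # torch.Size([1, 128, 100, 136])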
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py
ADDED
@@ -0,0 +1,53 @@
import torch
from torch.nn import functional as F
import pdb

def sigmoid_focal_loss_boundary(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    boundary: torch.Tensor,
    alpha: float = -1,
    gamma: float = 2,
    reduction: str = "none",
) -> torch.Tensor:
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
               positive vs negative examples. Default = -1 (no weighting).
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        reduction: 'none' | 'mean' | 'sum'
                 'none': No reduction will be applied to the output.
                 'mean': The output will be averaged.
                 'sum': The output will be summed.
    Returns:
        Loss tensor with the reduction option applied.
    """
    p = torch.sigmoid(inputs)
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    loss = loss * boundary
    # pdb.set_trace()
    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()

    return loss


sigmoid_focal_loss_boundary_jit = torch.jit.script(
    sigmoid_focal_loss_boundary
)  # type: torch.jit.ScriptModule
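The only difference from the standard focal loss is the elementwise boundary factor: pixels where boundary is zero contribute nothing, so the penalty concentrates on boundary pixels. A small sanity check (my own sketch, not part of the file):

import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])
targets = torch.tensor([[1.0, 0.0, 1.0]])
boundary = torch.tensor([[1.0, 1.0, 0.0]])  # third pixel masked out
loss = sigmoid_focal_loss_boundary(logits, targets, boundary,
                                   alpha=0.25, gamma=2, reduction="sum")
# equals the plain focal loss summed over the first two pixels only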
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py
ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py
ADDED
@@ -0,0 +1,59 @@
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import torch
import torch.nn as nn

from detectron2.layers.deform_conv import DeformConv, ModulatedDeformConv


class DeformConvWithOff(nn.Module):

    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, padding=1,
                 dilation=1, deformable_groups=1):
        super(DeformConvWithOff, self).__init__()
        self.offset_conv = nn.Conv2d(
            in_channels,
            deformable_groups * 2 * kernel_size * kernel_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.dcn = DeformConv(
            in_channels, out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, dilation=dilation,
            deformable_groups=deformable_groups,
        )

    def forward(self, input):
        offset = self.offset_conv(input)
        output = self.dcn(input, offset)
        return output

class ModulatedDeformConvWithOff(nn.Module):
    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, padding=1,
                 dilation=1, deformable_groups=1,
                 bias=True, norm=None, activation=None,):
        super(ModulatedDeformConvWithOff, self).__init__()
        self.offset_mask_conv = nn.Conv2d(
            in_channels,
            deformable_groups * 3 * kernel_size * kernel_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.dcnv2 = ModulatedDeformConv(
            in_channels, out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, dilation=dilation,
            deformable_groups=deformable_groups,
            bias=bias, norm=norm, activation=activation,
        )

    def forward(self, input):
        x = self.offset_mask_conv(input)
        o1, o2, mask = torch.chunk(x, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        output = self.dcnv2(input, offset, mask)
        return output
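The offset conv's channel count follows from the deformable sampling grid: each of the kernel_size * kernel_size taps needs a (dy, dx) pair per deformable group, hence the factor 2; the modulated (DCNv2) variant adds one scalar mask per tap, hence the factor 3. A hypothetical instantiation (my own sketch; detectron2's deform-conv ops require a CUDA build):

import torch

m = ModulatedDeformConvWithOff(64, 64, kernel_size=3, deformable_groups=1).cuda()
x = torch.randn(2, 64, 32, 32, device="cuda")
y = m(x)  # 18 offset channels and 9 mask channels are predicted from x itself
print(y.shape)  # torch.Size([2, 64, 32, 32])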
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py
ADDED
@@ -0,0 +1,190 @@
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding:utf-8 -*-
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
from detectron2.layers import Conv2d, get_norm
|
| 8 |
+
from .deformable_conv_with_off import ModulatedDeformConvWithOff
|
| 9 |
+
from ..det_head.layers import conv_with_kaiming_uniform
|
| 10 |
+
import math
|
| 11 |
+
import pdb
|
| 12 |
+
from fvcore.nn import sigmoid_focal_loss_jit
|
| 13 |
+
|
| 14 |
+
class SingleHead(nn.Module):
|
| 15 |
+
"""
|
| 16 |
+
Build single head with convolutions and coord conv.
|
| 17 |
+
"""
|
| 18 |
+
def __init__(self, in_channel, conv_dims, num_convs, deform=False, coord=False, norm='', name=''):
|
| 19 |
+
super().__init__()
|
| 20 |
+
self.coord = coord
|
| 21 |
+
self.conv_norm_relus = []
|
| 22 |
+
if deform:
|
| 23 |
+
conv_module = ModulatedDeformConvWithOff
|
| 24 |
+
else:
|
| 25 |
+
conv_module = Conv2d
|
| 26 |
+
for k in range(num_convs):
|
| 27 |
+
conv = conv_module(
|
| 28 |
+
in_channel if k==0 else conv_dims,
|
| 29 |
+
conv_dims,
|
| 30 |
+
kernel_size=3,
|
| 31 |
+
stride=1,
|
| 32 |
+
padding=1,
|
| 33 |
+
bias=not norm,
|
| 34 |
+
norm=get_norm(norm, conv_dims),
|
| 35 |
+
activation=F.relu,
|
| 36 |
+
)
|
| 37 |
+
self.add_module("{}_head_{}".format(name, k + 1), conv)
|
| 38 |
+
self.conv_norm_relus.append(conv)
|
| 39 |
+
|
| 40 |
+
def forward(self, x):
|
| 41 |
+
if self.coord:
|
| 42 |
+
x = self.coord_conv(x)
|
| 43 |
+
for layer in self.conv_norm_relus:
|
| 44 |
+
x = layer(x)
|
| 45 |
+
return x
|
| 46 |
+
|
| 47 |
+
def coord_conv(self, feat):
|
| 48 |
+
with torch.no_grad():
|
| 49 |
+
x_pos = torch.linspace(-1, 1, feat.shape[-2], device=feat.device)
|
| 50 |
+
y_pos = torch.linspace(-1, 1, feat.shape[-1], device=feat.device)
|
| 51 |
+
grid_x, grid_y = torch.meshgrid(x_pos, y_pos)
|
| 52 |
+
grid_x = grid_x.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
|
| 53 |
+
grid_y = grid_y.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
|
| 54 |
+
feat = torch.cat([feat, grid_x, grid_y], dim=1)
|
| 55 |
+
return feat
|

class KernelHead(nn.Module):
    """
    The head used in PanopticFCN to generate kernel weights for both Things and Stuff.
    """
    def __init__(self, cfg, num_gen_params):
        super().__init__()
        in_channel = cfg.MODEL.FPN.OUT_CHANNELS
        conv_dims = cfg.MODEL.KERNEL_HEAD.CONVS_DIM
        num_convs = cfg.MODEL.KERNEL_HEAD.NUM_CONVS
        deform = cfg.MODEL.KERNEL_HEAD.DEFORM
        coord = cfg.MODEL.KERNEL_HEAD.COORD
        norm = cfg.MODEL.KERNEL_HEAD.NORM

        self.num_gen_params = num_gen_params

        self.kernel_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='kernel_head')
        self.out_conv = Conv2d(conv_dims, self.num_gen_params, kernel_size=3, padding=1)
        nn.init.normal_(self.out_conv.weight, mean=0, std=0.01)
        if self.out_conv.bias is not None:
            nn.init.constant_(self.out_conv.bias, 0)

    def forward(self, feat):
        x = self.kernel_head(feat)
        x = self.out_conv(x)
        return x
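A sketch of driving KernelHead standalone. The config values below are made up for the smoke test (the real ones come from the EntitySeg config files), and num_gen_params=153 is an arbitrary placeholder for the number of dynamic-kernel parameters.

```python
import torch
from detectron2.config import CfgNode as CN

# Hypothetical minimal config; every number here is an assumption.
cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.FPN = CN({"OUT_CHANNELS": 256})
cfg.MODEL.KERNEL_HEAD = CN({"CONVS_DIM": 256, "NUM_CONVS": 3,
                            "DEFORM": False, "COORD": True, "NORM": "GN"})

head = KernelHead(cfg, num_gen_params=153)
out = head(torch.randn(1, 256, 32, 32))
print(out.shape)  # torch.Size([1, 153, 32, 32]): one kernel vector per location
```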

class FeatureEncoder(nn.Module):
    """
    The head used in PanopticFCN for high-resolution feature generation.
    """
    def __init__(self, cfg):
        super().__init__()
        in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
        conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
        num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
        deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
        coord = cfg.MODEL.FEATURE_ENCODER.COORD
        norm = cfg.MODEL.FEATURE_ENCODER.NORM

        self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='encode_head')

    def forward(self, feat):
        feat = self.encode_head(feat)
        return feat

class FeatureEncoderEdge(nn.Module):
    """
    The head used in PanopticFCN for high-resolution feature generation.
    """
    def __init__(self, cfg):
        super().__init__()
        in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
        conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
        num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
        deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
        coord = cfg.MODEL.FEATURE_ENCODER.COORD
        norm = cfg.MODEL.FEATURE_ENCODER.NORM

        self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='encode_head')

        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.out_stride = 8

        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        if self.sem_loss_on:
            self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
            self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA

            # in_channels = feature_channels[self.in_features[0]]
            self.seg_head = nn.Sequential(
                conv_block(conv_dims, conv_dims, kernel_size=3, stride=1),
                conv_block(conv_dims, conv_dims, kernel_size=3, stride=1)
            )

            self.logits = nn.Conv2d(conv_dims, 1, kernel_size=1, stride=1)

            prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
            bias_value = -math.log((1 - prior_prob) / prior_prob)
            torch.nn.init.constant_(self.logits.bias, bias_value)

    def forward(self, feat, gt_instances=None):
        feat = self.encode_head(feat)

        losses = {}
        # auxiliary thing semantic loss
        if self.training and self.sem_loss_on:
            logits_pred = self.logits(self.seg_head(feat))

            boundary_targets = []
            for per_im_gt in gt_instances:
                boundary_targets.append(per_im_gt.gt_boundary_full.sum(dim=0))

            # # semantic_targets = torch.stack(semantic_targets, dim=0)
            boundary_targets = torch.stack(boundary_targets, dim=0)

            # resize target to reduce memory
            boundary_targets = boundary_targets[:, None, self.out_stride // 2::self.out_stride, self.out_stride // 2::self.out_stride]
            num_pos = (boundary_targets > 0).sum().float().clamp(min=1.0)

            loss_edge = sigmoid_focal_loss_jit(logits_pred, boundary_targets, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum") / num_pos
            losses['loss_edge_p3'] = loss_edge

        return feat, losses

def build_feature_encoder(cfg, input_shape=None):
    return FeatureEncoder(cfg)

def build_feature_encoder_edge(cfg, input_shape=None):
    return FeatureEncoderEdge(cfg)

def build_kernel_head(cfg, num_gen_params):
    return KernelHead(cfg, num_gen_params)
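The strided slice in FeatureEncoderEdge's forward is the whole downsampling trick: sampling every out_stride-th pixel, offset by out_stride // 2, aligns a full-resolution boundary target with the stride-8 prediction map without allocating an interpolated copy. A small illustration (pure torch, not from the repo):

```python
import torch

# Downsample a (N, H, W) target to (N, 1, H/s, W/s) by picking the center
# pixel of each s-by-s cell, exactly as FeatureEncoderEdge does with s = 8.
out_stride = 8
targets = torch.zeros(2, 64, 64)
small = targets[:, None, out_stride // 2::out_stride, out_stride // 2::out_stride]
print(small.shape)  # torch.Size([2, 1, 8, 8])
```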
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py
ADDED
@@ -0,0 +1,123 @@
import os
import copy
import mmcv
import numpy as np
import pdb
import pycocotools.mask as mask_utils
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

prefix = "train2017"
base_path = "/data/ceph/gavinqi/data/coco"

entity_base_path = os.path.join(base_path, "entity_{}".format(prefix))
annotation_path = os.path.join(base_path, "annotations/instances_{}.json".format(prefix))
save_thing_path = os.path.join(base_path, "annotations/entity_thing_{}.json".format(prefix))
save_stuff_path = os.path.join(base_path, "annotations/entity_stuff_{}.json".format(prefix))
save_entity_path = os.path.join(base_path, "annotations/entity_{}.json".format(prefix))

## build catid to continuous id
categories_list = COCO_CATEGORIES
catid_map = {category['id']: [cid, category["isthing"], category["name"], category["supercategory"]] for cid, category in enumerate(categories_list)}
idcat_map = {}
for key, value in catid_map.items():
    idcat_map[value[0]] = [key, value[1]]

instance_annotations = mmcv.load(annotation_path)
instance_annotations_thing = copy.deepcopy(instance_annotations)
instance_annotations_stuff = copy.deepcopy(instance_annotations)

# update category
print("Updating categories...")
instance_annotations_thing["categories"] = []
instance_annotations_stuff["categories"] = []
for origin_catid, new_catid_info in catid_map.items():
    new_catid = new_catid_info[0]
    is_thing = new_catid_info[1]
    name = new_catid_info[2]
    nsuper = new_catid_info[3]
    if is_thing:
        instance_annotations_thing["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
    else:
        instance_annotations_stuff["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
print("Update category finished")

# update annotations
instance_annotations_thing["annotations"] = []
instance_annotations_stuff["annotations"] = []
npz_names = os.listdir(entity_base_path)
thing_id = 0
stuff_id = 0

for index, npz_name in enumerate(npz_names):
    entity_info = np.load(os.path.join(entity_base_path, npz_name))
    image_id = int(npz_name.split(".")[0])
    bounding_boxes = entity_info["bounding_box"]
    entity_id_map = entity_info["map"]
    entity_id_map = entity_id_map[0]
    if len(bounding_boxes)==0:
        continue
    # 0-x1, 1-y1, 2-x2, 3-y2, 4-category, 5-thing_or_stuff, 6-entity_id
    thing_mask = bounding_boxes[:,5] > 0
    stuff_mask = bounding_boxes[:,5] == 0

    # begin thing
    thing_boxes = bounding_boxes[thing_mask]
    for thing_box in thing_boxes:
        x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = thing_box
        area = (y2-y1) * (x2-x1)
        if "val" in prefix:
            mask = (entity_id_map==entity_id)
            mask = np.array(mask, order="F", dtype="uint8")
            rle = mask_utils.encode(mask)
            rle["counts"] = rle["counts"].decode("utf-8")

        anno = {"iscrowd": 0,
                "area": area,
                "image_id": image_id,
                "bbox": [x1, y1, x2-x1, y2-y1],
                "category_id": category_id,
                "id": thing_id}
        if "val" in prefix:
            anno["segmentation"] = rle

        instance_annotations_thing["annotations"].append(anno)
        thing_id = thing_id + 1

    # begin stuff
    stuff_boxes = bounding_boxes[stuff_mask]
    for stuff_box in stuff_boxes:
        x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = stuff_box
        area = (y2-y1) * (x2-x1)
        if "val" in prefix:
            mask = (entity_id_map==entity_id)
            mask = np.array(mask, order="F", dtype="uint8")
            rle = mask_utils.encode(mask)
            rle["counts"] = rle["counts"].decode("utf-8")

        anno = {"iscrowd": 0,
                "area": area,
                "image_id": image_id,
                "bbox": [x1, y1, x2-x1, y2-y1],
                "category_id": category_id,
                "id": stuff_id}
        if "val" in prefix:
            anno["segmentation"] = rle

        instance_annotations_stuff["annotations"].append(anno)
        stuff_id = stuff_id + 1

    print("{},{}".format(index, npz_name))

mmcv.dump(instance_annotations_thing, save_thing_path)
mmcv.dump(instance_annotations_stuff, save_stuff_path)

thing_info = instance_annotations_thing
stuff_info = instance_annotations_stuff

thst = thing_info
thst["categories"].extend(stuff_info["categories"])
nums = len(thst["annotations"]) + 1
for index, anno in enumerate(stuff_info["annotations"]):
    anno["id"] = index + nums
    thst["annotations"].append(anno)
mmcv.dump(thst, save_entity_path)
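The catid_map / idcat_map pair above remaps sparse COCO category ids to contiguous indices and back, carrying the isthing flag along. A toy illustration with a made-up two-entry list in the COCO_CATEGORIES format (person and banner are real COCO categories, but the list here is invented for the demo):

```python
# Toy version of the id-remapping built above.
categories_list = [
    {"id": 1,  "isthing": 1, "name": "person", "supercategory": "person"},
    {"id": 92, "isthing": 0, "name": "banner", "supercategory": "textile"},
]
catid_map = {c["id"]: [cid, c["isthing"], c["name"], c["supercategory"]]
             for cid, c in enumerate(categories_list)}
idcat_map = {v[0]: [k, v[1]] for k, v in catid_map.items()}
print(catid_map)  # {1: [0, 1, 'person', 'person'], 92: [1, 0, 'banner', 'textile']}
print(idcat_map)  # {0: [1, 1], 1: [92, 0]}
```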
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py
ADDED
@@ -0,0 +1,119 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os, sys
import numpy as np
import pdb
import mmcv
import copy
import cv2
from collections import OrderedDict
from pycocotools.coco import COCO
import pycocotools.mask as mask_utils

import PIL.Image as Image
import matplotlib.pyplot as plt
from skimage.segmentation import find_boundaries
from panopticapi.utils import IdGenerator, rgb2id
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

thread_num = int(sys.argv[1])
thread_idx = int(sys.argv[2])
type_ = sys.argv[3]

OFFSET = 256 * 256 * 256

GT_base_path = "/data/ceph/gavinqi/data/coco"
GT_panoptic_png_path = os.path.join(GT_base_path, "panoptic_{}".format(type_))
GT_panoptic_json_path = os.path.join(GT_base_path, "annotations/panoptic_{}.json".format(type_))
GT_instance_json_path = os.path.join(GT_base_path, "annotations/instances_{}.json".format(type_))
save_base_path = os.path.join(GT_base_path, "entity_{}".format(type_))

if not os.path.exists(save_base_path):
    os.makedirs(save_base_path)

coco_g = mmcv.load(GT_panoptic_json_path)
categories_list = COCO_CATEGORIES
catid_map = {category['id']: [cid, category["isthing"]] for cid, category in enumerate(categories_list)}
idcat_map = {}
for key, value in catid_map.items():
    idcat_map[value[0]] = [key, value[1]]

name2panopticindex = OrderedDict()
id2name = OrderedDict()

for i_index, image_info in enumerate(coco_g["images"]):
    file_name = image_info["file_name"].split(".")[0]
    name2panopticindex[file_name] = {"i_index": i_index}
    id2name[image_info["id"]] = file_name

for a_index, ann in enumerate(coco_g["annotations"]):
    file_name = id2name[ann["image_id"]]
    name2panopticindex[file_name]["a_index"] = a_index
print("build name to panoptic index finished")

# imgs and instance_anns
instances_api = COCO(GT_instance_json_path)
img_ids = instances_api.getImgIds()
imgs = instances_api.loadImgs(img_ids)
instance_anns = [instances_api.imgToAnns[img_id] for img_id in img_ids]
assert len(name2panopticindex.keys()) == len(imgs)
imgs_instancesanns = list(zip(imgs, instance_anns))
print("build imgs and instance_anns finished")

for img_index, (img_dict, ann_dict_list) in enumerate(imgs_instancesanns):
    if img_index % thread_num != thread_idx:
        continue

    file_name = img_dict["file_name"].split(".")[0]
    image_h, image_w = img_dict["height"], img_dict["width"]

    ## panoptic mask from panoptic annotation
    panoptic_i_index, panoptic_a_index = name2panopticindex[file_name]["i_index"], name2panopticindex[file_name]["a_index"]
    panoptic_img_infos = coco_g["images"][panoptic_i_index]
    panoptic_ann_infos = coco_g["annotations"][panoptic_a_index]
    assert panoptic_img_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_img_infos"
    assert panoptic_ann_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_ann_infos"

    panoptic = np.array(Image.open(os.path.join(GT_panoptic_png_path, file_name+".png")), dtype=np.uint8)
    panoptic_id = rgb2id(panoptic)
    panoptic_entity_id = np.zeros(panoptic_id.shape, dtype=np.uint8)
    panoptic_class_id = np.zeros(panoptic_id.shape, dtype=np.uint8) + 255
    unique_panoptic_id = np.unique(panoptic_id)

    for ii, segment_info in enumerate(panoptic_ann_infos["segments_info"]):
        if segment_info["iscrowd"] == 1:
            continue
        old_entity_id = segment_info["id"]
        new_entity_id = ii + 1
        category = segment_info["category_id"]
        panoptic_entity_id[panoptic_id==old_entity_id] = new_entity_id
        panoptic_class_id[panoptic_id==old_entity_id] = catid_map[category][0]

    unique_ids = np.unique(panoptic_entity_id)
    count = 1

    bounding_box = []
    for entity_id in unique_ids:
        if entity_id == 0:
            continue
        mask = (panoptic_entity_id==entity_id).astype(np.uint8)
        category = int(np.unique(panoptic_class_id[panoptic_entity_id==entity_id]))

        finds_y, finds_x = np.where(mask==1)
        y1 = int(np.min(finds_y))
        y2 = int(np.max(finds_y))
        x1 = int(np.min(finds_x))
        x2 = int(np.max(finds_x))
        thing_or_stuff = int(idcat_map[category][1])
        bounding_box.append([x1, y1, x2, y2, category, thing_or_stuff, entity_id])

    bounding_box = np.array(bounding_box)

    panoptic_info = np.stack((panoptic_entity_id, panoptic_class_id), axis=0)
    np.savez(os.path.join(save_base_path, file_name), map=panoptic_info, bounding_box=bounding_box)

    print("{}, {}, {}".format(thread_idx, img_index, file_name))
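The panoptic PNG stores segment ids across the three color channels; rgb2id folds them back into one integer per pixel as id = R + 256 * G + 256^2 * B, which is also why the script defines OFFSET = 256**3. A quick check, assuming panopticapi is installed:

```python
import numpy as np
from panopticapi.utils import rgb2id

# One RGB pixel with R=10, G=2, B=0 folds to 10 + 2*256 = 522.
px = np.array([[[10, 2, 0]]], dtype=np.uint8)
print(rgb2id(px))  # [[522]]
```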
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh
ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/bash
thread_num=8
for((i=0;i<${thread_num};i++));do
{
    python3 make_entity_mask.py ${thread_num} ${i} train2017
}&
done
wait
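The script fans out thread_num background workers, one per shard index, and waits for all of them; each worker processes only the images where img_index % thread_num == thread_idx. A roughly equivalent Python driver (a sketch, not part of the repo) would be:

```python
import subprocess

# Launch one shard worker per index, mirroring make_entity_mask.sh,
# then block until every worker has exited.
thread_num = 8
procs = [subprocess.Popen(["python3", "make_entity_mask.py",
                           str(thread_num), str(i), "train2017"])
         for i in range(thread_num)]
for p in procs:
    p.wait()
```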
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile
ADDED
@@ -0,0 +1,9 @@
all:
	# install pycocotools locally
	python setup.py build_ext --inplace
	rm -rf build

install:
	# install pycocotools to the Python site-packages
	python setup.py build_ext install
	rm -rf build
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py
ADDED
@@ -0,0 +1 @@
__author__ = 'tylin'
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c
ADDED
The diff for this file is too large to render. See raw diff.
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx
ADDED
@@ -0,0 +1,308 @@
# distutils: language = c
# distutils: sources = ../common/maskApi.c

#**************************************************************************
# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at:  http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]
#**************************************************************************

__author__ = 'tsungyi'

import sys
PYTHON_VERSION = sys.version_info[0]

# import both Python-level and C-level symbols of Numpy
# the API uses Numpy to interface C and Python
import numpy as np
cimport numpy as np
from libc.stdlib cimport malloc, free

# initialize Numpy. must do.
np.import_array()

# import numpy C function
# we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible for memory management
cdef extern from "numpy/arrayobject.h":
    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

# Declare the prototype of the C functions in MaskApi.h
cdef extern from "maskApi.h":
    ctypedef unsigned int uint
    ctypedef unsigned long siz
    ctypedef unsigned char byte
    ctypedef double* BB
    ctypedef struct RLE:
        siz h,
        siz w,
        siz m,
        uint* cnts,
    void rlesInit( RLE **R, siz n )
    void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n )
    void rleDecode( const RLE *R, byte *mask, siz n )
    void rleMerge( const RLE *R, RLE *M, siz n, int intersect )
    void rleArea( const RLE *R, siz n, uint *a )
    void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o )
    void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o )
    void rleToBbox( const RLE *R, BB bb, siz n )
    void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n )
    void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w )
    char* rleToString( const RLE *R )
    void rleFrString( RLE *R, char *s, siz h, siz w )

# python class to wrap RLE array in C
# the class handles the memory allocation and deallocation
cdef class RLEs:
    cdef RLE *_R
    cdef siz _n

    def __cinit__(self, siz n =0):
        rlesInit(&self._R, n)
        self._n = n

    # free the RLE array here
    def __dealloc__(self):
        if self._R is not NULL:
            for i in range(self._n):
                free(self._R[i].cnts)
            free(self._R)
    def __getattr__(self, key):
        if key == 'n':
            return self._n
        raise AttributeError(key)

# python class to wrap Mask array in C
# the class handles the memory allocation and deallocation
cdef class Masks:
    cdef byte *_mask
    cdef siz _h
    cdef siz _w
    cdef siz _n

    def __cinit__(self, h, w, n):
        self._mask = <byte*> malloc(h*w*n* sizeof(byte))
        self._h = h
        self._w = w
        self._n = n
    # def __dealloc__(self):
        # the memory management of _mask has been passed to np.ndarray
        # it doesn't need to be freed here

    # called when passing into np.array() and return an np.ndarray in column-major order
    def __array__(self):
        cdef np.npy_intp shape[1]
        shape[0] = <np.npy_intp> self._h*self._w*self._n
        # Create a 1D array, and reshape it to fortran/Matlab column-major array
        ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F')
        # The _mask allocated by Masks is now handled by ndarray
        PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA)
        return ndarray

# internal conversion from Python RLEs object to compressed RLE format
def _toString(RLEs Rs):
    cdef siz n = Rs.n
    cdef bytes py_string
    cdef char* c_string
    objs = []
    for i in range(n):
        c_string = rleToString( <RLE*> &Rs._R[i] )
        py_string = c_string
        objs.append({
            'size': [Rs._R[i].h, Rs._R[i].w],
            'counts': py_string
        })
        free(c_string)
    return objs

# internal conversion from compressed RLE format to Python RLEs object
def _frString(rleObjs):
    cdef siz n = len(rleObjs)
    Rs = RLEs(n)
    cdef bytes py_string
    cdef char* c_string
    for i, obj in enumerate(rleObjs):
        if PYTHON_VERSION == 2:
            py_string = str(obj['counts']).encode('utf8')
        elif PYTHON_VERSION == 3:
            py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts']
        else:
            raise Exception('Python version must be 2 or 3')
        c_string = py_string
        rleFrString( <RLE*> &Rs._R[i], <char*> c_string, obj['size'][0], obj['size'][1] )
    return Rs

# encode mask to RLEs objects
# list of RLE string can be generated by RLEs member function
def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask):
    h, w, n = mask.shape[0], mask.shape[1], mask.shape[2]
    cdef RLEs Rs = RLEs(n)
    rleEncode(Rs._R,<byte*>mask.data,h,w,n)
    objs = _toString(Rs)
    return objs

# decode mask from compressed list of RLE string or RLEs object
def decode(rleObjs):
    cdef RLEs Rs = _frString(rleObjs)
    h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n
    masks = Masks(h, w, n)
    rleDecode(<RLE*>Rs._R, masks._mask, n)
    return np.array(masks)

def merge(rleObjs, intersect=0):
    cdef RLEs Rs = _frString(rleObjs)
    cdef RLEs R = RLEs(1)
    rleMerge(<RLE*>Rs._R, <RLE*> R._R, <siz> Rs._n, intersect)
    obj = _toString(R)[0]
    return obj

def area(rleObjs):
    cdef RLEs Rs = _frString(rleObjs)
    cdef uint* _a = <uint*> malloc(Rs._n* sizeof(uint))
    rleArea(Rs._R, Rs._n, _a)
    cdef np.npy_intp shape[1]
    shape[0] = <np.npy_intp> Rs._n
    a = np.array((Rs._n, ), dtype=np.uint8)
    a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a)
    PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA)
    return a

# iou computation. support function overload (RLEs-RLEs and bbox-bbox).
def iou( dt, gt, pyiscrowd ):
    def _preproc(objs):
        if len(objs) == 0:
            return objs
        if type(objs) == np.ndarray:
            if len(objs.shape) == 1:
                objs = objs.reshape((objs[0], 1))
            # check if it's Nx4 bbox
            if not len(objs.shape) == 2 or not objs.shape[1] == 4:
                raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension')
            objs = objs.astype(np.double)
        elif type(objs) == list:
            # check if list is in box format and convert it to np.ndarray
            isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs]))
            isrle = np.all(np.array([type(obj) == dict for obj in objs]))
            if isbox:
                objs = np.array(objs, dtype=np.double)
                if len(objs.shape) == 1:
                    objs = objs.reshape((1,objs.shape[0]))
            elif isrle:
                objs = _frString(objs)
            else:
                raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])')
        else:
            raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.')
        return objs
    def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
        rleIou( <RLE*> dt._R, <RLE*> gt._R, m, n, <byte*> iscrowd.data, <double*> _iou.data )
    def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
        bbIou( <BB> dt.data, <BB> gt.data, m, n, <byte*> iscrowd.data, <double*>_iou.data )
    def _len(obj):
        cdef siz N = 0
        if type(obj) == RLEs:
            N = obj.n
        elif len(obj)==0:
            pass
        elif type(obj) == np.ndarray:
            N = obj.shape[0]
        return N
    # convert iscrowd to numpy array
    cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8)
    # simple type checking
    cdef siz m, n
    dt = _preproc(dt)
    gt = _preproc(gt)
    m = _len(dt)
    n = _len(gt)
    if m == 0 or n == 0:
        return []
    if not type(dt) == type(gt):
        raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray')

    # define local variables
    cdef double* _iou = <double*> 0
    cdef np.npy_intp shape[1]
    # check type and assign iou function
    if type(dt) == RLEs:
        _iouFun = _rleIou
    elif type(dt) == np.ndarray:
        _iouFun = _bbIou
    else:
        raise Exception('input data type not allowed.')
    _iou = <double*> malloc(m*n* sizeof(double))
    iou = np.zeros((m*n, ), dtype=np.double)
    shape[0] = <np.npy_intp> m*n
    iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou)
    PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA)
    _iouFun(dt, gt, iscrowd, m, n, iou)
    return iou.reshape((m,n), order='F')

def toBbox( rleObjs ):
    cdef RLEs Rs = _frString(rleObjs)
    cdef siz n = Rs.n
    cdef BB _bb = <BB> malloc(4*n* sizeof(double))
    rleToBbox( <const RLE*> Rs._R, _bb, n )
    cdef np.npy_intp shape[1]
    shape[0] = <np.npy_intp> 4*n
    bb = np.array((1,4*n), dtype=np.double)
    bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4))
    PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA)
    return bb

def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ):
    cdef siz n = bb.shape[0]
    Rs = RLEs(n)
    rleFrBbox( <RLE*> Rs._R, <const BB> bb.data, h, w, n )
    objs = _toString(Rs)
    return objs

def frPoly( poly, siz h, siz w ):
    cdef np.ndarray[np.double_t, ndim=1] np_poly
    n = len(poly)
    Rs = RLEs(n)
    for i, p in enumerate(poly):
        np_poly = np.array(p, dtype=np.double, order='F')
        rleFrPoly( <RLE*>&Rs._R[i], <const double*> np_poly.data, int(len(p)/2), h, w )
    objs = _toString(Rs)
    return objs

def frUncompressedRLE(ucRles, siz h, siz w):
    cdef np.ndarray[np.uint32_t, ndim=1] cnts
    cdef RLE R
    cdef uint *data
    n = len(ucRles)
    objs = []
    for i in range(n):
        Rs = RLEs(1)
        cnts = np.array(ucRles[i]['counts'], dtype=np.uint32)
        # time for malloc can be saved here but it's fine
        data = <uint*> malloc(len(cnts)* sizeof(uint))
        for j in range(len(cnts)):
            data[j] = <uint> cnts[j]
        R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), <uint*> data)
        Rs._R[0] = R
        objs.append(_toString(Rs)[0])
    return objs

def frPyObjects(pyobj, h, w):
    # encode rle from a list of python objects
    if type(pyobj) == np.ndarray:
        objs = frBbox(pyobj, h, w)
    elif type(pyobj) == list and len(pyobj[0]) == 4:
        objs = frBbox(pyobj, h, w)
    elif type(pyobj) == list and len(pyobj[0]) > 4:
        objs = frPoly(pyobj, h, w)
    elif type(pyobj) == list and type(pyobj[0]) == dict \
            and 'counts' in pyobj[0] and 'size' in pyobj[0]:
        objs = frUncompressedRLE(pyobj, h, w)
    # encode rle from single python object
    elif type(pyobj) == list and len(pyobj) == 4:
        objs = frBbox([pyobj], h, w)[0]
    elif type(pyobj) == list and len(pyobj) > 4:
        objs = frPoly([pyobj], h, w)[0]
    elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj:
        objs = frUncompressedRLE([pyobj], h, w)[0]
    else:
        raise Exception('input type is not supported.')
    return objs
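The Python-facing entry points above are wrapped by pycocotools.mask. A short round trip through that API (this assumes a built pycocotools is on the path; note encode() requires a Fortran-ordered uint8 array of shape (h, w, n)):

```python
import numpy as np
import pycocotools.mask as mask_utils

# Encode a 2x2 square inside a 4x4 canvas, then query area/bbox and decode.
m = np.zeros((4, 4, 1), dtype=np.uint8, order='F')
m[1:3, 1:3, 0] = 1
rles = mask_utils.encode(m)
print(mask_utils.area(rles))    # [4]
print(mask_utils.toBbox(rles))  # [[1. 1. 2. 2.]]  (x, y, w, h)
back = mask_utils.decode(rles)
assert (back == m).all()
```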
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__author__ = 'tylin'
|
| 2 |
+
__version__ = '2.0'
|
| 3 |
+
# Interface for accessing the Microsoft COCO dataset.
|
| 4 |
+
|
| 5 |
+
# Microsoft COCO is a large image dataset designed for object detection,
|
| 6 |
+
# segmentation, and caption generation. pycocotools is a Python API that
|
| 7 |
+
# assists in loading, parsing and visualizing the annotations in COCO.
|
| 8 |
+
# Please visit http://mscoco.org/ for more information on COCO, including
|
| 9 |
+
# for the data, paper, and tutorials. The exact format of the annotations
|
| 10 |
+
# is also described on the COCO website. For example usage of the pycocotools
|
| 11 |
+
# please see pycocotools_demo.ipynb. In addition to this API, please download both
|
| 12 |
+
# the COCO images and annotations in order to run the demo.
|
| 13 |
+
|
| 14 |
+
# An alternative to using the API is to load the annotations directly
|
| 15 |
+
# into Python dictionary
|
| 16 |
+
# Using the API provides additional utility functions. Note that this API
|
| 17 |
+
# supports both *instance* and *caption* annotations. In the case of
|
| 18 |
+
# captions not all functions are defined (e.g. categories are undefined).
|
| 19 |
+
|
| 20 |
+
# The following API functions are defined:
|
| 21 |
+
# COCO - COCO api class that loads COCO annotation file and prepare data structures.
|
| 22 |
+
# decodeMask - Decode binary mask M encoded via run-length encoding.
|
| 23 |
+
# encodeMask - Encode binary mask M using run-length encoding.
|
| 24 |
+
# getAnnIds - Get ann ids that satisfy given filter conditions.
|
| 25 |
+
# getCatIds - Get cat ids that satisfy given filter conditions.
|
| 26 |
+
# getImgIds - Get img ids that satisfy given filter conditions.
|
| 27 |
+
# loadAnns - Load anns with the specified ids.
|
| 28 |
+
# loadCats - Load cats with the specified ids.
|
| 29 |
+
# loadImgs - Load imgs with the specified ids.
|
| 30 |
+
# annToMask - Convert segmentation in an annotation to binary mask.
|
| 31 |
+
# showAnns - Display the specified annotations.
|
| 32 |
+
# loadRes - Load algorithm results and create API for accessing them.
|
| 33 |
+
# download - Download COCO images from mscoco.org server.
|
| 34 |
+
# Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
|
| 35 |
+
# Help on each functions can be accessed by: "help COCO>function".
|
| 36 |
+
|
| 37 |
+
# See also COCO>decodeMask,
|
| 38 |
+
# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
|
| 39 |
+
# COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
|
| 40 |
+
# COCO>loadImgs, COCO>annToMask, COCO>showAnns
|
| 41 |
+
|
| 42 |
+
# Microsoft COCO Toolbox. version 2.0
|
| 43 |
+
# Data, paper, and tutorials available at: http://mscoco.org/
|
| 44 |
+
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
|
| 45 |
+
# Licensed under the Simplified BSD License [see bsd.txt]
|
| 46 |
+
|
| 47 |
+
import json
|
| 48 |
+
import time
|
| 49 |
+
import matplotlib.pyplot as plt
|
| 50 |
+
from matplotlib.collections import PatchCollection
|
| 51 |
+
from matplotlib.patches import Polygon
|
| 52 |
+
import numpy as np
|
| 53 |
+
import copy
|
| 54 |
+
import itertools
|
| 55 |
+
from . import mask as maskUtils
|
| 56 |
+
import os
|
| 57 |
+
from collections import defaultdict
|
| 58 |
+
import sys
|
| 59 |
+
PYTHON_VERSION = sys.version_info[0]
|
| 60 |
+
if PYTHON_VERSION == 2:
|
| 61 |
+
from urllib import urlretrieve
|
| 62 |
+
elif PYTHON_VERSION == 3:
|
| 63 |
+
from urllib.request import urlretrieve
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _isArrayLike(obj):
|
| 67 |
+
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class COCO:
|
| 71 |
+
def __init__(self, annotation_file=None, class_agnostic=False):
|
| 72 |
+
"""
|
| 73 |
+
Constructor of Microsoft COCO helper class for reading and visualizing annotations.
|
| 74 |
+
:param annotation_file (str): location of annotation file
|
| 75 |
+
:param image_folder (str): location to the folder that hosts images.
|
| 76 |
+
:return:
|
| 77 |
+
"""
|
| 78 |
+
# load dataset
|
| 79 |
+
self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
|
| 80 |
+
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
|
| 81 |
+
if not annotation_file == None:
|
| 82 |
+
print('loading annotations into memory...')
|
| 83 |
+
tic = time.time()
|
| 84 |
+
dataset = json.load(open(annotation_file, 'r'))
|
| 85 |
+
assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
|
| 86 |
+
print('Done (t={:0.2f}s)'.format(time.time()- tic))
|
| 87 |
+
self.dataset = dataset
|
| 88 |
+
if class_agnostic:
|
| 89 |
+
self.dataset = self.to_agnostic(dataset)
|
| 90 |
+
else:
|
| 91 |
+
self.dataset = dataset
|
| 92 |
+
self.createIndex()
|
| 93 |
+
|
| 94 |
+
def to_agnostic(self,dataset):
|
| 95 |
+
# dataset["categories"] = ["supercategory": "thing", "id":1, "name": "thing"]
|
| 96 |
+
dataset["categories"] = [{"supercategory": "thing", "id":1, "name": "thing"}]
|
| 97 |
+
nums = len(dataset["annotations"])
|
| 98 |
+
for ii in range(nums):
|
| 99 |
+
dataset["annotations"][ii]["category_id"] = 1
|
| 100 |
+
return dataset
|
| 101 |
+
|
| 102 |
+
def createIndex(self):
|
| 103 |
+
# create index
|
| 104 |
+
print('creating index...')
|
| 105 |
+
anns, cats, imgs = {}, {}, {}
|
| 106 |
+
imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
|
| 107 |
+
if 'annotations' in self.dataset:
|
| 108 |
+
for ann in self.dataset['annotations']:
|
| 109 |
+
imgToAnns[ann['image_id']].append(ann)
|
| 110 |
+
anns[ann['id']] = ann
|
| 111 |
+
|
| 112 |
+
if 'images' in self.dataset:
|
| 113 |
+
for img in self.dataset['images']:
|
| 114 |
+
imgs[img['id']] = img
|
| 115 |
+
|
| 116 |
+
if 'categories' in self.dataset:
|
| 117 |
+
for cat in self.dataset['categories']:
|
| 118 |
+
cats[cat['id']] = cat
|
| 119 |
+
|
| 120 |
+
if 'annotations' in self.dataset and 'categories' in self.dataset:
|
| 121 |
+
for ann in self.dataset['annotations']:
|
| 122 |
+
catToImgs[ann['category_id']].append(ann['image_id'])
|
| 123 |
+
|
| 124 |
+
print('index created!')
|
| 125 |
+
|
| 126 |
+
# create class members
|
| 127 |
+
self.anns = anns
|
| 128 |
+
self.imgToAnns = imgToAnns
|
| 129 |
+
self.catToImgs = catToImgs
|
| 130 |
+
self.imgs = imgs
|
| 131 |
+
self.cats = cats
|
| 132 |
+
|
| 133 |
+
def info(self):
|
| 134 |
+
"""
|
| 135 |
+
Print information about the annotation file.
|
| 136 |
+
:return:
|
| 137 |
+
"""
|
| 138 |
+
for key, value in self.dataset['info'].items():
|
| 139 |
+
print('{}: {}'.format(key, value))
|
| 140 |
+
|
| 141 |
+
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
|
| 142 |
+
"""
|
| 143 |
+
Get ann ids that satisfy given filter conditions. default skips that filter
|
| 144 |
+
:param imgIds (int array) : get anns for given imgs
|
| 145 |
+
catIds (int array) : get anns for given cats
|
| 146 |
+
areaRng (float array) : get anns for given area range (e.g. [0 inf])
|
| 147 |
+
iscrowd (boolean) : get anns for given crowd label (False or True)
|
| 148 |
+
:return: ids (int array) : integer array of ann ids
|
| 149 |
+
"""
|
| 150 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
| 151 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 152 |
+
|
| 153 |
+
if len(imgIds) == len(catIds) == len(areaRng) == 0:
|
| 154 |
+
anns = self.dataset['annotations']
|
| 155 |
+
else:
|
| 156 |
+
if not len(imgIds) == 0:
|
| 157 |
+
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
|
| 158 |
+
anns = list(itertools.chain.from_iterable(lists))
|
| 159 |
+
else:
|
| 160 |
+
anns = self.dataset['annotations']
|
| 161 |
+
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
|
| 162 |
+
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
|
| 163 |
+
if not iscrowd == None:
|
| 164 |
+
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
|
| 165 |
+
else:
|
| 166 |
+
ids = [ann['id'] for ann in anns]
|
| 167 |
+
return ids
|
| 168 |
+
|
| 169 |
+
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
|
| 170 |
+
"""
|
| 171 |
+
filtering parameters. default skips that filter.
|
| 172 |
+
:param catNms (str array) : get cats for given cat names
|
| 173 |
+
:param supNms (str array) : get cats for given supercategory names
|
| 174 |
+
:param catIds (int array) : get cats for given cat ids
|
| 175 |
+
:return: ids (int array) : integer array of cat ids
|
| 176 |
+
"""
|
| 177 |
+
catNms = catNms if _isArrayLike(catNms) else [catNms]
|
| 178 |
+
supNms = supNms if _isArrayLike(supNms) else [supNms]
|
| 179 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 180 |
+
|
| 181 |
+
if len(catNms) == len(supNms) == len(catIds) == 0:
|
| 182 |
+
cats = self.dataset['categories']
|
| 183 |
+
else:
|
| 184 |
+
cats = self.dataset['categories']
|
| 185 |
+
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
|
| 186 |
+
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
|
| 187 |
+
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
|
| 188 |
+
ids = [cat['id'] for cat in cats]
|
| 189 |
+
return ids
|
| 190 |
+
|
| 191 |
+
def getImgIds(self, imgIds=[], catIds=[]):
|
| 192 |
+
'''
|
| 193 |
+
Get img ids that satisfy given filter conditions.
|
| 194 |
+
:param imgIds (int array) : get imgs for given ids
|
| 195 |
+
:param catIds (int array) : get imgs with all given cats
|
| 196 |
+
:return: ids (int array) : integer array of img ids
|
| 197 |
+
'''
|
| 198 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
| 199 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 200 |
+
|
| 201 |
+
if len(imgIds) == len(catIds) == 0:
|
| 202 |
+
ids = self.imgs.keys()
|
| 203 |
+
else:
|
| 204 |
+
ids = set(imgIds)
|
| 205 |
+
for i, catId in enumerate(catIds):
|
| 206 |
+
if i == 0 and len(ids) == 0:
|
| 207 |
+
ids = set(self.catToImgs[catId])
|
| 208 |
+
else:
|
| 209 |
+
ids &= set(self.catToImgs[catId])
|
| 210 |
+
return list(ids)
|
| 211 |
+
|
| 212 |
+
def loadAnns(self, ids=[]):
|
| 213 |
+
"""
|
| 214 |
+
Load anns with the specified ids.
|
| 215 |
+
:param ids (int array) : integer ids specifying anns
|
| 216 |
+
:return: anns (object array) : loaded ann objects
|
| 217 |
+
"""
|
| 218 |
+
if _isArrayLike(ids):
|
| 219 |
+
return [self.anns[id] for id in ids]
|
| 220 |
+
elif type(ids) == int:
|
| 221 |
+
return [self.anns[ids]]
|
| 222 |
+
|
| 223 |
+
def loadCats(self, ids=[]):
|
| 224 |
+
"""
|
| 225 |
+
Load cats with the specified ids.
|
| 226 |
+
:param ids (int array) : integer ids specifying cats
|
| 227 |
+
:return: cats (object array) : loaded cat objects
|
| 228 |
+
"""
|
| 229 |
+
if _isArrayLike(ids):
|
| 230 |
+
return [self.cats[id] for id in ids]
|
| 231 |
+
elif type(ids) == int:
|
| 232 |
+
return [self.cats[ids]]
|
| 233 |
+
|
| 234 |
+
def loadImgs(self, ids=[]):
|
| 235 |
+
"""
|
| 236 |
+
Load anns with the specified ids.
|
| 237 |
+
:param ids (int array) : integer ids specifying img
|
| 238 |
+
:return: imgs (object array) : loaded img objects
|
| 239 |
+
"""
|
| 240 |
+
if _isArrayLike(ids):
|
| 241 |
+
return [self.imgs[id] for id in ids]
|
| 242 |
+
elif type(ids) == int:
|
| 243 |
+
return [self.imgs[ids]]
|
| 244 |
+
|
| 245 |
+
def showAnns(self, anns, draw_bbox=False):
|
| 246 |
+
"""
|
| 247 |
+
Display the specified annotations.
|
| 248 |
+
:param anns (array of object): annotations to display
|
| 249 |
+
:return: None
|
| 250 |
+
"""
|
| 251 |
+
if len(anns) == 0:
|
| 252 |
+
return 0
|
| 253 |
+
if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
|
| 254 |
+
datasetType = 'instances'
|
| 255 |
+
elif 'caption' in anns[0]:
|
| 256 |
+
datasetType = 'captions'
|
| 257 |
+
else:
|
| 258 |
+
raise Exception('datasetType not supported')
|
| 259 |
+
if datasetType == 'instances':
|
| 260 |
+
ax = plt.gca()
|
| 261 |
+
ax.set_autoscale_on(False)
|
| 262 |
+
polygons = []
|
| 263 |
+
color = []
|
| 264 |
+
for ann in anns:
|
| 265 |
+
c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
|
| 266 |
+
if 'segmentation' in ann:
|
| 267 |
+
if type(ann['segmentation']) == list:
|
| 268 |
+
# polygon
|
| 269 |
+
for seg in ann['segmentation']:
|
| 270 |
+
poly = np.array(seg).reshape((int(len(seg)/2), 2))
|
| 271 |
+
polygons.append(Polygon(poly))
|
| 272 |
+
color.append(c)
|
| 273 |
+
else:
|
| 274 |
+
# mask
|
| 275 |
+
t = self.imgs[ann['image_id']]
|
| 276 |
+
if type(ann['segmentation']['counts']) == list:
|
| 277 |
+
rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
|
| 278 |
+
else:
|
| 279 |
+
rle = [ann['segmentation']]
|
| 280 |
+
m = maskUtils.decode(rle)
|
| 281 |
+
img = np.ones( (m.shape[0], m.shape[1], 3) )
|
| 282 |
+
if ann['iscrowd'] == 1:
|
| 283 |
+
color_mask = np.array([2.0,166.0,101.0])/255
|
| 284 |
+
if ann['iscrowd'] == 0:
|
| 285 |
+
color_mask = np.random.random((1, 3)).tolist()[0]
|
| 286 |
+
for i in range(3):
|
| 287 |
+
img[:,:,i] = color_mask[i]
|
| 288 |
+
ax.imshow(np.dstack( (img, m*0.5) ))
|
| 289 |
+
if 'keypoints' in ann and type(ann['keypoints']) == list:
|
| 290 |
+
# turn skeleton into zero-based index
|
| 291 |
+
sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
|
| 292 |
+
kp = np.array(ann['keypoints'])
|
| 293 |
+
x = kp[0::3]
|
| 294 |
+
y = kp[1::3]
|
| 295 |
+
v = kp[2::3]
|
| 296 |
+
for sk in sks:
|
| 297 |
+
if np.all(v[sk]>0):
|
| 298 |
+
plt.plot(x[sk],y[sk], linewidth=3, color=c)
|
| 299 |
+
plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
|
| 300 |
+
plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
|
| 301 |
+
|
| 302 |
+
if draw_bbox:
|
| 303 |
+
[bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
|
| 304 |
+
poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
|
| 305 |
+
np_poly = np.array(poly).reshape((4,2))
|
| 306 |
+
polygons.append(Polygon(np_poly))
|
| 307 |
+
color.append(c)
|
| 308 |
+
|
| 309 |
+
p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
|
| 310 |
+
ax.add_collection(p)
|
| 311 |
+
p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
|
| 312 |
+
ax.add_collection(p)
|
| 313 |
+
elif datasetType == 'captions':
|
| 314 |
+
for ann in anns:
|
| 315 |
+
print(ann['caption'])
|
| 316 |
+
|
| 317 |
+
def loadRes(self, resFile):
|
| 318 |
+
"""
|
| 319 |
+
Load result file and return a result api object.
|
| 320 |
+
:param resFile (str) : file name of result file
|
| 321 |
+
:return: res (obj) : result api object
|
| 322 |
+
"""
|
| 323 |
+
res = COCO()
|
| 324 |
+
res.dataset['images'] = [img for img in self.dataset['images']]
|
| 325 |
+
|
| 326 |
+
print('Loading and preparing results...')
|
| 327 |
+
tic = time.time()
|
| 328 |
+
if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
|
| 329 |
+
anns = json.load(open(resFile))
|
| 330 |
+
elif type(resFile) == np.ndarray:
|
| 331 |
+
anns = self.loadNumpyAnnotations(resFile)
|
| 332 |
+
else:
|
| 333 |
+
anns = resFile
|
| 334 |
+
assert type(anns) == list, 'results in not an array of objects'
|
| 335 |
+
annsImgIds = [ann['image_id'] for ann in anns]
|
| 336 |
+
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
|
| 337 |
+
'Results do not correspond to current coco set'
|
| 338 |
+
if 'caption' in anns[0]:
|
| 339 |
+
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
|
| 340 |
+
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
|
| 341 |
+
for id, ann in enumerate(anns):
|
| 342 |
+
ann['id'] = id+1
|
| 343 |
+
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
|
| 344 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 345 |
+
for id, ann in enumerate(anns):
|
| 346 |
+
bb = ann['bbox']
|
| 347 |
+
x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
|
| 348 |
+
if not 'segmentation' in ann:
|
| 349 |
+
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
|
| 350 |
+
ann['area'] = bb[2]*bb[3]
|
| 351 |
+
ann['id'] = id+1
|
| 352 |
+
ann['iscrowd'] = 0
|
| 353 |
+
elif 'segmentation' in anns[0]:
|
| 354 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 355 |
+
for id, ann in enumerate(anns):
|
| 356 |
+
# now only support compressed RLE format as segmentation results
|
| 357 |
+
ann['area'] = maskUtils.area(ann['segmentation'])
|
| 358 |
+
if not 'bbox' in ann:
|
| 359 |
+
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
|
| 360 |
+
ann['id'] = id+1
|
| 361 |
+
ann['iscrowd'] = 0
|
| 362 |
+
elif 'keypoints' in anns[0]:
|
| 363 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 364 |
+
for id, ann in enumerate(anns):
|
| 365 |
+
s = ann['keypoints']
|
| 366 |
+
x = s[0::3]
|
| 367 |
+
y = s[1::3]
|
| 368 |
+
x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
|
| 369 |
+
ann['area'] = (x1-x0)*(y1-y0)
|
| 370 |
+
ann['id'] = id + 1
|
| 371 |
+
ann['bbox'] = [x0,y0,x1-x0,y1-y0]
|
| 372 |
+
print('DONE (t={:0.2f}s)'.format(time.time()- tic))
|
| 373 |
+
|
| 374 |
+
res.dataset['annotations'] = anns
|
| 375 |
+
res.createIndex()
|
| 376 |
+
return res
|
| 377 |
+
|
| 378 |
+
def download(self, tarDir = None, imgIds = [] ):
|
| 379 |
+
'''
|
| 380 |
+
Download COCO images from mscoco.org server.
|
| 381 |
+
:param tarDir (str): COCO results directory name
|
| 382 |
+
imgIds (list): images to be downloaded
|
| 383 |
+
:return:
|
| 384 |
+
'''
|
| 385 |
+
if tarDir is None:
|
| 386 |
+
print('Please specify target directory')
|
| 387 |
+
return -1
|
| 388 |
+
if len(imgIds) == 0:
|
| 389 |
+
imgs = self.imgs.values()
|
| 390 |
+
else:
|
| 391 |
+
imgs = self.loadImgs(imgIds)
|
| 392 |
+
N = len(imgs)
|
| 393 |
+
if not os.path.exists(tarDir):
|
| 394 |
+
os.makedirs(tarDir)
|
| 395 |
+
for i, img in enumerate(imgs):
|
| 396 |
+
tic = time.time()
|
| 397 |
+
fname = os.path.join(tarDir, img['file_name'])
|
| 398 |
+
if not os.path.exists(fname):
|
| 399 |
+
urlretrieve(img['coco_url'], fname)
|
| 400 |
+
print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
|
| 401 |
+
|
| 402 |
+
def loadNumpyAnnotations(self, data):
|
| 403 |
+
"""
|
| 404 |
+
Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
|
| 405 |
+
:param data (numpy.ndarray)
|
| 406 |
+
:return: annotations (python nested list)
|
| 407 |
+
"""
|
| 408 |
+
print('Converting ndarray to lists...')
|
| 409 |
+
assert(type(data) == np.ndarray)
|
| 410 |
+
print(data.shape)
|
| 411 |
+
assert(data.shape[1] == 7)
|
| 412 |
+
N = data.shape[0]
|
| 413 |
+
ann = []
|
| 414 |
+
for i in range(N):
|
| 415 |
+
if i % 1000000 == 0:
|
| 416 |
+
print('{}/{}'.format(i,N))
|
| 417 |
+
ann += [{
|
| 418 |
+
'image_id' : int(data[i, 0]),
|
| 419 |
+
'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
|
| 420 |
+
'score' : data[i, 5],
|
| 421 |
+
'category_id': int(data[i, 6]),
|
| 422 |
+
}]
|
| 423 |
+
return ann
|
| 424 |
+
|
| 425 |
+
    def annToRLE(self, ann):
        """
        Convert annotation which can be polygons, uncompressed RLE to RLE.
        :return: RLE (compressed run-length encoding)
        """
        t = self.imgs[ann['image_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentation']
        if type(segm) == list:
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif type(segm['counts']) == list:
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = ann['segmentation']
        return rle
    def annToMask(self, ann):
        """
        Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann)
        m = maskUtils.decode(rle)
        return m
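
annToRLE normalizes polygons and uncompressed RLE to compressed RLE; annToMask then decodes it. A sketch, again with coco_gt from earlier:

    ann = coco_gt.loadAnns(coco_gt.getAnnIds(imgIds=coco_gt.getImgIds()[:1]))[0]
    m = coco_gt.annToMask(ann)   # HxW numpy array, 1 inside the object, 0 elsewhere
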
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py
ADDED
@@ -0,0 +1,534 @@
__author__ = 'tsungyi'

import numpy as np
import datetime
import time
from collections import defaultdict
from . import mask as maskUtils
import copy

class COCOeval:
    # Interface for evaluating detection on the Microsoft COCO dataset.
    #
    # The usage for CocoEval is as follows:
    #  cocoGt=..., cocoDt=...        # load dataset and results
    #  E = CocoEval(cocoGt,cocoDt);  # initialize CocoEval object
    #  E.params.recThrs = ...;       # set parameters as desired
    #  E.evaluate();                 # run per image evaluation
    #  E.accumulate();               # accumulate per image results
    #  E.summarize();                # display summary metrics of results
    # For example usage see evalDemo.m and http://mscoco.org/.
    #
    # The evaluation parameters are as follows (defaults in brackets):
    #  imgIds  - [all] N img ids to use for evaluation
    #  catIds  - [all] K cat ids to use for evaluation
    #  iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
    #  recThrs - [0:.01:1] R=101 recall thresholds for evaluation
    #  areaRng - [...] A=4 object area ranges for evaluation
    #  maxDets - [1 10 100] M=3 thresholds on max detections per image
    #  iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
    #  iouType replaced the now DEPRECATED useSegm parameter.
    #  useCats - [1] if true use category labels for evaluation
    # Note: if useCats=0 category labels are ignored as in proposal scoring.
    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
    #
    # evaluate(): evaluates detections on every image and every category and
    # concats the results into the "evalImgs" with fields:
    #  dtIds     - [1xD] id for each of the D detections (dt)
    #  gtIds     - [1xG] id for each of the G ground truths (gt)
    #  dtMatches - [TxD] matching gt id at each IoU or 0
    #  gtMatches - [TxG] matching dt id at each IoU or 0
    #  dtScores  - [1xD] confidence of each dt
    #  gtIgnore  - [1xG] ignore flag for each gt
    #  dtIgnore  - [TxD] ignore flag for each dt at each IoU
    #
    # accumulate(): accumulates the per-image, per-category evaluation
    # results in "evalImgs" into the dictionary "eval" with fields:
    #  params    - parameters used for evaluation
    #  date      - date evaluation was performed
    #  counts    - [T,R,K,A,M] parameter dimensions (see above)
    #  precision - [TxRxKxAxM] precision for every evaluation setting
    #  recall    - [TxKxAxM] max recall for every evaluation setting
    # Note: precision and recall==-1 for settings with no gt objects.
    #
    # See also coco, mask, pycocoDemo, pycocoEvalDemo
    #
    # Microsoft COCO Toolbox.      version 2.0
    # Data, paper, and tutorials available at: http://mscoco.org/
    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
    # Licensed under the Simplified BSD License [see coco/license.txt]
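
The usage comments above translate into the following runnable sketch (the annotation/result file names are placeholders):

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    coco_gt = COCO('instances_val.json')
    coco_dt = coco_gt.loadRes('bbox_results.json')
    E = COCOeval(coco_gt, coco_dt, iouType='bbox')
    E.evaluate()    # per-image, per-category matching
    E.accumulate()  # fill the [T,R,K,A,M] precision/recall tables
    E.summarize()   # print the 12 standard AP/AR metrics
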
    def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
        '''
        Initialize CocoEval using coco APIs for gt and dt
        :param cocoGt: coco object with ground truth annotations
        :param cocoDt: coco object with detection results
        :return: None
        '''
        if not iouType:
            print('iouType not specified. use default iouType segm')
        self.cocoGt = cocoGt                  # ground truth COCO API
        self.cocoDt = cocoDt                  # detections COCO API
        self.evalImgs = defaultdict(list)     # per-image per-category evaluation results [KxAxI] elements
        self.eval = {}                        # accumulated evaluation results
        self._gts = defaultdict(list)         # gt for evaluation
        self._dts = defaultdict(list)         # dt for evaluation
        self.params = Params(iouType=iouType) # parameters
        self._paramsEval = {}                 # parameters for evaluation
        self.stats = []                       # result summarization
        self.ious = {}                        # ious between all gts and dts
        if not cocoGt is None:
            self.params.imgIds = sorted(cocoGt.getImgIds())
            self.params.catIds = sorted(cocoGt.getCatIds())
    def _prepare(self):
        '''
        Prepare ._gts and ._dts for evaluation based on params
        :return: None
        '''
        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann['segmentation'] = rle
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == 'segm':
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
            gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
            if p.iouType == 'keypoints':
                gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
        self._gts = defaultdict(list)       # gt for evaluation
        self._dts = defaultdict(list)       # dt for evaluation
        for gt in gts:
            self._gts[gt['image_id'], gt['category_id']].append(gt)
        for dt in dts:
            self._dts[dt['image_id'], dt['category_id']].append(dt)
        self.evalImgs = defaultdict(list)   # per-image per-category evaluation results
        self.eval = {}                      # accumulated evaluation results
    def evaluate(self):
        '''
        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
        :return: None
        '''
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if not p.useSegm is None:
            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params = p

        self._prepare()
        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == 'segm' or p.iouType == 'bbox':
            computeIoU = self.computeIoU
        elif p.iouType == 'keypoints':
            computeIoU = self.computeOks
        self.ious = {(imgId, catId): computeIoU(imgId, catId) \
                        for imgId in p.imgIds
                        for catId in catIds}

        evaluateImg = self.evaluateImg
        maxDet = p.maxDets[-1]
        self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
                            for catId in catIds
                            for areaRng in p.areaRng
                            for imgId in p.imgIds
                        ]
        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic))
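
evaluate() flattens evalImgs in category-major, then area-range, then image order; accumulate() below depends on exactly this layout. The index arithmetic, spelled out as a sketch (not part of the API):

    def eval_img_index(k, a, i, A, I):
        # position of the (catIds[k], areaRng[a], imgIds[i]) result in self.evalImgs,
        # matching Nk = k0*A0*I0 and Na = a0*I0 in accumulate()
        return k * A * I + a * I + i
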
    def computeIoU(self, imgId, catId):
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in inds]
        if len(dt) > p.maxDets[-1]:
            dt = dt[0:p.maxDets[-1]]

        if p.iouType == 'segm':
            g = [g['segmentation'] for g in gt]
            d = [d['segmentation'] for d in dt]
        elif p.iouType == 'bbox':
            g = [g['bbox'] for g in gt]
            d = [d['bbox'] for d in dt]
        else:
            raise Exception('unknown iouType for iou computation')

        # compute iou between each dt and gt region
        iscrowd = [int(o['iscrowd']) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious
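
maskUtils.iou accepts [x,y,w,h] boxes (or RLEs) plus the per-gt iscrowd flags; a toy check:

    import numpy as np
    from pycocotools import mask as maskUtils

    d = [[0, 0, 10, 10]]             # one detection box, xywh
    g = [[5, 0, 10, 10]]             # one ground-truth box
    print(maskUtils.iou(d, g, [0]))  # [[0.3333...]]: intersection 50, union 150
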
    def computeOks(self, imgId, catId):
        p = self.params
        # dimension here should be Nxm
        gts = self._gts[imgId, catId]
        dts = self._dts[imgId, catId]
        inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
        dts = [dts[i] for i in inds]
        if len(dts) > p.maxDets[-1]:
            dts = dts[0:p.maxDets[-1]]
        # if len(gts) == 0 and len(dts) == 0:
        if len(gts) == 0 or len(dts) == 0:
            return []
        ious = np.zeros((len(dts), len(gts)))
        sigmas = p.kpt_oks_sigmas
        vars = (sigmas * 2)**2
        k = len(sigmas)
        # compute oks between each detection and ground truth object
        for j, gt in enumerate(gts):
            # create bounds for ignore regions (double the gt bbox)
            g = np.array(gt['keypoints'])
            xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
            k1 = np.count_nonzero(vg > 0)
            bb = gt['bbox']
            x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
            y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
            for i, dt in enumerate(dts):
                d = np.array(dt['keypoints'])
                xd = d[0::3]; yd = d[1::3]
                if k1 > 0:
                    # measure the per-keypoint distance if keypoints visible
                    dx = xd - xg
                    dy = yd - yg
                else:
                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
                    z = np.zeros((k))
                    dx = np.max((z, x0-xd), axis=0) + np.max((z, xd-x1), axis=0)
                    dy = np.max((z, y0-yd), axis=0) + np.max((z, yd-y1), axis=0)
                e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
                if k1 > 0:
                    e = e[vg > 0]
                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
        return ious
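
For reference, the loop above implements the standard OKS: per visible keypoint, exp(-d_i^2 / (2 * area * (2*sigma_i)^2)), averaged over the visible keypoints (note vars = (2*sigmas)**2). A one-keypoint numeric check:

    import numpy as np
    sigma, area, d = 0.25, 100.0, 5.0
    print(np.exp(-d**2 / ((2*sigma)**2 * area * 2)))  # ~0.6065, same as exp(-e) above
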
    def evaluateImg(self, imgId, catId, aRng, maxDet):
        '''
        perform evaluation for single category and image
        :return: dict (single image results)
        '''
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return None

        for g in gt:
            if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]):
                g['_ignore'] = 1
            else:
                g['_ignore'] = 0

        # sort dt highest score first, sort gt ignore last
        gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
        gt = [gt[i] for i in gtind]
        dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in dtind[0:maxDet]]
        iscrowd = [int(o['iscrowd']) for o in gt]
        # load computed ious
        ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]

        T = len(p.iouThrs)
        G = len(gt)
        D = len(dt)
        gtm = np.zeros((T, G))
        dtm = np.zeros((T, D))
        gtIg = np.array([g['_ignore'] for g in gt])
        dtIg = np.zeros((T, D))
        if not len(ious) == 0:
            for tind, t in enumerate(p.iouThrs):
                for dind, d in enumerate(dt):
                    # information about best match so far (m=-1 -> unmatched)
                    iou = min([t, 1-1e-10])
                    m = -1
                    for gind, g in enumerate(gt):
                        # if this gt already matched, and not a crowd, continue
                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
                            continue
                        # if dt matched to reg gt, and on ignore gt, stop
                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
                            break
                        # continue to next gt unless better match made
                        if ious[dind, gind] < iou:
                            continue
                        # if match successful and best so far, store appropriately
                        iou = ious[dind, gind]
                        m = gind
                    # if match made store id of match for both dt and gt
                    if m == -1:
                        continue
                    dtIg[tind, dind] = gtIg[m]
                    dtm[tind, dind] = gt[m]['id']
                    gtm[tind, m] = d['id']
        # set unmatched detections outside of area range to ignore
        a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] for d in dt]).reshape((1, len(dt)))
        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
        # store results for given image and category
        return {
                'image_id':     imgId,
                'category_id':  catId,
                'aRng':         aRng,
                'maxDet':       maxDet,
                'dtIds':        [d['id'] for d in dt],
                'gtIds':        [g['id'] for g in gt],
                'dtMatches':    dtm,
                'gtMatches':    gtm,
                'dtScores':     [d['score'] for d in dt],
                'gtIgnore':     gtIg,
                'dtIgnore':     dtIg,
            }
    def accumulate(self, p=None):
        '''
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        '''
        print('Accumulating evaluation results...')
        tic = time.time()
        if not self.evalImgs:
            print('Please run evaluate() first')
        # allows input customized parameters
        if p is None:
            p = self.params
        p.catIds = p.catIds if p.useCats == 1 else [-1]
        T = len(p.iouThrs)
        R = len(p.recThrs)
        K = len(p.catIds) if p.useCats else 1
        A = len(p.areaRng)
        M = len(p.maxDets)
        precision = -np.ones((T, R, K, A, M))  # -1 for the precision of absent categories
        recall = -np.ones((T, K, A, M))
        scores = -np.ones((T, R, K, A, M))

        # create dictionary for future indexing
        _pe = self._paramsEval
        catIds = _pe.catIds if _pe.useCats else [-1]
        setK = set(catIds)
        setA = set(map(tuple, _pe.areaRng))
        setM = set(_pe.maxDets)
        setI = set(_pe.imgIds)
        # get inds to evaluate
        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
        I0 = len(_pe.imgIds)
        A0 = len(_pe.areaRng)
        # retrieve E at each category, area range, and max number of detections
        for k, k0 in enumerate(k_list):
            Nk = k0*A0*I0
            for a, a0 in enumerate(a_list):
                Na = a0*I0
                for m, maxDet in enumerate(m_list):
                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
                    E = [e for e in E if not e is None]
                    if len(E) == 0:
                        continue
                    dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])

                    # different sorting method generates slightly different results.
                    # mergesort is used to be consistent as Matlab implementation.
                    inds = np.argsort(-dtScores, kind='mergesort')
                    dtScoresSorted = dtScores[inds]

                    dtm = np.concatenate([e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:, inds]
                    dtIg = np.concatenate([e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:, inds]
                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
                    npig = np.count_nonzero(gtIg == 0)
                    if npig == 0:
                        continue
                    tps = np.logical_and(dtm, np.logical_not(dtIg))
                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))

                    # np.float was removed from modern NumPy; np.float64 is the equivalent
                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                        tp = np.array(tp)
                        fp = np.array(fp)
                        nd = len(tp)
                        rc = tp / npig
                        pr = tp / (fp+tp+np.spacing(1))
                        q = np.zeros((R,))
                        ss = np.zeros((R,))

                        if nd:
                            recall[t, k, a, m] = rc[-1]
                        else:
                            recall[t, k, a, m] = 0

                        # numpy is slow without cython optimization for accessing elements
                        # use python array gets significant speed improvement
                        pr = pr.tolist(); q = q.tolist()

                        for i in range(nd-1, 0, -1):
                            if pr[i] > pr[i-1]:
                                pr[i-1] = pr[i]

                        inds = np.searchsorted(rc, p.recThrs, side='left')
                        try:
                            for ri, pi in enumerate(inds):
                                q[ri] = pr[pi]
                                ss[ri] = dtScoresSorted[pi]
                        except:
                            pass
                        precision[t, :, k, a, m] = np.array(q)
                        scores[t, :, k, a, m] = np.array(ss)
        self.eval = {
            'params': p,
            'counts': [T, R, K, A, M],
            'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'precision': precision,
            'recall': recall,
            'scores': scores,
        }
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic))
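
With the tables filled, the overall AP@[.5:.95] is just the mean over valid precision entries at area 'all' and the largest maxDets; a sketch reproducing _summarize(1) below, with E from the earlier example:

    import numpy as np
    p = E.params
    a = p.areaRngLbl.index('all')
    m = len(p.maxDets) - 1
    s = E.eval['precision'][:, :, :, a, m]
    print('AP@[.5:.95]:', np.mean(s[s > -1]))  # -1 cells (no gt) are excluded
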
    def summarize(self):
        '''
        Compute and display summary metrics for evaluation results.
        Note this function can *only* be applied on the default parameter setting
        '''
        def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
            p = self.params
            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
            typeStr = '(AP)' if ap == 1 else '(AR)'
            iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
                if iouThr is None else '{:0.2f}'.format(iouThr)

            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
            if ap == 1:
                # dimension of precision: [TxRxKxAxM]
                s = self.eval['precision']
                # IoU
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:, :, :, aind, mind]
            else:
                # dimension of recall: [TxKxAxM]
                s = self.eval['recall']
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:, :, aind, mind]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
            return mean_s
        def _summarizeDets():
            stats = np.zeros((12,))
            stats[0] = _summarize(1)
            stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
            stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
            stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
            stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
            stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
            stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
            return stats
        def _summarizeKps():
            stats = np.zeros((10,))
            stats[0] = _summarize(1, maxDets=20)
            stats[1] = _summarize(1, maxDets=20, iouThr=.5)
            stats[2] = _summarize(1, maxDets=20, iouThr=.75)
            stats[3] = _summarize(1, maxDets=20, areaRng='medium')
            stats[4] = _summarize(1, maxDets=20, areaRng='large')
            stats[5] = _summarize(0, maxDets=20)
            stats[6] = _summarize(0, maxDets=20, iouThr=.5)
            stats[7] = _summarize(0, maxDets=20, iouThr=.75)
            stats[8] = _summarize(0, maxDets=20, areaRng='medium')
            stats[9] = _summarize(0, maxDets=20, areaRng='large')
            return stats
        if not self.eval:
            raise Exception('Please run accumulate() first')
        iouType = self.params.iouType
        if iouType == 'segm' or iouType == 'bbox':
            summarize = _summarizeDets
        elif iouType == 'keypoints':
            summarize = _summarizeKps
        self.stats = summarize()

    def __str__(self):
        self.summarize()
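
_summarizeDets fills self.stats in the fixed order shown above; a sketch that labels the entries after summarize(), for the detection iouTypes:

    names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl',
             'AR1', 'AR10', 'AR100', 'ARs', 'ARm', 'ARl']
    print(dict(zip(names, E.stats)))
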
class Params:
    '''
    Params for coco evaluation api
    '''
    def setDetParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'small', 'medium', 'large']
        self.useCats = 1

    def setKpParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'medium', 'large']
        self.useCats = 1
        self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89])/10.0

    def __init__(self, iouType='segm'):
        if iouType == 'segm' or iouType == 'bbox':
            self.setDetParams()
        elif iouType == 'keypoints':
            self.setKpParams()
        else:
            raise Exception('iouType not supported')
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
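
Params fields can be narrowed before evaluate(); a sketch restricting evaluation to one class and an image subset, with coco_gt/coco_dt as in the earlier sketch:

    E = COCOeval(coco_gt, coco_dt, iouType='bbox')
    E.params.catIds = coco_gt.getCatIds(catNms=['person'])
    E.params.imgIds = coco_gt.getImgIds()[:100]
    E.evaluate(); E.accumulate(); E.summarize()
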