chore: vendor third_party (remove submodules, ignore artifacts)
- .gitignore +16 -0
- MaskClustering/third_party/Entity +0 -1
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml +43 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml +43 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml +42 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml +40 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml +51 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml +50 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml +51 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py +172 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py +5 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py +298 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py +464 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py +723 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py +102 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py +112 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py +4 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py +52 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py +111 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py +54 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py +26 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py +489 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py +100 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py +52 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py +191 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py +523 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py +2 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py +303 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py +71 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py +53 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py +59 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py +190 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py +123 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py +119 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh +8 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile +9 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py +1 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c +0 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx +308 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py +453 -0
- MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py +534 -0
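Note: this commit replaces the MaskClustering/third_party/Entity git submodule with a vendored copy of its tree. The exact steps are not recorded in the diff, but a typical conversion sequence (paths here mirror this repo; the checkout location is illustrative) is roughly:

    git submodule deinit -f MaskClustering/third_party/Entity
    git rm -f MaskClustering/third_party/Entity      # drops the gitlink and its .gitmodules entry
    rm -rf .git/modules/MaskClustering/third_party/Entity
    cp -r /path/to/Entity-checkout MaskClustering/third_party/Entity
    rm -rf MaskClustering/third_party/Entity/.git    # the copy must not look like a nested repo
    git add MaskClustering/third_party/Entity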
.gitignore CHANGED
@@ -154,3 +154,19 @@ temp/
 **/*.bin
 data/
 **/*.pth
+
+# macOS junk
+.DS_Store
+**/.DS_Store
+
+# Don't commit build artifacts / compiled binaries from third_party
+MaskClustering/third_party/**/__pycache__/
+MaskClustering/third_party/**/*.pyc
+MaskClustering/third_party/**/*.pyo
+MaskClustering/third_party/**/build/
+MaskClustering/third_party/**/dist/
+MaskClustering/third_party/**/*.o
+MaskClustering/third_party/**/*.so
+
+# HF Hub limit: keep large docs assets out of git
+MaskClustering/third_party/Entity/Entityv2/figures/teaser_mosaic_low.png
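The new rules can be sanity-checked with git check-ignore, which prints the file, line number, and pattern matching a given path (the .pyc path below is hypothetical):

    git check-ignore -v MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__pycache__/arch.cpython-38.pyc
    # -> .gitignore:163:MaskClustering/third_party/**/__pycache__/    MaskClustering/third_party/.../arch.cpython-38.pyc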
MaskClustering/third_party/Entity DELETED
@@ -1 +0,0 @@
-Subproject commit 6e7e13ac91ef508088e1b848167c01f19b00b512
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.0
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 10000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
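These YAMLs are detectron2-style configs and are not self-contained: keys such as CONDINST and MASK_HEAD only exist after add_entity_config() extends the default config, which is exactly what setup_cfg() in demo_result_and_vis.py (later in this diff) does. A minimal loading sketch following that same pattern:

    from detectron2.config import get_cfg
    from entityseg import add_entity_config  # exported by entityseg/__init__.py below

    cfg = get_cfg()
    add_entity_config(cfg)  # registers the extra keys (CONDINST, SWINT, ...) these files set
    cfg.merge_from_file("configs/entity_baseline.yaml")
    cfg.freeze()
    assert cfg.MODEL.META_ARCHITECTURE == "EntityFPN"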
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b0"
+  WEIGHTS: "pretrained_model/mit_b0_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b5"
+  WEIGHTS: "pretrained_model/mit_b5_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 80
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 8
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (120000, 160000)
+  MAX_ITER: 180000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml ADDED
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+    DEFORM_MODULATED: True
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml ADDED
@@ -0,0 +1,50 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 96
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 6, 2]
+    NUM_HEADS: [3, 6, 12, 24]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_tiny_patch4_window7_224_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py ADDED
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import multiprocessing as mp
+import os
+import time
+import cv2
+import tqdm
+import numpy as np
+import copy
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+from detectron2.engine import default_setup
+
+from entityseg import *
+
+from predictor import VisualizationDemo
+import pdb
+
+# constants
+WINDOW_NAME = "Image Segmentation"
+
+def make_colors():
+    from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+    colors = []
+    for cate in COCO_CATEGORIES:
+        colors.append(cate["color"])
+    return colors
+
+def mask_to_boundary(mask, dilation_ratio=0.0008):
+    """
+    Convert binary mask to boundary mask.
+    :param mask (numpy array, uint8): binary mask
+    :param dilation_ratio (float): ratio to calculate dilation = dilation_ratio * image_diagonal
+    :return: boundary mask (numpy array)
+    """
+    h, w = mask.shape
+    img_diag = np.sqrt(h ** 2 + w ** 2)
+    dilation = int(round(dilation_ratio * img_diag))
+    if dilation < 1:
+        dilation = 1
+    # Pad image so mask truncated by the image border is also considered as boundary.
+    new_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
+    kernel = np.ones((3, 3), dtype=np.uint8)
+    new_mask_erode = cv2.erode(new_mask, kernel, iterations=dilation)
+    mask_erode = new_mask_erode[1 : h + 1, 1 : w + 1]
+    # G_d intersects G in the paper.
+    return mask - mask_erode
+
+
+def setup_cfg(args):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    add_entity_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    default_setup(cfg, args)
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
+    cfg.freeze()
+    return cfg
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
+    parser.add_argument(
+        "--config-file",
+        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    parser.add_argument(
+        "--input",
+        nargs="+",
+        help="A list of space separated input images; "
+        "or a single glob pattern such as 'directory/*.jpg'",
+    )
+    parser.add_argument(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+
+    parser.add_argument(
+        "--confidence-threshold",
+        type=float,
+        default=0.2,
+        help="Minimum score for instance predictions to be shown",
+    )
+
+    parser.add_argument(
+        "opts",
+        help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. "
+        "See config references at "
+        "https://detectron2.readthedocs.io/modules/config.html#config-references",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    setup_logger(name="fvcore")
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    cfg = setup_cfg(args)
+
+    demo = VisualizationDemo(cfg)
+    colors = make_colors()
+
+    if args.input:
+        if len(args.input) == 1:
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            data = demo.run_on_image_wo_vis(img)
+            logger.info(
+                "{}: {} in {:.2f}s".format(
+                    path,
+                    "detected {} instances".format(len(data[0])),
+                    time.time() - start_time,
+                )
+            )
+
+            if os.path.isdir(args.output):
+                assert os.path.isdir(args.output), args.output
+                out_filename = os.path.join(args.output, os.path.basename(path))
+            else:
+                assert len(args.input) == 1, "Please specify a directory with args.output"
+                out_filename = args.output
+            ## save inference result, [0] original score by detection head, [1] mask rescoring score, [2] mask_id
+            ori_scores = data[0]
+            scores = data[1]
+            mask_id = data[2]
+            np.savez(out_filename.split(".")[0]+".npz", ori_scores=ori_scores, scores=scores, mask_id=mask_id)
+
+            ## save visualization
+            img_for_paste = copy.deepcopy(img)
+            color_mask = copy.deepcopy(img)
+            masks_edge = np.zeros(img.shape[:2], dtype=np.uint8)
+            alpha = 0.4
+            count = 0
+            for index, score in enumerate(scores):
+                if score <= args.confidence_threshold:
+                    break
+                color_mask[mask_id==count] = colors[count]
+                boundary = mask_to_boundary((mask_id==count).astype(np.uint8))
+                masks_edge[boundary>0] = 1
+                count += 1
+            img_wm = cv2.addWeighted(img_for_paste, alpha, color_mask, 1-alpha, 0)
+            img_wm[masks_edge==1] = 0
+            fvis = np.concatenate((img, img_wm))
+            cv2.imwrite(out_filename.split(".")[0]+".jpg",fvis)
+
+
+
+
+
+
+
+
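Given the parser above, a typical invocation looks like the following (config and weight paths are illustrative; MODEL.WEIGHTS is passed through the trailing opts):

    python demo_result_and_vis.py \
        --config-file configs/entity_swin_lw7_1x.yaml \
        --input "images/*.jpg" \
        --output results/ \
        --confidence-threshold 0.3 \
        MODEL.WEIGHTS pretrained_model/entity_swin_lw7_1x.pth

Per the main loop, each input image then yields an .npz holding the detection-head scores, the rescored mask scores, and the flattened mask-id map, plus a .jpg stacking the original image over the colored, edge-outlined overlay.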
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .arch import EntityFPN
+from .data import *
+from .config import add_entity_config
+from .evaluator.entity_evaluation import COCOEvaluator_ClassAgnostic
+from .backbone import build_retinanet_swin_fpn_backbone, build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py ADDED
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+import logging
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from detectron2.structures import ImageList
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
+from detectron2.modeling.proposal_generator import build_proposal_generator
+from detectron2.modeling.roi_heads import build_roi_heads
+from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+
+from .det_head.detection import build_det_head
+from .det_head.utils.comm import aligned_bilinear
+
+from .mask_head.dynamic_mask_head import build_dynamic_mask_head
+from .mask_head.mask_branch import build_mask_branch
+
+from .panopticfcn_tools.panopticfcn_head import build_kernel_head
+
+from detectron2.structures import Instances, Boxes
+import random
+import pdb
+import copy
+logger = logging.getLogger(__name__)
+
+__all__ = ["ItemFPN"]
+@META_ARCH_REGISTRY.register()
+class EntityFPN(nn.Module):
+    """
+    Implement the paper :paper:`PanopticFPN`.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.backbone = build_backbone(cfg)
+        backbone_shape = self.backbone.output_shape()
+        self.det_head = build_det_head(cfg, backbone_shape)
+
+        ## mask
+        self.mask_head = build_dynamic_mask_head(cfg)
+        self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
+        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
+        self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS
+        self.only_class_agnostic = cfg.MODEL.CONDINST.CLASS_AGNOSTIC
+
+        in_channels = self.det_head.in_channels_to_top_module
+
+        self.controller = build_kernel_head(cfg, self.mask_head.num_gen_params)
+        self.train_max_proposals_per_image = cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE
+
+        self.use_mask_rescore_infer = cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE
+
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+
+        self.pixel_mean = pixel_mean
+        self.pixel_std = pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                * "image": Tensor, image in (C, H, W) format.
+                * "instances": Instances
+                * "sem_seg": semantic segmentation ground truth.
+                * Other information that's included in the original dicts, such as:
+                  "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                each dict is the results for one image. The dict contains the following keys:
+
+                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
+                  See the return value of
+                  :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+
+        # for x in batched_inputs:
+        #     print(x["file_name"])
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0] and self.training:
+            B = len(batched_inputs)
+            for i in range(B):
+                if self.only_class_agnostic:
+                    batched_inputs[i]["instances"].gt_classes[:] = 0
+
+                instance_map = batched_inputs[i]["instance_map"]
+                num_instances = int(torch.max(instance_map)+1)
+                instanceid = batched_inputs[i]["instances"].instanceid
+                gt_bitmasks_pad = F.one_hot(instance_map.long(), num_instances)[...,instanceid].permute((2,0,1))
+
+                pad_h, pad_w = images.tensor.size(-2), images.tensor.size(-1)
+                no_pad_h, no_pad_w = gt_bitmasks_pad.shape[1:]
+
+                padding_size = [0, pad_w - no_pad_w, 0, pad_h-no_pad_h]
+                gt_bitmasks_pad = F.pad(gt_bitmasks_pad, padding_size, value=0)
+
+                start = int(self.mask_out_stride // 2)
+                bitmask_full = gt_bitmasks_pad.clone()
+                bitmask = gt_bitmasks_pad[:,start::self.mask_out_stride, start::self.mask_out_stride]
+
+                N = bitmask.shape[0]
+                batched_inputs[i]["instances"].gt_bitmasks = bitmask.int()
+                batched_inputs[i]["instances"].gt_bitmasks_full = bitmask_full.int()
+
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        mask_feats = self.mask_branch(features, gt_instances)
+        proposals, proposal_losses = self.det_head(images, features, gt_instances, self.controller)
+
+        if self.training:
+            max_num_proposals = self.train_max_proposals_per_image * len(batched_inputs)
+            actual_num_proposals = len(proposals["instances"])
+            if actual_num_proposals >= max_num_proposals:
+                select = random.sample(list(range(actual_num_proposals)), max_num_proposals)
+                proposals["instances"] = proposals["instances"][select]
+
+            loss_masks = self._forward_mask_heads_train(proposals, mask_feats, gt_instances)
+            losses = {}
+            losses.update(proposal_losses)
+            losses.update(loss_masks)
+            return losses
+        else:
+            pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats)
+            padded_im_h, padded_im_w = images.tensor.size()[-2:]
+            processed_results = []
+            for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images.image_sizes)):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+
+                instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id]
+                instances_per_im = self.postprocess(
+                    instances_per_im, height, width,
+                    padded_im_h, padded_im_w
+                )
+
+                processed_results.append({
+                    "instances": instances_per_im
+                })
+
+            return processed_results
+
+    def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances):
+        # prepare the inputs for mask heads
+        pred_instances = proposals["instances"]
+
+        if 0 <= self.max_proposals < len(pred_instances):
+            inds = torch.randperm(len(pred_instances), device=mask_feats.device).long()
+            logger.info("clipping proposals from {} to {}".format(
+                len(pred_instances), self.max_proposals
+            ))
+            pred_instances = pred_instances[inds[:self.max_proposals]]
+
+        pred_instances.mask_head_params = pred_instances.top_feats
+
+        loss_masks = self.mask_head(
+            mask_feats, self.mask_branch.out_stride,
+            pred_instances, gt_instances
+        )
+        return loss_masks
+
+    def _forward_mask_heads_test(self, proposals, mask_feats):
+        # prepare the inputs for mask heads
+        for im_id, per_im in enumerate(proposals):
+            per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id
+        pred_instances = Instances.cat(proposals)
+        pred_instances.mask_head_params = pred_instances.top_feat
+
+        pred_instances_w_masks = self.mask_head(mask_feats, self.mask_branch.out_stride, pred_instances)
+
+        return pred_instances_w_masks
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+    def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5):
+        """
+        Resize the output instances.
+        The input images are often resized when entering an object detector.
+        As a result, we often need the outputs of the detector in a different
+        resolution from its inputs.
+        This function will resize the raw outputs of an R-CNN detector
+        to produce outputs according to the desired output resolution.
+        Args:
+            results (Instances): the raw outputs from the detector.
+                `results.image_size` contains the input image resolution the detector sees.
+                This object might be modified in-place.
+            output_height, output_width: the desired output resolution.
+        Returns:
+            Instances: the resized output from the model, based on the output resolution
+        """
+        scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+        resized_im_h, resized_im_w = results.image_size
+        results = Instances((output_height, output_width), **results.get_fields())
+
+        if results.has("pred_boxes"):
+            output_boxes = results.pred_boxes
+        elif results.has("proposal_boxes"):
+            output_boxes = results.proposal_boxes
+
+        output_boxes.scale(scale_x, scale_y)
+        output_boxes.clip(results.image_size)
+        results = results[output_boxes.nonempty()]
+
+        if results.has("pred_global_masks"):
+            mask_h, mask_w = results.pred_global_masks.size()[-2:]
+            factor_h = padded_im_h // mask_h
+            factor_w = padded_im_w // mask_w
+            assert factor_h == factor_w
+            factor = factor_h
+            pred_global_masks = aligned_bilinear(
+                results.pred_global_masks, factor
+            )
+            pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w]
+            pred_global_masks = F.interpolate(
+                pred_global_masks,
+                size=(output_height, output_width),
+                mode="bilinear", align_corners=False
+            )
+            pred_global_masks = pred_global_masks[:, 0, :, :]
+            results.pred_masks = (pred_global_masks > mask_threshold).float()
+            results.pred_masks_score = pred_global_masks
+
+        # from high score to low score
+        origin_masks = results.pred_masks
+        num_instances, H, W = origin_masks.shape
+        filter_masks = []
+
+        # initialize background
+        mask_0 = torch.zeros((H, W)).cuda() + 0.001
+        filter_masks.insert(0, mask_0)
+        score = 0.002
+        for index in range(num_instances):
+            mask = origin_masks[num_instances-index-1]
+            mask[mask==1] = score
+            filter_masks.insert(0, mask)
+            score = score + 0.001
+
+        filter_masks = torch.stack(filter_masks, dim=0)
+        _, instance_ids = torch.max(filter_masks, dim=0)
+        unique_instance_ids = torch.unique(instance_ids)
+
+        ori_scores = results.scores.clone()
+        has_mask_valid = []
+        for instance_id in unique_instance_ids:
+            if instance_id == num_instances:
+                continue
+            mask = (instance_ids==instance_id).float()
+            finds_y, finds_x = torch.nonzero(mask==1, as_tuple=True)
+            if len(finds_y) == 0:
+                continue
+            x1 = torch.min(finds_x)
+            x2 = torch.max(finds_x)
+            y1 = torch.min(finds_y)
+            y2 = torch.max(finds_y)
+
+            if x2-x1==0 or y2-y1==0:
+                continue
+            has_mask_valid.append(int(instance_id))
+
+            ## mask rescoring would obtain higher performance
+            if self.use_mask_rescore_infer:
+                mask_score = results.pred_masks_score[instance_id]
+                seg_scores = (mask_score * mask).sum() / mask.sum()
+                results.scores[instance_id] = results.scores[instance_id] * seg_scores
+
+            results.pred_masks[instance_id] = mask
+            results.pred_boxes.tensor[instance_id][0] = x1
+            results.pred_boxes.tensor[instance_id][1] = y1
+            results.pred_boxes.tensor[instance_id][2] = x2
+            results.pred_boxes.tensor[instance_id][3] = y2
+
+        results.ori_scores = ori_scores
+        results = results[has_mask_valid]
+        return results
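The tail of postprocess above is worth flagging: it turns possibly-overlapping per-instance masks into a single non-overlapping assignment by stacking score-coded mask planes plus a low-valued background plane and taking a pixelwise argmax, so each pixel belongs to at most one instance. A minimal standalone sketch of the same idea (the function name and integer priority coding are ours, not from the repo):

    import torch

    def flatten_masks(masks: torch.Tensor) -> torch.Tensor:
        # masks: (N, H, W) binary masks, sorted from highest to lowest score.
        n, h, w = masks.shape
        # Re-code each mask with a rank-based priority so higher-scoring
        # instances win overlaps, mirroring the 0.001-spaced score planes
        # in postprocess(); 0.5 plays the role of the background plane.
        priorities = torch.arange(n, 0, -1, dtype=torch.float32).view(n, 1, 1)
        stack = torch.cat([masks.float() * priorities,
                           torch.full((1, h, w), 0.5)], dim=0)
        # argmax assigns every pixel one id; id == n means background.
        return stack.argmax(dim=0)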
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .swin import build_retinanet_swin_fpn_backbone
+from .mixvision import build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py
ADDED
|
@@ -0,0 +1,464 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from functools import partial
|
| 5 |
+
|
| 6 |
+
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
|
| 7 |
+
from timm.models.registry import register_model
|
| 8 |
+
from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
|
| 9 |
+
import math
|
| 10 |
+
|
| 11 |
+
from detectron2.layers import ShapeSpec
|
| 12 |
+
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
|
| 13 |
+
|
| 14 |
+
class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        B, N, C = x.shape
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        else:
            kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).contiguous().reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

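
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# What sr_ratio buys in the Attention class above: with sr_ratio=8 the
# key/value grid is convolved down 8x per side, so the attention matrix
# shrinks by 64x while the query (and output) length stays H*W. The values
# dim=64, num_heads=1 and the 32x32 map are arbitrary assumptions for the demo.
def _demo_sr_attention():
    attn = Attention(dim=64, num_heads=1, sr_ratio=8)
    x = torch.rand(1, 32 * 32, 64)          # (B, N, C) with N = H*W = 1024
    out = attn(x, 32, 32)                   # K/V internally reduced to 4*4 = 16 tokens
    assert out.shape == (1, 32 * 32, 64)    # output keeps the full token length
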
class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))

        return x

class OverlapPatchEmbed(nn.Module):
    """ Image to Patch Embedding
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
                              padding=(patch_size[0] // 2, patch_size[1] // 2))
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)

        return x, H, W

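
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# "Overlap" comes from kernel > stride: a 7x7 kernel with stride 4 and
# padding 3 tiles the image on a stride-4 grid while neighboring patches
# share pixels. The sizes below are demo assumptions.
def _demo_overlap_patch_embed():
    pe = OverlapPatchEmbed(img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=32)
    x, H, W = pe(torch.rand(1, 3, 224, 224))
    assert (H, W) == (56, 56)               # (224 + 2*3 - 7) // 4 + 1
    assert x.shape == (1, 56 * 56, 32)      # flattened token sequence
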
class MixVisionTransformer(Backbone):
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
                                              embed_dim=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
                                              embed_dim=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
                                              embed_dim=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
                                              embed_dim=embed_dims[3])

        # transformer encoder
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0
        self.block1 = nn.ModuleList([Block(
            dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[0])
            for i in range(depths[0])])
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = nn.ModuleList([Block(
            dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[1])
            for i in range(depths[1])])
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = nn.ModuleList([Block(
            dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[2])
            for i in range(depths[2])])
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = nn.ModuleList([Block(
            dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[3])
            for i in range(depths[3])])
        self.norm4 = norm_layer(embed_dims[3])

        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

        # freeze
        for p in self.patch_embed1.parameters():
            p.requires_grad = False
        for p in self.block1.parameters():
            p.requires_grad = False
        for p in self.norm1.parameters():
            p.requires_grad = False

        outs = self.forward(torch.rand(1, 3, 224, 224).float())
        self.output_shapes = dict()
        self._size_divisibility = 0
        for i, f in enumerate(outs):
            self.output_shapes[f] = ShapeSpec(
                channels=outs[f].shape[1], stride=224 // outs[f].shape[2]
            )
            if i == (len(outs) - 1):
                self._size_divisibility = 224 // outs[f].shape[2]

        self.train()

    def output_shape(self):
        return self.output_shapes

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def reset_drop_path(self, drop_path_rate):
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for i in range(self.depths[0]):
            self.block1[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[0]
        for i in range(self.depths[1]):
            self.block2[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[1]
        for i in range(self.depths[2]):
            self.block3[i].drop_path.drop_prob = dpr[cur + i]

        cur += self.depths[2]
        for i in range(self.depths[3]):
            self.block4[i].drop_path.drop_prob = dpr[cur + i]

    def freeze_patch_emb(self):
        self.patch_embed1.requires_grad = False

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        self.num_classes = num_classes
        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        B = x.shape[0]
        outs = dict()

        # stage 1
        x, H, W = self.patch_embed1(x)
        for i, blk in enumerate(self.block1):
            x = blk(x, H, W)
        x = self.norm1(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit1"] = x

        # stage 2
        x, H, W = self.patch_embed2(x)
        for i, blk in enumerate(self.block2):
            x = blk(x, H, W)
        x = self.norm2(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit2"] = x

        # stage 3
        x, H, W = self.patch_embed3(x)
        for i, blk in enumerate(self.block3):
            x = blk(x, H, W)
        x = self.norm3(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit3"] = x

        # stage 4
        x, H, W = self.patch_embed4(x)
        for i, blk in enumerate(self.block4):
            x = blk(x, H, W)
        x = self.norm4(x)
        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs["mit4"] = x

        return outs

    def forward(self, x):
        x = self.forward_features(x)
        # x = self.head(x)

        return x

class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).contiguous().view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)

        return x

class mit_b0(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b0, self).__init__(
            patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b1(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b1, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b2(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b2, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b3(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b3, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b4(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b4, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)


class mit_b5(MixVisionTransformer):
    def __init__(self, **kwargs):
        super(mit_b5, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1)

@BACKBONE_REGISTRY.register()
def build_mit_backbone(cfg, input_shape):
    if cfg.MODEL.MIT_BACKBONE.NAME == "b0":
        return mit_b0()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b1":
        return mit_b1()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b2":
        return mit_b2()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b3":
        return mit_b3()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b4":
        return mit_b4()
    elif cfg.MODEL.MIT_BACKBONE.NAME == "b5":
        return mit_b5()
    # fail loudly instead of silently returning None on an unknown variant
    raise ValueError(f"Unknown MODEL.MIT_BACKBONE.NAME: {cfg.MODEL.MIT_BACKBONE.NAME}")

@BACKBONE_REGISTRY.register()
def build_retinanet_mit_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_mit_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    in_channels_top = out_channels
    top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
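
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# A minimal sanity check of the MiT backbone above. mit_b0() runs a dummy
# 224x224 forward pass in __init__, so construction alone populates
# output_shape(); the channel widths asserted here come from the
# embed_dims=[32, 64, 160, 256] in the mit_b0 definition.
def _demo_mit_backbone():
    m = mit_b0()
    feats = m(torch.rand(1, 3, 224, 224))
    # four pyramid levels at strides 4 / 8 / 16 / 32
    assert [feats[k].shape[1] for k in ("mit1", "mit2", "mit3", "mit4")] == [32, 64, 160, 256]
    assert m.output_shape()["mit4"].stride == 32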
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py
ADDED

@@ -0,0 +1,723 @@
# --------------------------------------------------------
# Swin Transformer
# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
# --------------------------------------------------------

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
import numpy as np
import fvcore.nn.weight_init as weight_init  # needed by LastLevelP6 below; missing from the original imports
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from detectron2.modeling.backbone import Backbone
from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
from detectron2.layers import ShapeSpec

class Mlp(nn.Module):
    """ Multilayer perceptron."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x

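
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# window_partition / window_reverse are exact inverses when H and W are
# multiples of the window size; they only view/permute, so the roundtrip is
# bit-exact. The sizes below are arbitrary assumptions for the demo.
def _demo_window_roundtrip():
    x = torch.rand(2, 14, 14, 32)       # (B, H, W, C)
    wins = window_partition(x, 7)       # 2 images * 2*2 windows each
    assert wins.shape == (8, 7, 7, 32)
    assert torch.equal(window_reverse(wins, 7, 14, 14), x)
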
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both shifted and non-shifted windows.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

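
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# The bias table above has one row per possible (dy, dx) offset between two
# tokens in a window, i.e. (2*Wh-1)*(2*Ww-1) rows and one column per head;
# relative_position_index maps every token pair into that table.
def _demo_relative_bias_table():
    attn = WindowAttention(dim=32, window_size=to_2tuple(7), num_heads=4)
    assert attn.relative_position_bias_table.shape == (13 * 13, 4)  # (2*7-1)**2 = 169 offsets
    assert attn.relative_position_index.shape == (49, 49)           # 7*7 tokens, pairwise
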
class SwinTransformerBlock(nn.Module):
    """ Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            mask_matrix: Attention mask for cyclic shift.
        The spatial resolution H, W is read from self.H / self.W, which the
        enclosing BasicLayer sets before calling each block.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            x = shifted_x

        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :].contiguous()

        x = x.view(B, H * W, C)

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x

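
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# The shifted-window path relies on torch.roll being lossless: rolling by
# -shift and then +shift restores the tensor exactly, so SW-MSA pays only
# for the attention mask, never for any resampling.
def _demo_cyclic_shift():
    x = torch.arange(16.).view(1, 4, 4, 1)
    shifted = torch.roll(x, shifts=(-2, -2), dims=(1, 2))
    assert torch.equal(torch.roll(shifted, shifts=(2, 2), dims=(1, 2)), x)
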
class PatchMerging(nn.Module):
    """ Patch Merging Layer
    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.view(B, H, W, C)

        # padding
        pad_input = (H % 2 == 1) or (W % 2 == 1)
        if pad_input:
            x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.norm(x)
        x = self.reduction(x)

        return x

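
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# PatchMerging concatenates each 2x2 neighborhood (4*C channels) and linearly
# projects it down to 2*C, halving the spatial grid between stages.
def _demo_patch_merging():
    merge = PatchMerging(dim=32)
    y = merge(torch.rand(1, 8 * 8, 32), 8, 8)
    assert y.shape == (1, 4 * 4, 64)    # spatial /2 per axis, channels x2
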
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.
    Args:
        dim (int): Number of feature channels
        depth (int): Depth of this stage.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """

        # calculate attention mask for SW-MSA
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))

        for blk in self.blocks:
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W

class PatchEmbed(nn.Module):
    """ Image to Patch Embedding
    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Forward function."""
        # padding
        _, _, H, W = x.size()
        if W % self.patch_size[1] != 0:
            x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
        if H % self.patch_size[0] != 0:
            x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is not None:
            Wh, Ww = x.size(2), x.size(3)
            x = x.flatten(2).transpose(1, 2)
            x = self.norm(x)
            x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)

        return x

class SwinTransformer(Backbone):
    """ Swin Transformer backbone.
    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
    https://arxiv.org/pdf/2103.14030
    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention heads of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_features (Sequence[str]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 frozen_stages=-1,
                 use_checkpoint=False,
                 out_features=None):
        super(SwinTransformer, self).__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.frozen_stages = frozen_stages

        self.out_features = out_features

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        self._out_feature_strides = {}
        self._out_feature_channels = {}

        # build layers
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

            stage = f'stage{i_layer+2}'
            if stage in self.out_features:
                self._out_feature_channels[stage] = embed_dim * 2 ** i_layer
                self._out_feature_strides[stage] = 4 * 2 ** i_layer

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in range(self.num_layers):
            stage = f'stage{i_layer+2}'
            if stage in self.out_features:
                layer = norm_layer(num_features[i_layer])
                layer_name = f'norm{i_layer}'
                self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def init_weights(self, pretrained=None):
        """Initialize the weights in backbone.
        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """

        def _init_weights(m):
            if isinstance(m, nn.Linear):
                trunc_normal_(m.weight, std=.02)
                if isinstance(m, nn.Linear) and m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.LayerNorm):
                nn.init.constant_(m.bias, 0)
                nn.init.constant_(m.weight, 1.0)

        self.apply(_init_weights)

    def forward(self, x):
        """Forward function."""
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose(1, 2)  # B Wh*Ww C
        else:
            x = x.flatten(2).transpose(1, 2)
        x = self.pos_drop(x)

        outs = {}
        for i in range(self.num_layers):
            layer = self.layers[i]
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            name = f'stage{i+2}'
            if name in self.out_features:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs[name] = out

        return outs  # {"stage%d" % (i+2,): out for i, out in enumerate(outs)}  # tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping frozen stages frozen."""
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()

    def output_shape(self):
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self.out_features
        }

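
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# Builds a Swin-T sized backbone directly with the defaults above and checks
# the detectron2-style multi-scale output dict. At 224x224 every stage grid
# is a multiple of the window size, so no padding path is exercised.
def _demo_swin_backbone():
    swin = SwinTransformer(out_features=["stage2", "stage3", "stage4", "stage5"])
    feats = swin(torch.rand(1, 3, 224, 224))
    assert feats["stage2"].shape == (1, 96, 56, 56)    # stride 4, embed_dim 96
    assert feats["stage5"].shape == (1, 768, 7, 7)     # stride 32, 8 * embed_dim
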
@BACKBONE_REGISTRY.register()
def build_swin_backbone(cfg, input_shape):
    """
    Create a SwinTransformer instance from config.
    Returns:
        SwinTransformer: a :class:`SwinTransformer` instance.
    """
    out_features = cfg.MODEL.SWINT.OUT_FEATURES

    return SwinTransformer(
        patch_size=cfg.MODEL.SWINT.PATCH_SIZE,
        in_chans=input_shape.channels,
        embed_dim=cfg.MODEL.SWINT.EMBED_DIM,
        depths=cfg.MODEL.SWINT.DEPTHS,
        num_heads=cfg.MODEL.SWINT.NUM_HEADS,
        window_size=cfg.MODEL.SWINT.WINDOW_SIZE,
        mlp_ratio=cfg.MODEL.SWINT.MLP_RATIO,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.,
        attn_drop_rate=0.,
        drop_path_rate=cfg.MODEL.SWINT.DROP_PATH_RATE,
        norm_layer=nn.LayerNorm,
        ape=cfg.MODEL.SWINT.APE,
        patch_norm=True,
        frozen_stages=cfg.MODEL.BACKBONE.FREEZE_AT,
        out_features=out_features
    )

@BACKBONE_REGISTRY.register()
def build_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_swin_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=LastLevelMaxPool(),
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone

class LastLevelP6(nn.Module):
    """
    This module is used in FCOS to generate extra layers
    """

    def __init__(self, in_channels, out_channels, in_features="res5"):
        super().__init__()
        self.num_levels = 1
        self.in_feature = in_features
        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
        for module in [self.p6]:
            weight_init.c2_xavier_fill(module)

    def forward(self, x):
        p6 = self.p6(x)
        return [p6]

@BACKBONE_REGISTRY.register()
def build_retinanet_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
    """
    Args:
        cfg: a detectron2 CfgNode
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    bottom_up = build_swin_backbone(cfg, input_shape)
    in_features = cfg.MODEL.FPN.IN_FEATURES
    out_channels = cfg.MODEL.FPN.OUT_CHANNELS
    top_levels = cfg.MODEL.FPN.TOP_LEVELS
    in_channels_top = out_channels
    if top_levels == 2:
        top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
    elif top_levels == 1:  # was a bare `if`, leaving top_block unbound for unexpected values
        top_block = LastLevelP6(in_channels_top, out_channels, "p5")
    elif top_levels == 0:
        top_block = None
    else:
        raise ValueError(f"Unsupported MODEL.FPN.TOP_LEVELS: {top_levels}")
    backbone = FPN(
        bottom_up=bottom_up,
        in_features=in_features,
        out_channels=out_channels,
        norm=cfg.MODEL.FPN.NORM,
        top_block=top_block,
        fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
    )
    return backbone
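
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# One plausible way to wire these registered builders up through detectron2's
# config system; add_entity_config lives in entityseg/config.py (next file),
# and the stage names match the cfg.MODEL.SWINT.OUT_FEATURES defined there.
# The import path assumes the vendored layout.
def _demo_build_swin_fpn():
    from detectron2.config import get_cfg
    from detectron2.modeling import build_backbone
    from entityseg.config import add_entity_config  # assumed import path

    cfg = get_cfg()
    add_entity_config(cfg)
    cfg.MODEL.BACKBONE.NAME = "build_swin_fpn_backbone"
    cfg.MODEL.FPN.IN_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
    backbone = build_backbone(cfg)      # FPN wrapped around the SwinTransformer
    return backbone.output_shape()      # p2..p6 ShapeSpec dict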
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py
ADDED

@@ -0,0 +1,102 @@
from detectron2.config import CfgNode as CN

def add_entity_config(cfg):
    """
    Add config for EntitySeg.
    """
    ## FCOS Hyper-Parameters
    cfg.MODEL.FCOS = CN()

    # Anchor parameters
    cfg.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
    cfg.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
    cfg.MODEL.FCOS.NUM_CLASSES = 1
    cfg.MODEL.FCOS.SIZES_OF_INTEREST = [[-1, 64], [64, 128], [128, 256], [256, 512], [512, 100000000]]

    # tower
    cfg.MODEL.FCOS.NUM_CLS_CONVS = 4
    cfg.MODEL.FCOS.NUM_BOX_CONVS = 4
    cfg.MODEL.FCOS.NUM_SHARE_CONVS = 0
    cfg.MODEL.FCOS.CENTER_SAMPLE = True
    cfg.MODEL.FCOS.POS_RADIUS = 1.5
    cfg.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
    cfg.MODEL.FCOS.USE_RELU = True
    cfg.MODEL.FCOS.USE_DEFORMABLE = False
    cfg.MODEL.FCOS.USE_SCALE = True
    cfg.MODEL.FCOS.TOP_LEVELS = 2
    cfg.MODEL.FCOS.NORM = "GN"

    # loss
    cfg.MODEL.FCOS.PRIOR_PROB = 0.01
    cfg.MODEL.FCOS.LOSS_ALPHA = 0.25
    cfg.MODEL.FCOS.LOSS_GAMMA = 2.0
    cfg.MODEL.FCOS.FB_RATIO = 4.0
    cfg.MODEL.FCOS.CENTER_SAMPLE = True
    cfg.MODEL.FCOS.YIELD_PROPOSAL = False

    # inference
    cfg.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
    cfg.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
    cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
    cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
    cfg.MODEL.FCOS.NMS_TH = 0.6
    cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
    cfg.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
    cfg.MODEL.FCOS.THRESH_WITH_CTR = False

    ## CONDINST Hyper-Parameters
    cfg.MODEL.CONDINST = CN()
    # the downsampling ratio of the final instance masks to the input image
    cfg.MODEL.CONDINST.MASK_OUT_STRIDE = 4
    cfg.MODEL.CONDINST.MAX_PROPOSALS = 500
    cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE = 120
    cfg.MODEL.CONDINST.LOW_LEVEL_DIMENSION = 16
    cfg.MODEL.CONDINST.CLASS_AGNOSTIC = False

    cfg.MODEL.CONDINST.MASK_HEAD = CN()
    cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS = 8
    cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS = 3
    cfg.MODEL.CONDINST.MASK_HEAD.USE_FP16 = False
    cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS = False
    cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT = 1.0
    cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC = ["111", "110"]
    cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT = [1.0, 1.0]

    cfg.MODEL.CONDINST.MASK_BRANCH = CN()
    cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS = 8
    cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES = ["p3", "p4", "p5"]
    cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS = 128
    cfg.MODEL.CONDINST.MASK_BRANCH.NORM = "BN"
    cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS = 4
    cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON = False
    cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE = False

    ## kernel head
    cfg.MODEL.KERNEL_HEAD = CN()
    cfg.MODEL.KERNEL_HEAD.NUM_CONVS = 3
    cfg.MODEL.KERNEL_HEAD.DEFORM = False
    cfg.MODEL.KERNEL_HEAD.COORD = True
    cfg.MODEL.KERNEL_HEAD.CONVS_DIM = 256
    cfg.MODEL.KERNEL_HEAD.NORM = "GN"

    ## swin transformer backbone
    cfg.MODEL.SWINT = CN()
    cfg.MODEL.SWINT.EMBED_DIM = 96
    cfg.MODEL.SWINT.PATCH_SIZE = 4
    cfg.MODEL.SWINT.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
    cfg.MODEL.SWINT.DEPTHS = [2, 2, 6, 2]
    cfg.MODEL.SWINT.NUM_HEADS = [3, 6, 12, 24]
    cfg.MODEL.SWINT.WINDOW_SIZE = 7
    cfg.MODEL.SWINT.MLP_RATIO = 4
    cfg.MODEL.SWINT.DROP_PATH_RATE = 0.2
    cfg.MODEL.SWINT.APE = False

    # addition: extra FPN top levels
    cfg.MODEL.FPN.TOP_LEVELS = 2

    ## MiT (mix transformer) backbone
    cfg.MODEL.MIT_BACKBONE = CN()
    cfg.MODEL.MIT_BACKBONE.NAME = "b0"

    cfg.SOLVER.OPTIMIZER = "sgd"
    cfg.TEST.CLASS_AGNOSTIC = True
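
# --- usage sketch (editor's illustration, not part of the vendored file) ---
# add_entity_config only appends keys to detectron2's default config tree,
# so the usual CfgNode workflow still applies afterwards.
def _demo_entity_config():
    from detectron2.config import get_cfg
    cfg = get_cfg()
    add_entity_config(cfg)
    assert cfg.MODEL.FCOS.NUM_CLASSES == 1                     # entity segmentation is class-agnostic
    cfg.merge_from_list(["MODEL.MIT_BACKBONE.NAME", "b2"])     # overrides work as usual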
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py
ADDED

File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py
ADDED
@@ -0,0 +1,112 @@
import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.structures import ImageList
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.backbone import build_backbone
from detectron2.layers import ShapeSpec
from detectron2.modeling.postprocessing import detector_postprocess

from .layers import DFConv2d, IOULoss
# from .outputs_has_ignore import FCOSOutputs
from .outputs import FCOSOutputs
from .tower import FCOSHead

import pdb
import cv2

INF = 100000000

class FCOS(nn.Module):
    def __init__(self, cfg, backbone_shape):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)
        self.in_features = cfg.MODEL.FCOS.IN_FEATURES
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
        self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL

        feature_shapes = [backbone_shape[f] for f in self.in_features]
        self.fcos_head = FCOSHead(cfg, feature_shapes)
        self.in_channels_to_top_module = self.fcos_head.in_channels_to_top_module
        self.fcos_outputs = FCOSOutputs(cfg)
        self.to(self.device)

    def forward_head(self, features, top_module=None):
        features = [features[f] for f in self.in_features]
        # FCOSHead.forward (tower.py) only accepts (x, top_module), so yield_proposal is not passed through.
        pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats = self.fcos_head(features, top_module)
        return pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats

    def forward(self, images, backbone_features, gt_instances, top_module=None):
        """
        Arguments:
            images (list[Tensor] or ImageList): images to be processed
            targets (list[BoxList]): ground-truth boxes present in the image (optional)
        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the losses.
                During testing, it returns list[BoxList] contains additional fields
                like `scores`, `labels` and `mask` (for Mask R-CNN models).
        """
        features = [backbone_features[f] for f in self.in_features]
        locations = self.compute_locations(features)
        logits_pred, reg_pred, ctrness_pred, bbox_towers, top_feats = self.fcos_head(features, top_module)

        results = {}
        if self.yield_proposal:
            results["features"] = {
                f: b for f, b in zip(self.in_features, bbox_towers)
            }

        if self.training:
            results, losses = self.fcos_outputs.losses(
                logits_pred, reg_pred, ctrness_pred,
                locations, gt_instances, top_feats
            )

            if self.yield_proposal:
                with torch.no_grad():
                    results["proposals"] = self.fcos_outputs.predict_proposals(
                        logits_pred, reg_pred, ctrness_pred,
                        locations, images.image_sizes, top_feats
                    )
            return results, losses
        else:
            results = self.fcos_outputs.predict_proposals(
                logits_pred, reg_pred, ctrness_pred,
                locations, images.image_sizes, top_feats
            )

            return results, {}

    def compute_locations(self, features):
        locations = []
        for level, feature in enumerate(features):
            h, w = feature.size()[-2:]
            locations_per_level = self.compute_locations_per_level(
                h, w, self.fpn_strides[level],
                feature.device
            )
            locations.append(locations_per_level)
        return locations

    def compute_locations_per_level(self, h, w, stride, device):
        shifts_x = torch.arange(
            0, w * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shifts_y = torch.arange(
            0, h * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
        return locations

def build_det_head(cfg, backbone_shape):
    return FCOS(cfg, backbone_shape)
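The location grid built by compute_locations_per_level is the core of FCOS's anchor-free design: each feature-map cell maps back to the input-image coordinate at its center. A standalone sketch of the same arithmetic for a 2x3 map at stride 8 (torch only, no assumptions beyond the code above):

import torch

h, w, stride = 2, 3, 8
shifts_x = torch.arange(0, w * stride, step=stride, dtype=torch.float32)
shifts_y = torch.arange(0, h * stride, step=stride, dtype=torch.float32)
shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
locations = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1) + stride // 2
print(locations)
# tensor([[ 4.,  4.],
#         [12.,  4.],
#         [20.,  4.],
#         [ 4., 12.],
#         [12., 12.],
#         [20., 12.]])
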
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .deform_conv import DFConv2d
from .iou_loss import IOULoss
from .ml_nms import ml_nms
from .conv_with_kaiming_uniform import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py
ADDED
@@ -0,0 +1,52 @@
from torch import nn

from detectron2.layers import Conv2d
from .deform_conv import DFConv2d
from detectron2.layers.batch_norm import get_norm


def conv_with_kaiming_uniform(
        norm=None, activation=None,
        use_deformable=False, use_sep=False):
    def make_conv(
        in_channels, out_channels, kernel_size, stride=1, dilation=1
    ):
        if use_deformable:
            conv_func = DFConv2d
        else:
            conv_func = Conv2d
        if use_sep:
            assert in_channels == out_channels
            groups = in_channels
        else:
            groups = 1
        conv = conv_func(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=dilation * (kernel_size - 1) // 2,
            dilation=dilation,
            groups=groups,
            bias=(norm is None)
        )
        if not use_deformable:
            # Caffe2 implementation uses XavierFill, which in fact
            # corresponds to kaiming_uniform_ in PyTorch
            nn.init.kaiming_uniform_(conv.weight, a=1)
            if norm is None:
                nn.init.constant_(conv.bias, 0)
        module = [conv,]
        if norm is not None and len(norm) > 0:
            if norm == "GN":
                norm_module = nn.GroupNorm(32, out_channels)
            else:
                norm_module = get_norm(norm, out_channels)
            module.append(norm_module)
        if activation is not None:
            module.append(nn.ReLU(inplace=True))
        if len(module) > 1:
            return nn.Sequential(*module)
        return conv

    return make_conv
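A usage sketch for the factory above (requires torch and detectron2 at runtime): norm="GN" attaches a 32-group GroupNorm and any non-None activation appends a ReLU, so the returned callable yields a Conv-GN-ReLU block.

import torch

conv_block = conv_with_kaiming_uniform(norm="GN", activation=True)
layer = conv_block(128, 128, kernel_size=3)  # nn.Sequential(Conv2d, GroupNorm, ReLU)
out = layer(torch.randn(1, 128, 32, 32))
print(out.shape)  # torch.Size([1, 128, 32, 32]); padding=1 preserves the spatial size
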
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py
ADDED
@@ -0,0 +1,111 @@
import torch
from torch import nn

from detectron2.layers import Conv2d

class _NewEmptyTensorOp(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, new_shape):
        ctx.shape = x.shape
        return x.new_empty(new_shape)

    @staticmethod
    def backward(ctx, grad):
        shape = ctx.shape
        return _NewEmptyTensorOp.apply(grad, shape), None


class DFConv2d(nn.Module):
    """Deformable convolutional layer"""
    def __init__(
        self,
        in_channels,
        out_channels,
        with_modulated_dcn=True,
        kernel_size=3,
        stride=1,
        groups=1,
        dilation=1,
        deformable_groups=1,
        bias=False,
        padding=None
    ):
        super(DFConv2d, self).__init__()
        if isinstance(kernel_size, (list, tuple)):
            assert isinstance(stride, (list, tuple))
            assert isinstance(dilation, (list, tuple))
            assert len(kernel_size) == 2
            assert len(stride) == 2
            assert len(dilation) == 2
            padding = (
                dilation[0] * (kernel_size[0] - 1) // 2,
                dilation[1] * (kernel_size[1] - 1) // 2
            )
            offset_base_channels = kernel_size[0] * kernel_size[1]
        else:
            padding = dilation * (kernel_size - 1) // 2
            offset_base_channels = kernel_size * kernel_size
        if with_modulated_dcn:
            # ModulatedDeformConv is provided by detectron2.layers; a relative
            # import from this module itself would not resolve.
            from detectron2.layers import ModulatedDeformConv
            offset_channels = offset_base_channels * 3  # default: 27
            conv_block = ModulatedDeformConv
        else:
            from detectron2.layers import DeformConv
            offset_channels = offset_base_channels * 2  # default: 18
            conv_block = DeformConv
        self.offset = Conv2d(
            in_channels,
            deformable_groups * offset_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=1,
            dilation=dilation
        )
        for l in [self.offset, ]:
            nn.init.kaiming_uniform_(l.weight, a=1)
            torch.nn.init.constant_(l.bias, 0.)
        self.conv = conv_block(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            deformable_groups=deformable_groups,
            bias=bias
        )
        self.with_modulated_dcn = with_modulated_dcn
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.offset_split = offset_base_channels * deformable_groups * 2

    def forward(self, x, return_offset=False):
        if x.numel() > 0:
            if not self.with_modulated_dcn:
                offset_mask = self.offset(x)
                x = self.conv(x, offset_mask)
            else:
                offset_mask = self.offset(x)
                offset = offset_mask[:, :self.offset_split, :, :]
                mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
                x = self.conv(x, offset, mask)
            if return_offset:
                return x, offset_mask
            return x
        # get output shape
        output_shape = [
            (i + 2 * p - (di * (k - 1) + 1)) // d + 1
            for i, p, di, k, d in zip(
                x.shape[-2:],
                self.padding,
                self.dilation,
                self.kernel_size,
                self.stride
            )
        ]
        output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
        return _NewEmptyTensorOp.apply(x, output_shape)
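The empty-input branch above computes the standard convolution output size, floor((i + 2p - (d*(k - 1) + 1)) / s) + 1 per spatial dim, without touching the deformable kernels. A quick check of that arithmetic in plain Python:

def conv_out_size(i, p, d, k, s):
    # floor((input + 2*padding - (dilation*(kernel-1)+1)) / stride) + 1
    return (i + 2 * p - (d * (k - 1) + 1)) // s + 1

print(conv_out_size(32, 1, 1, 3, 1))  # 32: a padded 3x3 conv at stride 1 preserves size
print(conv_out_size(32, 1, 1, 3, 2))  # 16: stride 2 halves it
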
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py
ADDED
@@ -0,0 +1,54 @@
import torch
from torch import nn


class IOULoss(nn.Module):
    def __init__(self, loc_loss_type='iou'):
        super(IOULoss, self).__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None):
        pred_left = pred[:, 0]
        pred_top = pred[:, 1]
        pred_right = pred[:, 2]
        pred_bottom = pred[:, 3]

        target_left = target[:, 0]
        target_top = target[:, 1]
        target_right = target[:, 2]
        target_bottom = target[:, 3]

        target_area = (target_left + target_right) * \
                      (target_top + target_bottom)
        pred_area = (pred_left + pred_right) * \
                    (pred_top + pred_bottom)

        w_intersect = torch.min(pred_left, target_left) + \
                      torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
                      torch.min(pred_top, target_top)

        g_w_intersect = torch.max(pred_left, target_left) + \
                        torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
                        torch.max(pred_top, target_top)
        ac_union = g_w_intersect * g_h_intersect

        area_intersect = w_intersect * h_intersect
        area_union = target_area + pred_area - area_intersect

        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_union - area_union) / ac_union
        if self.loc_loss_type == 'iou':
            losses = -torch.log(ious)
        elif self.loc_loss_type == 'linear_iou':
            losses = 1 - ious
        elif self.loc_loss_type == 'giou':
            losses = 1 - gious
        else:
            raise NotImplementedError

        if weight is not None:
            return (losses * weight).sum()
        else:
            return losses.sum()
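Note that pred and target here are FCOS-style (left, top, right, bottom) distances from a shared location, not corner coordinates, which is why intersection widths are sums such as min(pred_left, target_left) + min(pred_right, target_right). A toy check with one location and the IOULoss class above in scope:

import torch

loss_fn = IOULoss(loc_loss_type='giou')
target = torch.tensor([[4.0, 4.0, 4.0, 4.0]])  # an 8x8 box centered on the location
pred = torch.tensor([[2.0, 4.0, 4.0, 4.0]])    # left edge predicted 2px too tight
print(loss_fn(pred, target))  # tensor(0.2462): smoothed IoU (48+1)/(64+1), zero GIoU penalty
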
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py
ADDED
@@ -0,0 +1,26 @@
from detectron2.layers import batched_nms


def ml_nms(boxlist, nms_thresh, max_proposals=-1,
           score_field="scores", label_field="labels"):
    """
    Performs non-maximum suppression on a boxlist, with scores specified
    in a boxlist field via score_field.

    Args:
        boxlist (detectron2.structures.Instances):
        nms_thresh (float):
        max_proposals (int): if > 0, then only the top max_proposals are kept
            after non-maximum suppression
        score_field (str):
    """
    if nms_thresh <= 0:
        return boxlist
    boxes = boxlist.pred_boxes.tensor
    scores = boxlist.scores
    labels = boxlist.pred_classes
    keep = batched_nms(boxes, scores, labels, nms_thresh)
    if max_proposals > 0:
        keep = keep[: max_proposals]
    boxlist = boxlist[keep]
    return boxlist
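A usage sketch with ml_nms above in scope (requires detectron2): two same-class boxes with IoU around 0.82 against a 0.6 threshold, so only the higher-scoring one survives.

import torch
from detectron2.structures import Instances, Boxes

inst = Instances((100, 100))
inst.pred_boxes = Boxes(torch.tensor([[10., 10., 50., 50.],
                                      [12., 12., 52., 52.]]))
inst.scores = torch.tensor([0.9, 0.8])
inst.pred_classes = torch.tensor([0, 0])
kept = ml_nms(inst, nms_thresh=0.6)
print(len(kept), kept.scores)  # 1 tensor([0.9000])
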
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py
ADDED
@@ -0,0 +1,489 @@
import logging
import torch
from torch import nn
import torch.nn.functional as F

from detectron2.layers import cat
from detectron2.structures import Instances, Boxes
from detectron2.utils.comm import get_world_size
from fvcore.nn import sigmoid_focal_loss_jit

from .utils import reduce_sum
from .layers import ml_nms, IOULoss
import pdb

logger = logging.getLogger(__name__)

INF = 100000000

def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
              (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(ctrness)

class FCOSOutputs(nn.Module):
    def __init__(self, cfg):
        super(FCOSOutputs, self).__init__()

        self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
        self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA
        self.center_sample = cfg.MODEL.FCOS.CENTER_SAMPLE
        self.radius = cfg.MODEL.FCOS.POS_RADIUS
        self.pre_nms_thresh_train = cfg.MODEL.FCOS.INFERENCE_TH_TRAIN
        self.pre_nms_topk_train = cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN
        self.post_nms_topk_train = cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN
        self.loc_loss_func = IOULoss(cfg.MODEL.FCOS.LOC_LOSS_TYPE)

        self.pre_nms_thresh_test = cfg.MODEL.FCOS.INFERENCE_TH_TEST
        self.pre_nms_topk_test = cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST
        self.post_nms_topk_test = cfg.MODEL.FCOS.POST_NMS_TOPK_TEST
        self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
        self.thresh_with_ctr = cfg.MODEL.FCOS.THRESH_WITH_CTR

        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
        self.strides = cfg.MODEL.FCOS.FPN_STRIDES

        self.sizes_of_interest = cfg.MODEL.FCOS.SIZES_OF_INTEREST

    def _transpose(self, training_targets, num_loc_list):
        '''
        This function is used to transpose image first training targets to level first ones
        :return: level first training targets
        '''
        for im_i in range(len(training_targets)):
            training_targets[im_i] = torch.split(
                training_targets[im_i], num_loc_list, dim=0
            )

        targets_level_first = []
        for targets_per_level in zip(*training_targets):
            targets_level_first.append(
                torch.cat(targets_per_level, dim=0)
            )
        return targets_level_first

    def _get_ground_truth(self, locations, gt_instances):
        num_loc_list = [len(loc) for loc in locations]

        # compute locations to size ranges
        loc_to_size_range = []
        for l, loc_per_level in enumerate(locations):
            loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
            loc_to_size_range.append(
                loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)
            )

        loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
        locations = torch.cat(locations, dim=0)

        training_targets = self.compute_targets_for_locations(
            locations, gt_instances, loc_to_size_range, num_loc_list
        )

        training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
        training_targets["im_inds"] = [locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))]

        # transpose im first training_targets to level first ones
        training_targets = {
            k: self._transpose(v, num_loc_list) for k, v in training_targets.items()
        }

        training_targets["fpn_levels"] = [
            loc.new_ones(len(loc), dtype=torch.long) * level
            for level, loc in enumerate(training_targets["locations"])
        ]

        # we normalize reg_targets by FPN's strides here
        reg_targets = training_targets["reg_targets"]
        for l in range(len(reg_targets)):
            reg_targets[l] = reg_targets[l] / float(self.strides[l])

        return training_targets

    def get_sample_region(self, boxes, strides, num_loc_list, loc_xs, loc_ys, bitmasks=None, radius=1):
        # pdb.set_trace()
        if bitmasks is not None:
            _, h, w = bitmasks.size()

            ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device)
            xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device)

            m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6)
            m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1)
            m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1)
            center_x = m10 / m00
            center_y = m01 / m00
            center_x = center_x.float()
            center_y = center_y.float()
        else:
            center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5
            center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
        # pdb.set_trace()
        num_gts = boxes.shape[0]
        K = len(loc_xs)
        boxes = boxes[None].expand(K, num_gts, 4)
        center_x = center_x[None].expand(K, num_gts)
        center_y = center_y[None].expand(K, num_gts)
        center_gt = boxes.new_zeros(boxes.shape)
        # no gt
        if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
            return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
        beg = 0
        for level, num_loc in enumerate(num_loc_list):
            end = beg + num_loc
            stride = strides[level] * radius
            xmin = center_x[beg:end] - stride
            ymin = center_y[beg:end] - stride
            xmax = center_x[beg:end] + stride
            ymax = center_y[beg:end] + stride
            # limit sample region in gt
            center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
            center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
            center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
            center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
            beg = end
        left = loc_xs[:, None] - center_gt[..., 0]
        right = center_gt[..., 2] - loc_xs[:, None]
        top = loc_ys[:, None] - center_gt[..., 1]
        bottom = center_gt[..., 3] - loc_ys[:, None]
        center_bbox = torch.stack((left, top, right, bottom), -1)
        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
        return inside_gt_bbox_mask

    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
        labels = []
        reg_targets = []
        target_inds = []
        xs, ys = locations[:, 0], locations[:, 1]

        num_targets = 0
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            bboxes = targets_per_im.gt_boxes.tensor
            labels_per_im = targets_per_im.gt_classes

            # no gt
            if bboxes.numel() == 0:
                labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
                reg_targets.append(locations.new_zeros((locations.size(0), 4)))
                target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
                continue

            area = targets_per_im.gt_boxes.area()

            l = xs[:, None] - bboxes[:, 0][None]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None]
            b = bboxes[:, 3][None] - ys[:, None]
            reg_targets_per_im = torch.stack([l, t, r, b], dim=2)

            if self.center_sample:
                if targets_per_im.has("gt_bitmasks_full"):
                    bitmasks = targets_per_im.gt_bitmasks_full
                else:
                    bitmasks = None
                is_in_boxes = self.get_sample_region(
                    bboxes, self.strides, num_loc_list, xs, ys,
                    bitmasks=bitmasks, radius=self.radius
                )
            else:
                is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0

            max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])

            locations_to_gt_area = area[None].repeat(len(locations), 1)
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            # if there are still more than one objects for a location,
            # we choose the one with minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

            reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
            target_inds_per_im = locations_to_gt_inds + num_targets
            num_targets += len(targets_per_im)

            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes

            labels.append(labels_per_im)
            reg_targets.append(reg_targets_per_im)
            target_inds.append(target_inds_per_im)

        return {
            "labels": labels,
            "reg_targets": reg_targets,
            "target_inds": target_inds
        }

    def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None):
        """
        Return the losses from a set of FCOS predictions and their associated ground-truth.

        Returns:
            dict[loss name -> loss value]: A dict mapping from loss name to loss value.
        """

        training_targets = self._get_ground_truth(locations, gt_instances)

        # Collect all logits and regression predictions over feature maps
        # and images to arrive at the same shape as the labels and targets
        # The final ordering is L, N, H, W from slowest to fastest axis.

        instances = Instances((0, 0))
        instances.labels = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["labels"]
        ], dim=0)
        instances.gt_inds = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.reshape(-1) for x in training_targets["target_inds"]
        ], dim=0)
        instances.im_inds = cat([
            x.reshape(-1) for x in training_targets["im_inds"]
        ], dim=0)
        instances.reg_targets = cat([
            # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
            x.reshape(-1, 4) for x in training_targets["reg_targets"]
        ], dim=0,)
        instances.locations = cat([
            x.reshape(-1, 2) for x in training_targets["locations"]
        ], dim=0)
        instances.fpn_levels = cat([
            x.reshape(-1) for x in training_targets["fpn_levels"]
        ], dim=0)

        instances.logits_pred = cat([
            # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
            x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred
        ], dim=0,)
        instances.reg_pred = cat([
            # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
            x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
        ], dim=0,)
        instances.ctrness_pred = cat([
            # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
            x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
        ], dim=0,)

        if len(top_feats) > 0:
            instances.top_feats = cat([
                # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)
                x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
            ], dim=0,)

        return self.fcos_losses(instances)

    def fcos_losses(self, instances):
        num_classes = instances.logits_pred.size(1)
        assert num_classes == self.num_classes

        labels = instances.labels.flatten()

        pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
        num_pos_local = pos_inds.numel()
        num_gpus = get_world_size()
        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        # prepare one_hot
        class_target = torch.zeros_like(instances.logits_pred)
        class_target[pos_inds, labels[pos_inds]] = 1

        class_loss = sigmoid_focal_loss_jit(
            instances.logits_pred,
            class_target,
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / num_pos_avg

        instances = instances[pos_inds]
        instances.pos_inds = pos_inds

        ctrness_targets = compute_ctrness_targets(instances.reg_targets)
        ctrness_targets_sum = ctrness_targets.sum()
        loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
        instances.gt_ctrs = ctrness_targets

        if pos_inds.numel() > 0:
            reg_loss = self.loc_loss_func(
                instances.reg_pred,
                instances.reg_targets,
                ctrness_targets
            ) / loss_denorm

            ctrness_loss = F.binary_cross_entropy_with_logits(
                instances.ctrness_pred,
                ctrness_targets,
                reduction="sum"
            ) / num_pos_avg
        else:
            reg_loss = instances.reg_pred.sum() * 0
            ctrness_loss = instances.ctrness_pred.sum() * 0

        losses = {
            "loss_fcos_cls": class_loss,
            "loss_fcos_loc": reg_loss,
            "loss_fcos_ctr": ctrness_loss
        }
        extras = {
            "instances": instances,
            "loss_denorm": loss_denorm
        }
        return extras, losses

    def predict_proposals(
            self, logits_pred, reg_pred, ctrness_pred,
            locations, image_sizes, top_feats=None
    ):
        if self.training:
            self.pre_nms_thresh = self.pre_nms_thresh_train
            self.pre_nms_topk = self.pre_nms_topk_train
            self.post_nms_topk = self.post_nms_topk_train
        else:
            self.pre_nms_thresh = self.pre_nms_thresh_test
            self.pre_nms_topk = self.pre_nms_topk_test
            self.post_nms_topk = self.post_nms_topk_test

        sampled_boxes = []

        bundle = {
            "l": locations, "o": logits_pred,
            "r": reg_pred, "c": ctrness_pred,
            "s": self.strides,
        }

        if len(top_feats) > 0:
            bundle["t"] = top_feats

        for i, per_bundle in enumerate(zip(*bundle.values())):
            # get per-level bundle
            per_bundle = dict(zip(bundle.keys(), per_bundle))
            # recall that during training, we normalize regression targets with FPN's stride.
            # we denormalize them here.
            l = per_bundle["l"]
            o = per_bundle["o"]
            r = per_bundle["r"] * per_bundle["s"]
            c = per_bundle["c"]
            t = per_bundle["t"] if "t" in bundle else None

            sampled_boxes.append(
                self.forward_for_single_feature_map(
                    l, o, r, c, image_sizes, t
                )
            )

            for per_im_sampled_boxes in sampled_boxes[-1]:
                per_im_sampled_boxes.fpn_levels = l.new_ones(
                    len(per_im_sampled_boxes), dtype=torch.long
                ) * i

        boxlists = list(zip(*sampled_boxes))
        boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
        boxlists = self.select_over_all_levels(boxlists)

        return boxlists

    def forward_for_single_feature_map(
            self, locations, logits_pred, reg_pred,
            ctrness_pred, image_sizes, top_feat=None
    ):
        N, C, H, W = logits_pred.shape

        # put in the same format as locations
        logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
        logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
        box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
        box_regression = box_regression.reshape(N, -1, 4)
        ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
        ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
        if top_feat is not None:
            top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
            top_feat = top_feat.reshape(N, H * W, -1)

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]
        candidate_inds = logits_pred > self.pre_nms_thresh
        pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)

        if not self.thresh_with_ctr:
            logits_pred = logits_pred * ctrness_pred[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = logits_pred[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]

            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            per_class = per_candidate_nonzeros[:, 1]

            per_box_regression = box_regression[i]
            per_box_regression = per_box_regression[per_box_loc]
            per_locations = locations[per_box_loc]
            if top_feat is not None:
                per_top_feat = top_feat[i]
                per_top_feat = per_top_feat[per_box_loc]

            per_pre_nms_top_n = pre_nms_top_n[i]

            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]
                if top_feat is not None:
                    per_top_feat = per_top_feat[top_k_indices]

            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)

            boxlist = Instances(image_sizes[i])
            boxlist.pred_boxes = Boxes(detections)
            boxlist.scores = torch.sqrt(per_box_cls)
            boxlist.pred_classes = per_class
            boxlist.locations = per_locations
            if top_feat is not None:
                boxlist.top_feat = per_top_feat
            results.append(boxlist)

        return results

    def select_over_all_levels(self, boxlists):
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            # multiclass nms
            result = ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
            if number_of_detections > self.post_nms_topk > 0:
                cls_scores = result.scores
                image_thresh, _ = torch.kthvalue(
                    cls_scores.cpu(),
                    number_of_detections - self.post_nms_topk + 1
                )
                keep = cls_scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                result = result[keep]
            results.append(result)
        return results
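compute_ctrness_targets at the top of this file implements FCOS centerness, sqrt(min(l,r)/max(l,r) * min(t,b)/max(t,b)), which is 1.0 when the location sits at the box center and decays toward the edges. A quick numeric check with that function in scope:

import torch

reg_targets = torch.tensor([
    [4.0, 4.0, 4.0, 4.0],  # location at the box center
    [1.0, 4.0, 7.0, 4.0],  # off-center horizontally
])
print(compute_ctrness_targets(reg_targets))  # tensor([1.0000, 0.3780]); sqrt(1/7) ~= 0.378
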
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py
ADDED
@@ -0,0 +1,100 @@
import math
from typing import List, Dict
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.layers import ShapeSpec

from .layers import DFConv2d, IOULoss

class Scale(nn.Module):
    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input * self.scale

class FCOSHead(nn.Module):
    def __init__(self, cfg, input_shape: List[ShapeSpec]):
        """
        Arguments:
            in_channels (int): number of channels of the input feature
        """
        super().__init__()
        self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
        head_configs = {"cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS, False),
                        "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE),
                        "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE)}
        norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM

        in_channels = [s.channels for s in input_shape]
        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
        in_channels = in_channels[0]

        self.in_channels_to_top_module = in_channels

        for head in head_configs:
            tower = []
            num_convs, use_deformable = head_configs[head]
            if use_deformable:
                conv_func = DFConv2d
            else:
                conv_func = nn.Conv2d
            for i in range(num_convs):
                tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
                if norm == "GN":
                    tower.append(nn.GroupNorm(32, in_channels))
                tower.append(nn.ReLU())
            self.add_module('{}_tower'.format(head), nn.Sequential(*tower))

        self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
        self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1, bias=False)
        self.ctrness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=False)

        if cfg.MODEL.FCOS.USE_SCALE:
            self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in self.fpn_strides])
        else:
            self.scales = None

        for modules in [self.cls_tower, self.bbox_tower, self.share_tower, self.cls_logits]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)

        for modules in [self.bbox_pred, self.ctrness]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)

        # initialize the bias for focal loss
        prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        torch.nn.init.constant_(self.cls_logits.bias, bias_value)

    def forward(self, x, top_module=None):
        logits = []
        bbox_reg = []
        ctrness = []
        top_feats = []
        bbox_towers = []
        for l, feature in enumerate(x):
            feature = self.share_tower(feature)
            cls_tower = self.cls_tower(feature)
            bbox_tower = self.bbox_tower(feature)

            logits.append(self.cls_logits(cls_tower))
            ctrness.append(self.ctrness(bbox_tower))
            reg = self.bbox_pred(bbox_tower)
            if self.scales is not None:
                reg = self.scales[l](reg)
            # Note that we use relu, as in the improved FCOS, instead of exp.
            bbox_reg.append(F.relu(reg))

            if top_module is not None:
                top_feats.append(top_module(bbox_tower))

        return logits, bbox_reg, ctrness, bbox_towers, top_feats
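The focal-loss bias initialization at the end of __init__ sets the classification logits so that sigmoid(bias) equals PRIOR_PROB at the start of training, keeping the early loss from being dominated by the overwhelming number of negative locations. With the default PRIOR_PROB = 0.01:

import math

prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(bias_value)                       # -4.595...
print(1 / (1 + math.exp(-bias_value)))  # 0.01, i.e. sigmoid(bias) == prior_prob
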
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .comm import reduce_sum
from .measures import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py
ADDED
@@ -0,0 +1,52 @@
import torch
import torch.nn.functional as F
import torch.distributed as dist
from detectron2.utils.comm import get_world_size


def reduce_sum(tensor):
    world_size = get_world_size()
    if world_size < 2:
        return tensor
    tensor = tensor.clone()
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    return tensor

def aligned_bilinear(tensor, factor):
    assert tensor.dim() == 4
    assert factor >= 1
    assert int(factor) == factor

    if factor == 1:
        return tensor

    h, w = tensor.size()[2:]
    tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
    oh = factor * h + 1
    ow = factor * w + 1
    tensor = F.interpolate(
        tensor, size=(oh, ow),
        mode='bilinear',
        align_corners=True
    )
    tensor = F.pad(
        tensor, pad=(factor // 2, 0, factor // 2, 0),
        mode="replicate"
    )
    return tensor[:, :, :oh - 1, :ow - 1]


def compute_locations(h, w, stride, device):
    shifts_x = torch.arange(
        0, w * stride, step=stride,
        dtype=torch.float32, device=device
    )
    shifts_y = torch.arange(
        0, h * stride, step=stride,
        dtype=torch.float32, device=device
    )
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    shift_x = shift_x.reshape(-1)
    shift_y = shift_y.reshape(-1)
    locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
    return locations
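aligned_bilinear upsamples by an integer factor while keeping samples aligned to the stride grid (the pad / interpolate / pad / crop sequence), so the output is exactly factor times the input size. A shape check with the function above in scope:

import torch

x = torch.randn(1, 8, 16, 16)
y = aligned_bilinear(x, factor=2)
print(y.shape)  # torch.Size([1, 8, 32, 32])
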
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding: utf-8
|
| 2 |
+
# Adapted from https://github.com/ShichenLiu/CondenseNet/blob/master/utils.py
|
| 3 |
+
from __future__ import absolute_import
|
| 4 |
+
from __future__ import unicode_literals
|
| 5 |
+
from __future__ import print_function
|
| 6 |
+
from __future__ import division
|
| 7 |
+
|
| 8 |
+
import operator
|
| 9 |
+
|
| 10 |
+
from functools import reduce
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_num_gen(gen):
|
| 14 |
+
return sum(1 for x in gen)
|
| 15 |
+
|
| 16 |
+
def is_pruned(layer):
|
| 17 |
+
try:
|
| 18 |
+
layer.mask
|
| 19 |
+
return True
|
| 20 |
+
except AttributeError:
|
| 21 |
+
return False
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def is_leaf(model):
|
| 25 |
+
return get_num_gen(model.children()) == 0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_layer_info(layer):
|
| 29 |
+
layer_str = str(layer)
|
| 30 |
+
type_name = layer_str[:layer_str.find('(')].strip()
|
| 31 |
+
return type_name
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def get_layer_param(model):
|
| 35 |
+
return sum([reduce(operator.mul, i.size(), 1) for i in model.parameters()])
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
### The input batch size should be 1 to call this function
|
| 39 |
+
def measure_layer(layer, *args):
|
| 40 |
+
global count_ops, count_params
|
| 41 |
+
|
| 42 |
+
for x in args:
|
| 43 |
+
delta_ops = 0
|
| 44 |
+
delta_params = 0
|
| 45 |
+
multi_add = 1
|
| 46 |
+
type_name = get_layer_info(layer)
|
| 47 |
+
|
| 48 |
+
### ops_conv
|
| 49 |
+
if type_name in ['Conv2d']:
|
| 50 |
+
out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) /
|
| 51 |
+
layer.stride[0] + 1)
|
| 52 |
+
out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) /
|
| 53 |
+
layer.stride[1] + 1)
|
| 54 |
+
delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
|
| 55 |
+
delta_params = get_layer_param(layer)
|
| 56 |
+
|
| 57 |
+
elif type_name in ['ConvTranspose2d']:
|
| 58 |
+
_, _, in_h, in_w = x.size()
|
| 59 |
+
out_h = int((in_h-1)*layer.stride[0] - 2 * layer.padding[0] + layer.kernel_size[0] + layer.output_padding[0])
|
| 60 |
+
out_w = int((in_w-1)*layer.stride[1] - 2 * layer.padding[1] + layer.kernel_size[1] + layer.output_padding[1])
|
| 61 |
+
delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
|
| 62 |
+
layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
|
| 63 |
+
delta_params = get_layer_param(layer)
|
| 64 |
+
|
| 65 |
+
### ops_learned_conv
|
| 66 |
+
elif type_name in ['LearnedGroupConv']:
|
| 67 |
+
measure_layer(layer.relu, x)
|
| 68 |
+
measure_layer(layer.norm, x)
|
| 69 |
+
conv = layer.conv
|
| 70 |
+
out_h = int((x.size()[2] + 2 * conv.padding[0] - conv.kernel_size[0]) /
|
| 71 |
+
conv.stride[0] + 1)
|
| 72 |
+
out_w = int((x.size()[3] + 2 * conv.padding[1] - conv.kernel_size[1]) /
|
| 73 |
+
conv.stride[1] + 1)
|
| 74 |
+
delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add
|
| 75 |
+
delta_params = get_layer_param(conv) / layer.condense_factor
|
| 76 |
+
|
| 77 |
+
### ops_nonlinearity
|
| 78 |
+
elif type_name in ['ReLU', 'ReLU6']:
|
| 79 |
+
delta_ops = x.numel()
|
| 80 |
+
delta_params = get_layer_param(layer)
|
| 81 |
+
|
| 82 |
+
### ops_pooling
|
| 83 |
+
elif type_name in ['AvgPool2d', 'MaxPool2d']:
|
| 84 |
+
in_w = x.size()[2]
|
| 85 |
+
kernel_ops = layer.kernel_size * layer.kernel_size
|
| 86 |
+
out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
|
| 87 |
+
out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
|
| 88 |
+
delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops
|
| 89 |
+
delta_params = get_layer_param(layer)
|
| 90 |
+
|
| 91 |
+
elif type_name in ['LastLevelMaxPool']:
|
| 92 |
+
pass
|
| 93 |
+
|
| 94 |
+
elif type_name in ['AdaptiveAvgPool2d']:
|
| 95 |
+
        delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
        delta_params = get_layer_param(layer)

    elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']:
        pass
        #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
        #delta_params = get_layer_param(layer)

    ### ops_linear
    elif type_name in ['Linear']:
        weight_ops = layer.weight.numel() * multi_add
        bias_ops = layer.bias.numel()
        delta_ops = x.size()[0] * (weight_ops + bias_ops)
        delta_params = get_layer_param(layer)

    ### ops_nothing
    elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']:
        delta_params = get_layer_param(layer)

    elif type_name in ['SumTwo']:
        delta_ops = x.numel()

    elif type_name in ['AggregateCell']:
        if not layer.pre_transform:
            delta_ops = 2 * x.numel()  # twice for each input
        else:
            measure_layer(layer.branch_1, x)
            measure_layer(layer.branch_2, x)
        delta_params = get_layer_param(layer)

    elif type_name in ['Identity', 'Zero']:
        pass

    elif type_name in ['Scale']:
        delta_params = get_layer_param(layer)
        delta_ops = x.numel()

    elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor',
                       'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor',
                       'NaiveSyncBatchNorm', 'Upsample', 'Sequential']:
        pass

    elif type_name in ['DeformConv']:
        # don't count bilinear
        offset_conv = list(layer.parameters())[0]
        delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3])
        out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0]
                     - layer.kernel_size[0]) / layer.stride[0] + 1)
        out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1]
                     - layer.kernel_size[1]) / layer.stride[1] + 1)
        delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
        delta_params = get_layer_param(layer)

    ### unknown layer type
    else:
        raise TypeError('unknown layer type: %s' % type_name)

    count_ops += delta_ops
    count_params += delta_params
    return


def measure_model(model, x):
    global count_ops, count_params
    count_ops = 0
    count_params = 0

    def should_measure(x):
        return is_leaf(x) or is_pruned(x)

    def modify_forward(model):
        for child in model.children():
            if should_measure(child):
                def new_forward(m):
                    def lambda_forward(*args):
                        measure_layer(m, *args)
                        return m.old_forward(*args)
                    return lambda_forward
                child.old_forward = child.forward
                child.forward = new_forward(child)
            else:
                modify_forward(child)

    def restore_forward(model):
        for child in model.children():
            # leaf node
            if is_leaf(child) and hasattr(child, 'old_forward'):
                child.forward = child.old_forward
                child.old_forward = None
            else:
                restore_forward(child)

    modify_forward(model)
    out = model.forward(x)
    restore_forward(model)

    return out, count_ops, count_params
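For orientation, a minimal usage sketch of measure_model (my own illustration, not part of this diff). It assumes the earlier branches of measure_layer (not shown in this hunk) cover plain Conv2d/ReLU modules, as counters of this style usually do, and that is_leaf/is_pruned are defined earlier in the same file:

import torch
import torch.nn as nn

# Hypothetical example model built only from layer types the counter knows about.
net = nn.Sequential(
    nn.Conv2d(3, 8, 3, padding=1),
    nn.BatchNorm2d(8),
    nn.ReLU(),
    nn.Conv2d(8, 1, 1),
)
out, ops, params = measure_model(net, torch.randn(1, 3, 64, 64))
print('%.2f MFLOPs, %d params' % (ops / 1e6, params))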
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py
ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py
ADDED
@@ -0,0 +1,523 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import contextlib
import copy
import io
import itertools
import json
import logging
import numpy as np
import os
import pickle
from collections import OrderedDict
import pycocotools.mask as mask_util
import torch
from fvcore.common.file_io import PathManager
from pycocotools.coco import COCO
from tabulate import tabulate

import detectron2.utils.comm as comm
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import convert_to_coco_json
from detectron2.evaluation.evaluator import DatasetEvaluator
from detectron2.evaluation.fast_eval_api import COCOeval_opt as COCOeval
from detectron2.structures import Boxes, BoxMode, pairwise_iou
from detectron2.utils.logger import create_small_table
import pdb

class COCOEvaluator_ClassAgnostic(DatasetEvaluator):
    """
    Evaluate AR for object proposals, AP for instance detection/segmentation, AP
    for keypoint detection outputs using COCO's metrics.
    See http://cocodataset.org/#detection-eval and
    http://cocodataset.org/#keypoints-eval to understand its metrics.

    In addition to COCO, this evaluator is able to support any bounding box detection,
    instance segmentation, or keypoint detection dataset.
    """

    def __init__(self, dataset_name, cfg, distributed, output_dir=None):
        """
        Args:
            dataset_name (str): name of the dataset to be evaluated.
                It must have either the following corresponding metadata:

                    "json_file": the path to the COCO format annotation

                Or it must be in detectron2's standard dataset format
                so it can be converted to COCO format automatically.
            cfg (CfgNode): config instance
            distributed (True): if True, will collect results from all ranks and run evaluation
                in the main process.
                Otherwise, will evaluate the results in the current process.
            output_dir (str): optional, an output directory to dump all
                results predicted on the dataset. The dump contains two files:

                1. "instances_predictions.pth" a file in torch serialization
                   format that contains all the raw original predictions.
                2. "coco_instances_results.json" a json file in COCO's result
                   format.
        """
        self._tasks = self._tasks_from_config(cfg)
        self._distributed = distributed
        self._output_dir = output_dir

        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)

        self._metadata = MetadataCatalog.get(dataset_name)
        if not hasattr(self._metadata, "json_file"):
            self._logger.info(
                f"'{dataset_name}' is not registered by `register_coco_instances`."
                " Therefore trying to convert it to COCO format ..."
            )

            cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
            self._metadata.json_file = cache_path
            convert_to_coco_json(dataset_name, cache_path)

        # pdb.set_trace()
        # if self._metadata.select:
        #     self._metadata.json_file = os.path.join("individual", self._metadata.json_file.split(".")[0]+"_{}.json".format(self._metadata.select))
        json_file = PathManager.get_local_path(self._metadata.json_file)
        with contextlib.redirect_stdout(io.StringIO()):
            self._coco_api = COCO(json_file, cfg.TEST.CLASS_AGNOSTIC)

        self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS
        # Test set json files do not contain annotations (evaluation must be
        # performed using the COCO evaluation server).
        self._do_evaluation = "annotations" in self._coco_api.dataset

    def reset(self):
        self._predictions = []

    def _tasks_from_config(self, cfg):
        """
        Returns:
            tuple[str]: tasks that can be evaluated under the given configuration.
        """
        tasks = ("bbox",)
        if cfg.MODEL.MASK_ON:
            tasks = tasks + ("segm",)
        if cfg.MODEL.KEYPOINT_ON:
            tasks = tasks + ("keypoints",)
        return tasks

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
                It is a list of dict. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name", "image_id".
            outputs: the outputs of a COCO model. It is a list of dicts with key
                "instances" that contains :class:`Instances`.
        """
        for input, output in zip(inputs, outputs):
            prediction = {"image_id": input["image_id"]}

            # TODO this is ugly
            if "instances" in output:
                instances = output["instances"].to(self._cpu_device)
                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
            if "proposals" in output:
                prediction["proposals"] = output["proposals"].to(self._cpu_device)
            self._predictions.append(prediction)

    def evaluate(self):
        if self._distributed:
            comm.synchronize()
            predictions = comm.gather(self._predictions, dst=0)
            predictions = list(itertools.chain(*predictions))

            if not comm.is_main_process():
                return {}
        else:
            predictions = self._predictions

        if len(predictions) == 0:
            self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
            return {}

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "instances_predictions.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(predictions, f)

        self._results = OrderedDict()
        if "proposals" in predictions[0]:
            self._eval_box_proposals(predictions)
        if "instances" in predictions[0]:
            self._eval_predictions(set(self._tasks), predictions)
        # Copy so the caller can do whatever with results
        return copy.deepcopy(self._results)

    def _eval_predictions(self, tasks, predictions):
        """
        Evaluate predictions on the given tasks.
        Fill self._results with the metrics of the tasks.
        """
        self._logger.info("Preparing results for COCO format ...")
        coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))

        # unmap the category ids for COCO
        # if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
        #     reverse_id_mapping = {
        #         v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
        #     }
        for result in coco_results:
            result["category_id"] = 1

        if self._output_dir:
            file_path = os.path.join(self._output_dir, "coco_instances_results.json")
            self._logger.info("Saving results to {}".format(file_path))
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(coco_results))
                f.flush()

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info("Evaluating predictions ...")
        if "segmentation" in coco_results[0]:
            tasks = ["bbox", "segm"]
        else:
            tasks = ["bbox"]
        for task in sorted(tasks):
            coco_eval = (
                _evaluate_predictions_on_coco(
                    self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas
                )
                if len(coco_results) > 0
                else None  # cocoapi does not handle empty results very well
            )

            res = self._derive_coco_results(
                coco_eval, task
            )
            self._results[task] = res

    def _eval_box_proposals(self, predictions):
        """
        Evaluate the box proposals in predictions.
        Fill self._results with the metrics for "box_proposals" task.
        """
        if self._output_dir:
            # Saving generated box proposals to file.
            # Predicted box_proposals are in XYXY_ABS mode.
            bbox_mode = BoxMode.XYXY_ABS.value
            ids, boxes, objectness_logits = [], [], []
            for prediction in predictions:
                ids.append(prediction["image_id"])
                boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
                objectness_logits.append(prediction["proposals"].objectness_logits.numpy())

            proposal_data = {
                "boxes": boxes,
                "objectness_logits": objectness_logits,
                "ids": ids,
                "bbox_mode": bbox_mode,
            }
            with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
                pickle.dump(proposal_data, f)

        if not self._do_evaluation:
            self._logger.info("Annotations are not available for evaluation.")
            return

        self._logger.info("Evaluating bbox proposals ...")
        res = {}
        areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
        for limit in [100, 1000]:
            for area, suffix in areas.items():
                stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
                key = "AR{}@{:d}".format(suffix, limit)
                res[key] = float(stats["ar"].item() * 100)
        self._logger.info("Proposal metrics: \n" + create_small_table(res))
        self._results["box_proposals"] = res

    def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
        """
        Derive the desired score numbers from summarized COCOeval.

        Args:
            coco_eval (None or COCOEval): None represents no predictions from model.
            iou_type (str):
            class_names (None or list[str]): if provided, will use it to predict
                per-category AP.

        Returns:
            a dict of {metric name: score}
        """

        metrics = {
            "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
            "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
            "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
        }[iou_type]

        if coco_eval is None:
            self._logger.warn("No predictions from the model!")
            return {metric: float("nan") for metric in metrics}

        # the standard metrics
        results = {
            metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
            for idx, metric in enumerate(metrics)
        }
        self._logger.info(
            "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
        )
        if not np.isfinite(sum(results.values())):
            self._logger.info("Some metrics cannot be computed and is shown as NaN.")

        if class_names is None or len(class_names) <= 1:
            return results
        # Compute per-category AP
        # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
        precisions = coco_eval.eval["precision"]
        # precision has dims (iou, recall, cls, area range, max dets)
        assert len(class_names) == precisions.shape[2]

        results_per_category = []
        for idx, name in enumerate(class_names):
            # area range index 0: all area ranges
            # max dets index -1: typically 100 per image
            precision = precisions[:, :, idx, 0, -1]
            precision = precision[precision > -1]
            ap = np.mean(precision) if precision.size else float("nan")
            results_per_category.append(("{}".format(name), float(ap * 100)))

        # tabulate it
        N_COLS = min(6, len(results_per_category) * 2)
        results_flatten = list(itertools.chain(*results_per_category))
        results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
        table = tabulate(
            results_2d,
            tablefmt="pipe",
            floatfmt=".3f",
            headers=["category", "AP"] * (N_COLS // 2),
            numalign="left",
        )
        self._logger.info("Per-category {} AP: \n".format(iou_type) + table)

        results.update({"AP-" + name: ap for name, ap in results_per_category})
        return results


def instances_to_coco_json(instances, img_id):
    """
    Dump an "Instances" object to a COCO-format json that's used for evaluation.

    Args:
        instances (Instances):
        img_id (int): the image id

    Returns:
        list[dict]: list of json annotations in COCO format.
    """
    num_instance = len(instances)
    if num_instance == 0:
        return []

    boxes = instances.pred_boxes.tensor.numpy()
    boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    boxes = boxes.tolist()
    scores = instances.scores.tolist()
    classes = instances.pred_classes.tolist()

    has_mask = instances.has("pred_masks")
    if has_mask:
        # use RLE to encode the masks, because they are too large and takes memory
        # since this evaluator stores outputs of the entire dataset
        rles = [
            mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
            for mask in instances.pred_masks
        ]
        for rle in rles:
            # "counts" is an array encoded by mask_util as a byte-stream. Python3's
            # json writer which always produces strings cannot serialize a bytestream
            # unless you decode it. Thankfully, utf-8 works out (which is also what
            # the pycocotools/_mask.pyx does).
            rle["counts"] = rle["counts"].decode("utf-8")

    has_keypoints = instances.has("pred_keypoints")
    if has_keypoints:
        keypoints = instances.pred_keypoints

    results = []
    for k in range(num_instance):
        result = {
            "image_id": img_id,
            "category_id": classes[k],
            "bbox": boxes[k],
            "score": scores[k],
        }
        if has_mask:
            result["segmentation"] = rles[k]
        if has_keypoints:
            # In COCO annotations,
            # keypoints coordinates are pixel indices.
            # However our predictions are floating point coordinates.
            # Therefore we subtract 0.5 to be consistent with the annotation format.
            # This is the inverse of data loading logic in `datasets/coco.py`.
            keypoints[k][:, :2] -= 0.5
            result["keypoints"] = keypoints[k].flatten().tolist()
        results.append(result)
    return results


# inspired from Detectron:
# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
    """
    Evaluate detection proposal recall metrics. This function is a much
    faster alternative to the official COCO API recall evaluation code. However,
    it produces slightly different results.
    """
    # Record max overlap value for each gt box
    # Return vector of overlap values
    areas = {
        "all": 0,
        "small": 1,
        "medium": 2,
        "large": 3,
        "96-128": 4,
        "128-256": 5,
        "256-512": 6,
        "512-inf": 7,
    }
    area_ranges = [
        [0 ** 2, 1e5 ** 2],  # all
        [0 ** 2, 32 ** 2],  # small
        [32 ** 2, 96 ** 2],  # medium
        [96 ** 2, 1e5 ** 2],  # large
        [96 ** 2, 128 ** 2],  # 96-128
        [128 ** 2, 256 ** 2],  # 128-256
        [256 ** 2, 512 ** 2],  # 256-512
        [512 ** 2, 1e5 ** 2],
    ]  # 512-inf
    assert area in areas, "Unknown area range: {}".format(area)
    area_range = area_ranges[areas[area]]
    gt_overlaps = []
    num_pos = 0

    for prediction_dict in dataset_predictions:
        predictions = prediction_dict["proposals"]

        # sort predictions in descending order
        # TODO maybe remove this and make it explicit in the documentation
        inds = predictions.objectness_logits.sort(descending=True)[1]
        predictions = predictions[inds]

        ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
        anno = coco_api.loadAnns(ann_ids)
        gt_boxes = [
            BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            for obj in anno
            if obj["iscrowd"] == 0
        ]
        gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4)  # guard against no boxes
        gt_boxes = Boxes(gt_boxes)
        gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])

        if len(gt_boxes) == 0 or len(predictions) == 0:
            continue

        valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
        gt_boxes = gt_boxes[valid_gt_inds]

        num_pos += len(gt_boxes)

        if len(gt_boxes) == 0:
            continue

        if limit is not None and len(predictions) > limit:
            predictions = predictions[:limit]

        overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)

        _gt_overlaps = torch.zeros(len(gt_boxes))
        for j in range(min(len(predictions), len(gt_boxes))):
            # find which proposal box maximally covers each gt box
            # and get the iou amount of coverage for each gt box
            max_overlaps, argmax_overlaps = overlaps.max(dim=0)

            # find which gt box is 'best' covered (i.e. 'best' = most iou)
            gt_ovr, gt_ind = max_overlaps.max(dim=0)
            assert gt_ovr >= 0
            # find the proposal box that covers the best covered gt box
            box_ind = argmax_overlaps[gt_ind]
            # record the iou coverage of this gt box
            _gt_overlaps[j] = overlaps[box_ind, gt_ind]
            assert _gt_overlaps[j] == gt_ovr
            # mark the proposal box and the gt box as used
            overlaps[box_ind, :] = -1
            overlaps[:, gt_ind] = -1

        # append recorded iou coverage level
        gt_overlaps.append(_gt_overlaps)
    gt_overlaps = (
        torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
    )
    gt_overlaps, _ = torch.sort(gt_overlaps)

    if thresholds is None:
        step = 0.05
        thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
    recalls = torch.zeros_like(thresholds)
    # compute recall for each iou threshold
    for i, t in enumerate(thresholds):
        recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
    # ar = 2 * np.trapz(recalls, thresholds)
    ar = recalls.mean()
    return {
        "ar": ar,
        "recalls": recalls,
        "thresholds": thresholds,
        "gt_overlaps": gt_overlaps,
        "num_pos": num_pos,
    }


def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None):
    """
    Evaluate the coco results using COCOEval API.
    """
    assert len(coco_results) > 0

    if iou_type == "segm":
        coco_results = copy.deepcopy(coco_results)
        # When evaluating mask AP, if the results contain bbox, cocoapi will
        # use the box area as the area of the instance, instead of the mask area.
        # This leads to a different definition of small/medium/large.
        # We remove the bbox field to let mask AP use mask area.
        for c in coco_results:
            c.pop("bbox", None)

    coco_dt = coco_gt.loadRes(coco_results)
    coco_eval = COCOeval(coco_gt, coco_dt, iou_type)

    if iou_type == "keypoints":
        # Use the COCO default keypoint OKS sigmas unless overrides are specified
        if kpt_oks_sigmas:
            assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
            coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
        # COCOAPI requires every detection and every gt to have keypoints, so
        # we just take the first entry from both
        num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
        num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
        num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
        assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
            f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
            f"Ground truth contains {num_keypoints_gt} keypoints. "
            f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
            "They have to agree with each other. For meaning of OKS, please refer to "
            "http://cocodataset.org/#keypoints-eval."
        )

    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()

    return coco_eval
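As a quick numeric check of the AR computation that closes _evaluate_box_proposals (my own sketch, not part of the file): recall is evaluated at IoU thresholds 0.50:0.05:0.95 and averaged.

import torch

gt_overlaps = torch.tensor([0.42, 0.58, 0.73, 0.91])  # best IoU per matched gt box
num_pos = 4
thresholds = torch.arange(0.5, 0.95 + 1e-5, 0.05)
recalls = torch.stack([(gt_overlaps >= t).float().sum() / num_pos for t in thresholds])
ar = recalls.mean()  # tensor(0.4000): e.g. 3/4 boxes clear IoU 0.5, none clear 0.95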
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .mask_branch import build_mask_branch
from .dynamic_mask_head import build_dynamic_mask_head
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py
ADDED
@@ -0,0 +1,303 @@
import torch
from torch.nn import functional as F
from torch import nn

from ..det_head.utils.comm import compute_locations, aligned_bilinear
from fvcore.nn import sigmoid_focal_loss_jit
from .utils import sigmoid_focal_loss_boundary, sigmoid_focal_loss_boundary_jit
import pdb

def dice_coefficient(x, target):
    eps = 1e-5
    n_inst = x.size(0)
    x = x.reshape(n_inst, -1)
    target = target.reshape(n_inst, -1)
    intersection = (x * target).sum(dim=1)
    union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps
    loss = 1. - (2 * intersection / union)
    return loss

def parse_dynamic_params(params, channels, weight_nums, bias_nums):
    assert params.dim() == 2
    assert len(weight_nums) == len(bias_nums)
    assert params.size(1) == sum(weight_nums) + sum(bias_nums)

    num_insts = params.size(0)
    num_layers = len(weight_nums)

    params_splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1))

    weight_splits = params_splits[:num_layers]
    bias_splits = params_splits[num_layers:]

    for l in range(num_layers):
        if l < num_layers - 1:
            # out_channels x in_channels x 1 x 1
            weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1)
            bias_splits[l] = bias_splits[l].reshape(num_insts * channels)
        else:
            # out_channels x in_channels x 1 x 1
            weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1)
            bias_splits[l] = bias_splits[l].reshape(num_insts)

    return weight_splits, bias_splits

def build_dynamic_mask_head(cfg):
    return DynamicMaskHead(cfg)

class DynamicMaskHead(nn.Module):
    def __init__(self, cfg):
        super(DynamicMaskHead, self).__init__()
        self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS
        self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS
        self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS
        self.cluster_weight = cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT

        soi = [64, 128, 256, 512, 1024]
        # self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
        self.register_buffer("sizes_of_interest", torch.tensor(soi))

        weight_nums, bias_nums = [], []
        for l in range(self.num_layers):
            if l == 0:
                if not self.disable_rel_coords:
                    weight_nums.append((self.in_channels + 2) * self.channels)
                else:
                    weight_nums.append(self.in_channels * self.channels)
                bias_nums.append(self.channels)
            elif l == self.num_layers - 1:
                weight_nums.append(self.channels * 1)
                bias_nums.append(1)
            else:
                weight_nums.append(self.channels * self.channels)
                bias_nums.append(self.channels)

        self.weight_nums = weight_nums
        self.bias_nums = bias_nums
        self.num_gen_params = sum(weight_nums) + sum(bias_nums)

        stable_conv_1 = nn.Sequential(nn.Conv2d(10, 8, kernel_size=3, stride=1, padding=1), nn.ReLU())
        torch.nn.init.normal_(stable_conv_1[0].weight, std=0.01)
        torch.nn.init.constant_(stable_conv_1[0].bias, 0)

        stable_conv_2 = nn.Sequential(nn.Conv2d(8, 8, kernel_size=3, stride=1, padding=1), nn.ReLU())
        torch.nn.init.normal_(stable_conv_2[0].weight, std=0.01)
        torch.nn.init.constant_(stable_conv_2[0].bias, 0)

        stable_conv_3 = nn.Conv2d(8, 1, kernel_size=3, stride=1, padding=1)
        torch.nn.init.normal_(stable_conv_3.weight, std=0.01)
        torch.nn.init.constant_(stable_conv_3.bias, 0)
        self.stable = nn.ModuleList([stable_conv_1, stable_conv_2, stable_conv_3])

        self.general_choose = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC
        self.general_choose_weight = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT
        self.key_weight = dict()
        for key, value in zip(self.general_choose, self.general_choose_weight):
            self.key_weight[key] = value


    def mask_heads_forward(self, features, weights, biases, num_insts):
        '''
        :param features
        :param weights: [w0, w1, ...]
        :param bias: [b0, b1, ...]
        :return:
        '''
        assert features.dim() == 4
        n_layers = len(weights)
        x = features
        mid_features = []
        for i, (w, b) in enumerate(zip(weights, biases)):
            x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
            if i < n_layers - 1:
                x = F.relu(x)
                mid_features.append(x)
        return x, mid_features

    def mask_heads_forward_split(self, features, weight, bias, num_insts, has_relu=True):
        '''
        :param features
        :param weights: [w0, w1, ...]
        :param bias: [b0, b1, ...]
        :return:
        '''
        assert features.dim() == 4
        # n_layers = len(weights)
        x = features
        x = F.conv2d(x, weight, bias=bias, stride=1, padding=0, groups=num_insts)
        if has_relu:
            x = F.relu(x)
        return x

    def mask_heads_forward_with_coords_test(self, mask_feats, mask_feat_stride, instances):
        locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
        n_inst = len(instances)

        im_inds = instances.im_inds
        mask_head_params = instances.mask_head_params

        N, _, H, W = mask_feats.size()

        if not self.disable_rel_coords:
            instance_locations = instances.locations
            relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
            relative_coords = relative_coords.permute(0, 2, 1).float()
            soi = self.sizes_of_interest.float()[instances.fpn_levels]
            relative_coords = relative_coords / soi.reshape(-1, 1, 1)
            relative_coords = relative_coords.to(dtype=mask_feats.dtype)

            mask_head_inputs = torch.cat([
                relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)
            ], dim=1)
        else:
            mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)

        mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)

        weights, biases = parse_dynamic_params(
            mask_head_params, self.channels,
            self.weight_nums, self.bias_nums
        )

        mask_logits, mid_features = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst)

        mask_logits = mask_logits.reshape(-1, 1, H, W)

        assert mask_feat_stride >= self.mask_out_stride
        assert mask_feat_stride % self.mask_out_stride == 0
        mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride))

        return mask_logits.sigmoid()

    def mask_heads_forward_with_coords(self, mask_feats, mask_feat_stride, instances, gt_bitmasks, ignore_maps):
        locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
        n_inst = len(instances)

        im_inds = instances.im_inds
        mask_head_params = instances.mask_head_params

        # clusters
        gt_inds = instances.gt_inds
        instance_locations = instances.locations
        fpn_levels = instances.fpn_levels

        clusters_ids = []
        clusters_imgids = []
        clusters_gt_masks = []
        gt_unique_inds = torch.unique(gt_inds)
        for gt_ind in gt_unique_inds:
            gt_ind = int(gt_ind)
            clusters_gt_masks.append(gt_bitmasks[gt_ind])
            im_ind = int(torch.unique(im_inds[(gt_inds == gt_ind)]))
            clusters_ids.append(gt_ind)
            clusters_imgids.append(im_ind)

        clusters_ids = torch.tensor(clusters_ids).cuda()
        clusters_imgids = torch.tensor(clusters_imgids)
        clusters_gt_masks = torch.stack(clusters_gt_masks, dim=0)
        n_clusters = len(clusters_ids)

        N, _, H, W = mask_feats.size()

        if not self.disable_rel_coords:
            instance_locations = instances.locations
            relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
            relative_coords = relative_coords.permute(0, 2, 1).float()
            soi = self.sizes_of_interest.float()[instances.fpn_levels]
            relative_coords = relative_coords / soi.reshape(-1, 1, 1)
            relative_coords = relative_coords.to(dtype=mask_feats.dtype)
            mask_head_inputs = torch.cat([relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)], dim=1)
        else:
            mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)

        # mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)
        mask_head_inputs = mask_head_inputs.reshape(n_inst, self.in_channels + 2, H, W)
        weights, biases = parse_dynamic_params(mask_head_params, self.channels, self.weight_nums, self.bias_nums)

        feature0 = self.stable[0](mask_head_inputs)
        feature1 = self.mask_heads_forward_split(mask_head_inputs.reshape(1, -1, H, W), weights[0], biases[0], n_inst).reshape(n_inst, -1, H, W)

        feature00 = self.stable[1](feature0)
        feature01 = self.mask_heads_forward_split(feature0.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)
        feature10 = self.stable[1](feature1)
        feature11 = self.mask_heads_forward_split(feature1.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)

        feature001 = self.mask_heads_forward_split(feature00.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
        feature010 = self.stable[2](feature01)
        feature011 = self.mask_heads_forward_split(feature01.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)

        feature100 = self.stable[2](feature10)
        feature101 = self.mask_heads_forward_split(feature10.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
        feature110 = self.stable[2](feature11)
        feature111 = self.mask_heads_forward_split(feature11.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)

        mask_logits_clusters = []
        for gt_ind in clusters_ids:
            gt_ind = int(gt_ind)
            mask_logits_clusters.append(torch.mean(feature111[gt_inds == gt_ind], dim=0))
        mask_logits_clusters = torch.stack(mask_logits_clusters, dim=0)
        mask_logits_clusters = mask_logits_clusters.reshape(-1, 1, H, W)
        mask_logits_clusters = aligned_bilinear(mask_logits_clusters, int(mask_feat_stride / self.mask_out_stride))
        # clusters
        unique_img_inds = torch.unique(clusters_imgids)
        mask_logits_clusters_imgs = []
        mask_gt_clusters_imgs = []
        for img_ind in unique_img_inds:
            img_ind = int(img_ind)
            mask_logits_clusters_per_img = mask_logits_clusters[clusters_imgids == img_ind]
            mask_logits_clusters_per_img = F.softmax(mask_logits_clusters_per_img.squeeze(1), dim=0).unsqueeze(1)

            ignore_map = ignore_maps[img_ind].detach()
            finds_y, finds_x = torch.nonzero(ignore_map, as_tuple=True)

            mask_logits_clusters_per_img = mask_logits_clusters_per_img.clone()
            mask_logits_clusters_per_img[..., finds_y, finds_x] = 0

            mask_logits_clusters_imgs.append(mask_logits_clusters_per_img)
            mask_gt_clusters_imgs.append(clusters_gt_masks[clusters_imgids == img_ind])
        mask_logits_clusters_imgs = torch.cat(mask_logits_clusters_imgs, dim=0)
        mask_gt_clusters_imgs = torch.cat(mask_gt_clusters_imgs, dim=0)

        select_features = {}
        for cid in self.general_choose:
            select_feature = locals()["feature{}".format(cid)]
            select_feature = aligned_bilinear(select_feature, int(mask_feat_stride / self.mask_out_stride))
            select_features[cid] = select_feature.sigmoid()

        return select_features, mask_logits_clusters_imgs, mask_gt_clusters_imgs.unsqueeze(1)

    def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None):
        if self.training:
            gt_inds = pred_instances.gt_inds
            gt_bitmasks_s = torch.cat([per_im.gt_bitmasks for per_im in gt_instances])
            gt_bitmasks = gt_bitmasks_s[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype)

            bitmasks_full = []
            for gt_instance in gt_instances:
                bitmasks_full.append(gt_instance.gt_bitmasks.sum(dim=0))
            bitmasks_full = torch.stack(bitmasks_full)
            ignore_map = 1 - bitmasks_full

            losses = {}
            if len(pred_instances) == 0:
                loss_mask = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0
                for key, value in self.key_weight.items():
                    losses["loss_mask_bank_{}".format(key)] = loss_mask
                losses["loss_mask_cluster"] = loss_mask
            else:
                select_scores, mask_logits_clusters, mask_gts_clusters = self.mask_heads_forward_with_coords(mask_feats, mask_feat_stride, pred_instances, gt_bitmasks_s, ignore_map)
                for key, value in select_scores.items():
                    losses["loss_mask_bank_{}".format(key)] = dice_coefficient(value, gt_bitmasks).mean() * self.key_weight[key]

                mask_clusters_losses = dice_coefficient(mask_logits_clusters, mask_gts_clusters)
                mask_clusters_losses = mask_clusters_losses.mean()
                losses["loss_mask_cluster"] = mask_clusters_losses * self.cluster_weight
            return losses
        else:
            if len(pred_instances) > 0:
                mask_scores = self.mask_heads_forward_with_coords_test(mask_feats, mask_feat_stride, pred_instances)
                pred_instances.pred_global_masks = mask_scores.float()

            return pred_instances
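The dynamic head above runs one tiny per-instance conv net as a single grouped 1x1 convolution. A self-contained sketch of that mechanism (my own, meant to run alongside parse_dynamic_params from this file; the sizes mirror the num_layers == 3 case with relative coordinates enabled):

import torch
import torch.nn.functional as F

num_insts, channels, in_channels, H, W = 3, 8, 8, 16, 16
weight_nums = [(in_channels + 2) * channels, channels * channels, channels * 1]
bias_nums = [channels, channels, 1]
params = torch.randn(num_insts, sum(weight_nums) + sum(bias_nums))  # from the controller
weights, biases = parse_dynamic_params(params, channels, weight_nums, bias_nums)

# All instances share one tensor; groups=num_insts keeps their filters separate.
x = torch.randn(1, num_insts * (in_channels + 2), H, W)
for i, (w, b) in enumerate(zip(weights, biases)):
    x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
    if i < len(weights) - 1:
        x = F.relu(x)
print(x.shape)  # torch.Size([1, 3, 16, 16]): one mask logit map per instance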
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py
ADDED
@@ -0,0 +1,71 @@
from typing import Dict
import math

import torch
from torch import nn
import pdb
from fvcore.nn import sigmoid_focal_loss_jit
from detectron2.layers import ShapeSpec

from ..det_head.layers import conv_with_kaiming_uniform
from ..det_head.utils.comm import aligned_bilinear

INF = 100000000

def build_mask_branch(cfg, input_shape):
    return MaskBranch(cfg, input_shape)

class MaskBranch(nn.Module):
    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
        channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
        self.out_stride = input_shape[self.in_features[0]].stride

        feature_channels = {k: v.channels for k, v in input_shape.items()}

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        self.refine = nn.ModuleList()
        for in_feature in self.in_features:
            self.refine.append(conv_block(
                feature_channels[in_feature],
                channels, 3, 1
            ))

        tower = []
        for i in range(num_convs):
            tower.append(conv_block(
                channels, channels, 3, 1
            ))
        tower.append(nn.Conv2d(
            channels, max(self.num_outputs, 1), 1
        ))
        self.add_module('tower', nn.Sequential(*tower))

    def forward(self, features, gt_instances=None):
        for i, f in enumerate(self.in_features):
            if i == 0:
                x = self.refine[i](features[f])
            else:
                x_p = self.refine[i](features[f])

                target_h, target_w = x.size()[2:]
                h, w = x_p.size()[2:]
                assert target_h % h == 0
                assert target_w % w == 0
                factor_h, factor_w = target_h // h, target_w // w
                assert factor_h == factor_w
                x_p = aligned_bilinear(x_p, factor_h)
                x = x + x_p

        mask_feats = self.tower(x)

        if self.num_outputs == 0:
            mask_feats = mask_feats[:, :self.num_outputs]

        return mask_feats
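The forward pass fuses FPN levels coarse-to-fine: each deeper level is upsampled by an integer factor and summed onto the finest one. A sketch of that fusion (my own, with F.interpolate standing in for aligned_bilinear, which is defined in det_head/utils/comm.py):

import torch
import torch.nn.functional as F

p3 = torch.randn(1, 128, 100, 136)  # stride-8 feature, first entry of in_features
p4 = torch.randn(1, 128, 50, 68)    # stride-16 feature
x = p3 + F.interpolate(p4, scale_factor=2, mode="bilinear", align_corners=False)
print(x.shape)  # torch.Size([1, 128, 100, 136])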
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py
ADDED
@@ -0,0 +1,53 @@
import torch
from torch.nn import functional as F
import pdb

def sigmoid_focal_loss_boundary(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    boundary: torch.Tensor,
    alpha: float = -1,
    gamma: float = 2,
    reduction: str = "none",
) -> torch.Tensor:
    """
    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of arbitrary shape.
                The predictions for each example.
        targets: A float tensor with the same shape as inputs. Stores the binary
                 classification label for each element in inputs
                 (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
               positive vs negative examples. Default = -1 (no weighting).
        gamma: Exponent of the modulating factor (1 - p_t) to
               balance easy vs hard examples.
        reduction: 'none' | 'mean' | 'sum'
                 'none': No reduction will be applied to the output.
                 'mean': The output will be averaged.
                 'sum': The output will be summed.
    Returns:
        Loss tensor with the reduction option applied.
    """
    p = torch.sigmoid(inputs)
    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)
    loss = ce_loss * ((1 - p_t) ** gamma)

    if alpha >= 0:
        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
        loss = alpha_t * loss

    loss = loss * boundary
    # pdb.set_trace()
    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()

    return loss


sigmoid_focal_loss_boundary_jit = torch.jit.script(
    sigmoid_focal_loss_boundary
)  # type: torch.jit.ScriptModule
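The only difference from the standard focal loss is the elementwise boundary factor: pixels where boundary is zero contribute nothing, so the penalty concentrates on boundary pixels. A small sanity check (my own sketch, not part of the file):

import torch

logits = torch.tensor([[2.0, -1.0, 0.5]])
targets = torch.tensor([[1.0, 0.0, 1.0]])
boundary = torch.tensor([[1.0, 1.0, 0.0]])  # third pixel masked out
loss = sigmoid_focal_loss_boundary(logits, targets, boundary,
                                   alpha=0.25, gamma=2, reduction="sum")
# equals the plain focal loss summed over the first two pixels only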
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py
ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py
ADDED
@@ -0,0 +1,59 @@
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import torch
import torch.nn as nn

from detectron2.layers.deform_conv import DeformConv, ModulatedDeformConv


class DeformConvWithOff(nn.Module):

    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, padding=1,
                 dilation=1, deformable_groups=1):
        super(DeformConvWithOff, self).__init__()
        self.offset_conv = nn.Conv2d(
            in_channels,
            deformable_groups * 2 * kernel_size * kernel_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.dcn = DeformConv(
            in_channels, out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, dilation=dilation,
            deformable_groups=deformable_groups,
        )

    def forward(self, input):
        offset = self.offset_conv(input)
        output = self.dcn(input, offset)
        return output

class ModulatedDeformConvWithOff(nn.Module):
    def __init__(self, in_channels, out_channels,
                 kernel_size=3, stride=1, padding=1,
                 dilation=1, deformable_groups=1,
                 bias=True, norm=None, activation=None,):
        super(ModulatedDeformConvWithOff, self).__init__()
        self.offset_mask_conv = nn.Conv2d(
            in_channels,
            deformable_groups * 3 * kernel_size * kernel_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        self.dcnv2 = ModulatedDeformConv(
            in_channels, out_channels, kernel_size=kernel_size,
            stride=stride, padding=padding, dilation=dilation,
            deformable_groups=deformable_groups,
            bias=bias, norm=norm, activation=activation,
        )

    def forward(self, input):
        x = self.offset_mask_conv(input)
        o1, o2, mask = torch.chunk(x, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        output = self.dcnv2(input, offset, mask)
        return output
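The offset conv's channel count follows from the deformable sampling grid: each of the kernel_size * kernel_size taps needs a (dy, dx) pair per deformable group, hence the factor 2; the modulated (DCNv2) variant adds one scalar mask per tap, hence the factor 3. A hypothetical instantiation (my own sketch; detectron2's deform-conv ops require a CUDA build):

import torch

m = ModulatedDeformConvWithOff(64, 64, kernel_size=3, deformable_groups=1).cuda()
x = torch.randn(2, 64, 32, 32, device="cuda")
y = m(x)  # 18 offset channels and 9 mask channels are predicted from x itself
print(y.shape)  # torch.Size([2, 64, 32, 32])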
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py
ADDED
@@ -0,0 +1,190 @@
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding:utf-8 -*-
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
from detectron2.layers import Conv2d, get_norm
|
| 8 |
+
from .deformable_conv_with_off import ModulatedDeformConvWithOff
|
| 9 |
+
from ..det_head.layers import conv_with_kaiming_uniform
|
| 10 |
+
import math
|
| 11 |
+
import pdb
|
| 12 |
+
from fvcore.nn import sigmoid_focal_loss_jit
|
| 13 |
+
|
| 14 |
+
class SingleHead(nn.Module):
|
| 15 |
+
"""
|
| 16 |
+
Build single head with convolutions and coord conv.
|
| 17 |
+
"""
|
| 18 |
+
def __init__(self, in_channel, conv_dims, num_convs, deform=False, coord=False, norm='', name=''):
|
| 19 |
+
super().__init__()
|
| 20 |
+
self.coord = coord
|
| 21 |
+
self.conv_norm_relus = []
|
| 22 |
+
if deform:
|
| 23 |
+
conv_module = ModulatedDeformConvWithOff
|
| 24 |
+
else:
|
| 25 |
+
conv_module = Conv2d
|
| 26 |
+
for k in range(num_convs):
|
| 27 |
+
conv = conv_module(
|
| 28 |
+
in_channel if k==0 else conv_dims,
|
| 29 |
+
conv_dims,
|
| 30 |
+
kernel_size=3,
|
| 31 |
+
stride=1,
|
| 32 |
+
padding=1,
|
| 33 |
+
bias=not norm,
|
| 34 |
+
norm=get_norm(norm, conv_dims),
|
| 35 |
+
activation=F.relu,
|
| 36 |
+
)
|
| 37 |
+
self.add_module("{}_head_{}".format(name, k + 1), conv)
|
| 38 |
+
self.conv_norm_relus.append(conv)
|
| 39 |
+
|
| 40 |
+
def forward(self, x):
|
| 41 |
+
if self.coord:
|
| 42 |
+
x = self.coord_conv(x)
|
| 43 |
+
for layer in self.conv_norm_relus:
|
| 44 |
+
x = layer(x)
|
| 45 |
+
return x
|
| 46 |
+
|
| 47 |
+
def coord_conv(self, feat):
|
| 48 |
+
with torch.no_grad():
|
| 49 |
+
x_pos = torch.linspace(-1, 1, feat.shape[-2], device=feat.device)
|
| 50 |
+
y_pos = torch.linspace(-1, 1, feat.shape[-1], device=feat.device)
|
| 51 |
+
grid_x, grid_y = torch.meshgrid(x_pos, y_pos)
|
| 52 |
+
grid_x = grid_x.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
|
| 53 |
+
grid_y = grid_y.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
|
| 54 |
+
feat = torch.cat([feat, grid_x, grid_y], dim=1)
|
| 55 |
+
return feat
|

class KernelHead(nn.Module):
    """
    The head used in PanopticFCN to generate kernel weights for both Things and Stuff.
    """
    def __init__(self, cfg, num_gen_params):
        super().__init__()
        in_channel = cfg.MODEL.FPN.OUT_CHANNELS
        conv_dims = cfg.MODEL.KERNEL_HEAD.CONVS_DIM
        num_convs = cfg.MODEL.KERNEL_HEAD.NUM_CONVS
        deform = cfg.MODEL.KERNEL_HEAD.DEFORM
        coord = cfg.MODEL.KERNEL_HEAD.COORD
        norm = cfg.MODEL.KERNEL_HEAD.NORM

        self.num_gen_params = num_gen_params

        self.kernel_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='kernel_head')
        self.out_conv = Conv2d(conv_dims, self.num_gen_params, kernel_size=3, padding=1)
        nn.init.normal_(self.out_conv.weight, mean=0, std=0.01)
        if self.out_conv.bias is not None:
            nn.init.constant_(self.out_conv.bias, 0)

    def forward(self, feat):
        x = self.kernel_head(feat)
        x = self.out_conv(x)
        return x
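A sketch of driving KernelHead standalone. The config values below are made up for the smoke test (the real ones come from the EntitySeg config files), and num_gen_params=153 is an arbitrary placeholder for the number of dynamic-kernel parameters.

```python
import torch
from detectron2.config import CfgNode as CN

# Hypothetical minimal config; every number here is an assumption.
cfg = CN()
cfg.MODEL = CN()
cfg.MODEL.FPN = CN({"OUT_CHANNELS": 256})
cfg.MODEL.KERNEL_HEAD = CN({"CONVS_DIM": 256, "NUM_CONVS": 3,
                            "DEFORM": False, "COORD": True, "NORM": "GN"})

head = KernelHead(cfg, num_gen_params=153)
out = head(torch.randn(1, 256, 32, 32))
print(out.shape)  # torch.Size([1, 153, 32, 32]): one kernel vector per location
```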

class FeatureEncoder(nn.Module):
    """
    The head used in PanopticFCN for high-resolution feature generation.
    """
    def __init__(self, cfg):
        super().__init__()
        in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
        conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
        num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
        deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
        coord = cfg.MODEL.FEATURE_ENCODER.COORD
        norm = cfg.MODEL.FEATURE_ENCODER.NORM

        self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='encode_head')

    def forward(self, feat):
        feat = self.encode_head(feat)
        return feat

class FeatureEncoderEdge(nn.Module):
    """
    The head used in PanopticFCN for high-resolution feature generation.
    """
    def __init__(self, cfg):
        super().__init__()
        in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
        conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
        num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
        deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
        coord = cfg.MODEL.FEATURE_ENCODER.COORD
        norm = cfg.MODEL.FEATURE_ENCODER.NORM

        self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
                                      conv_dims,
                                      num_convs,
                                      deform=deform,
                                      coord=coord,
                                      norm=norm,
                                      name='encode_head')

        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.out_stride = 8

        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        if self.sem_loss_on:
            self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
            self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA

            # in_channels = feature_channels[self.in_features[0]]
            self.seg_head = nn.Sequential(
                conv_block(conv_dims, conv_dims, kernel_size=3, stride=1),
                conv_block(conv_dims, conv_dims, kernel_size=3, stride=1)
            )

            self.logits = nn.Conv2d(conv_dims, 1, kernel_size=1, stride=1)

            prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
            bias_value = -math.log((1 - prior_prob) / prior_prob)
            torch.nn.init.constant_(self.logits.bias, bias_value)

    def forward(self, feat, gt_instances=None):
        feat = self.encode_head(feat)

        losses = {}
        # auxiliary thing semantic loss
        if self.training and self.sem_loss_on:
            logits_pred = self.logits(self.seg_head(feat))

            boundary_targets = []
            for per_im_gt in gt_instances:
                boundary_targets.append(per_im_gt.gt_boundary_full.sum(dim=0))

            # # semantic_targets = torch.stack(semantic_targets, dim=0)
            boundary_targets = torch.stack(boundary_targets, dim=0)

            # resize target to reduce memory
            boundary_targets = boundary_targets[:, None, self.out_stride // 2::self.out_stride, self.out_stride // 2::self.out_stride]
            num_pos = (boundary_targets > 0).sum().float().clamp(min=1.0)

            loss_edge = sigmoid_focal_loss_jit(logits_pred, boundary_targets, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum") / num_pos
            losses['loss_edge_p3'] = loss_edge

        return feat, losses

def build_feature_encoder(cfg, input_shape=None):
    return FeatureEncoder(cfg)

def build_feature_encoder_edge(cfg, input_shape=None):
    return FeatureEncoderEdge(cfg)

def build_kernel_head(cfg, num_gen_params):
    return KernelHead(cfg, num_gen_params)
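The strided slice in FeatureEncoderEdge's forward is the whole downsampling trick: sampling every out_stride-th pixel, offset by out_stride // 2, aligns a full-resolution boundary target with the stride-8 prediction map without allocating an interpolated copy. A small illustration (pure torch, not from the repo):

```python
import torch

# Downsample a (N, H, W) target to (N, 1, H/s, W/s) by picking the center
# pixel of each s-by-s cell, exactly as FeatureEncoderEdge does with s = 8.
out_stride = 8
targets = torch.zeros(2, 64, 64)
small = targets[:, None, out_stride // 2::out_stride, out_stride // 2::out_stride]
print(small.shape)  # torch.Size([2, 1, 8, 8])
```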
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py
ADDED
@@ -0,0 +1,123 @@
import os
import copy
import mmcv
import numpy as np
import pdb
import pycocotools.mask as mask_utils
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

prefix = "train2017"
base_path = "/data/ceph/gavinqi/data/coco"

entity_base_path = os.path.join(base_path, "entity_{}".format(prefix))
annotation_path = os.path.join(base_path, "annotations/instances_{}.json".format(prefix))
save_thing_path = os.path.join(base_path, "annotations/entity_thing_{}.json".format(prefix))
save_stuff_path = os.path.join(base_path, "annotations/entity_stuff_{}.json".format(prefix))
save_entity_path = os.path.join(base_path, "annotations/entity_{}.json".format(prefix))

## build catid to continuous id
categories_list = COCO_CATEGORIES
catid_map = {category['id']: [cid, category["isthing"], category["name"], category["supercategory"]] for cid, category in enumerate(categories_list)}
idcat_map = {}
for key, value in catid_map.items():
    idcat_map[value[0]] = [key, value[1]]

instance_annotations = mmcv.load(annotation_path)
instance_annotations_thing = copy.deepcopy(instance_annotations)
instance_annotations_stuff = copy.deepcopy(instance_annotations)

# update category
print("Updating categories...")
instance_annotations_thing["categories"] = []
instance_annotations_stuff["categories"] = []
for origin_catid, new_catid_info in catid_map.items():
    new_catid = new_catid_info[0]
    is_thing = new_catid_info[1]
    name = new_catid_info[2]
    nsuper = new_catid_info[3]
    if is_thing:
        instance_annotations_thing["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
    else:
        instance_annotations_stuff["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
print("Update category finished")

# update annotations
instance_annotations_thing["annotations"] = []
instance_annotations_stuff["annotations"] = []
npz_names = os.listdir(entity_base_path)
thing_id = 0
stuff_id = 0

for index, npz_name in enumerate(npz_names):
    entity_info = np.load(os.path.join(entity_base_path, npz_name))
    image_id = int(npz_name.split(".")[0])
    bounding_boxes = entity_info["bounding_box"]
    entity_id_map = entity_info["map"]
    entity_id_map = entity_id_map[0]
    if len(bounding_boxes)==0:
        continue
    # 0-x1, 1-y1, 2-x2, 3-y2, 4-category, 5-thing_or_stuff, 6-entity_id
    thing_mask = bounding_boxes[:,5] > 0
    stuff_mask = bounding_boxes[:,5] == 0

    # begin thing
    thing_boxes = bounding_boxes[thing_mask]
    for thing_box in thing_boxes:
        x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = thing_box
        area = (y2-y1) * (x2-x1)
        if "val" in prefix:
            mask = (entity_id_map==entity_id)
            mask = np.array(mask, order="F", dtype="uint8")
            rle = mask_utils.encode(mask)
            rle["counts"] = rle["counts"].decode("utf-8")

        anno = {"iscrowd": 0,
                "area": area,
                "image_id": image_id,
                "bbox": [x1, y1, x2-x1, y2-y1],
                "category_id": category_id,
                "id": thing_id}
        if "val" in prefix:
            anno["segmentation"] = rle

        instance_annotations_thing["annotations"].append(anno)
        thing_id = thing_id + 1

    # begin stuff
    stuff_boxes = bounding_boxes[stuff_mask]
    for stuff_box in stuff_boxes:
        x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = stuff_box
        area = (y2-y1) * (x2-x1)
        if "val" in prefix:
            mask = (entity_id_map==entity_id)
            mask = np.array(mask, order="F", dtype="uint8")
            rle = mask_utils.encode(mask)
            rle["counts"] = rle["counts"].decode("utf-8")

        anno = {"iscrowd": 0,
                "area": area,
                "image_id": image_id,
                "bbox": [x1, y1, x2-x1, y2-y1],
                "category_id": category_id,
                "id": stuff_id}
        if "val" in prefix:
            anno["segmentation"] = rle

        instance_annotations_stuff["annotations"].append(anno)
        stuff_id = stuff_id + 1

    print("{},{}".format(index, npz_name))

mmcv.dump(instance_annotations_thing, save_thing_path)
mmcv.dump(instance_annotations_stuff, save_stuff_path)

thing_info = instance_annotations_thing
stuff_info = instance_annotations_stuff

thst = thing_info
thst["categories"].extend(stuff_info["categories"])
nums = len(thst["annotations"]) + 1
for index, anno in enumerate(stuff_info["annotations"]):
    anno["id"] = index + nums
    thst["annotations"].append(anno)
mmcv.dump(thst, save_entity_path)
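The catid_map / idcat_map pair above remaps sparse COCO category ids to contiguous indices and back, carrying the isthing flag along. A toy illustration with a made-up two-entry list in the COCO_CATEGORIES format (person and banner are real COCO categories, but the list here is invented for the demo):

```python
# Toy version of the id-remapping built above.
categories_list = [
    {"id": 1,  "isthing": 1, "name": "person", "supercategory": "person"},
    {"id": 92, "isthing": 0, "name": "banner", "supercategory": "textile"},
]
catid_map = {c["id"]: [cid, c["isthing"], c["name"], c["supercategory"]]
             for cid, c in enumerate(categories_list)}
idcat_map = {v[0]: [k, v[1]] for k, v in catid_map.items()}
print(catid_map)  # {1: [0, 1, 'person', 'person'], 92: [1, 0, 'banner', 'textile']}
print(idcat_map)  # {0: [1, 1], 1: [92, 0]}
```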
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py
ADDED
@@ -0,0 +1,119 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import os, sys
import numpy as np
import pdb
import mmcv
import copy
import cv2
from collections import OrderedDict
from pycocotools.coco import COCO
import pycocotools.mask as mask_utils

import PIL.Image as Image
import matplotlib.pyplot as plt
from skimage.segmentation import find_boundaries
from panopticapi.utils import IdGenerator, rgb2id
from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

thread_num = int(sys.argv[1])
thread_idx = int(sys.argv[2])
type_ = sys.argv[3]

OFFSET = 256 * 256 * 256

GT_base_path = "/data/ceph/gavinqi/data/coco"
GT_panoptic_png_path = os.path.join(GT_base_path, "panoptic_{}".format(type_))
GT_panoptic_json_path = os.path.join(GT_base_path, "annotations/panoptic_{}.json".format(type_))
GT_instance_json_path = os.path.join(GT_base_path, "annotations/instances_{}.json".format(type_))
save_base_path = os.path.join(GT_base_path, "entity_{}".format(type_))

if not os.path.exists(save_base_path):
    os.makedirs(save_base_path)

coco_g = mmcv.load(GT_panoptic_json_path)
categories_list = COCO_CATEGORIES
catid_map = {category['id']: [cid, category["isthing"]] for cid, category in enumerate(categories_list)}
idcat_map = {}
for key, value in catid_map.items():
    idcat_map[value[0]] = [key, value[1]]

name2panopticindex = OrderedDict()
id2name = OrderedDict()

for i_index, image_info in enumerate(coco_g["images"]):
    file_name = image_info["file_name"].split(".")[0]
    name2panopticindex[file_name] = {"i_index": i_index}
    id2name[image_info["id"]] = file_name

for a_index, ann in enumerate(coco_g["annotations"]):
    file_name = id2name[ann["image_id"]]
    name2panopticindex[file_name]["a_index"] = a_index
print("build name to panoptic index finished")

# imgs and instance_anns
instances_api = COCO(GT_instance_json_path)
img_ids = instances_api.getImgIds()
imgs = instances_api.loadImgs(img_ids)
instance_anns = [instances_api.imgToAnns[img_id] for img_id in img_ids]
assert len(name2panopticindex.keys()) == len(imgs)
imgs_instancesanns = list(zip(imgs, instance_anns))
print("build imgs and instance_anns finished")

for img_index, (img_dict, ann_dict_list) in enumerate(imgs_instancesanns):
    if img_index % thread_num != thread_idx:
        continue

    file_name = img_dict["file_name"].split(".")[0]
    image_h, image_w = img_dict["height"], img_dict["width"]

    ## panoptic mask from panoptic annotation
    panoptic_i_index, panoptic_a_index = name2panopticindex[file_name]["i_index"], name2panopticindex[file_name]["a_index"]
    panoptic_img_infos = coco_g["images"][panoptic_i_index]
    panoptic_ann_infos = coco_g["annotations"][panoptic_a_index]
    assert panoptic_img_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_img_infos"
    assert panoptic_ann_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_ann_infos"

    panoptic = np.array(Image.open(os.path.join(GT_panoptic_png_path, file_name+".png")), dtype=np.uint8)
    panoptic_id = rgb2id(panoptic)
    panoptic_entity_id = np.zeros(panoptic_id.shape, dtype=np.uint8)
    panoptic_class_id = np.zeros(panoptic_id.shape, dtype=np.uint8) + 255
    unique_panoptic_id = np.unique(panoptic_id)

    for ii, segment_info in enumerate(panoptic_ann_infos["segments_info"]):
        if segment_info["iscrowd"] == 1:
            continue
        old_entity_id = segment_info["id"]
        new_entity_id = ii + 1
        category = segment_info["category_id"]
        panoptic_entity_id[panoptic_id==old_entity_id] = new_entity_id
        panoptic_class_id[panoptic_id==old_entity_id] = catid_map[category][0]

    unique_ids = np.unique(panoptic_entity_id)
    count = 1

    bounding_box = []
    for entity_id in unique_ids:
        if entity_id == 0:
            continue
        mask = (panoptic_entity_id==entity_id).astype(np.uint8)
        category = int(np.unique(panoptic_class_id[panoptic_entity_id==entity_id]))

        finds_y, finds_x = np.where(mask==1)
        y1 = int(np.min(finds_y))
        y2 = int(np.max(finds_y))
        x1 = int(np.min(finds_x))
        x2 = int(np.max(finds_x))
        thing_or_stuff = int(idcat_map[category][1])
        bounding_box.append([x1, y1, x2, y2, category, thing_or_stuff, entity_id])

    bounding_box = np.array(bounding_box)

    panoptic_info = np.stack((panoptic_entity_id, panoptic_class_id), axis=0)
    np.savez(os.path.join(save_base_path, file_name), map=panoptic_info, bounding_box=bounding_box)

    print("{}, {}, {}".format(thread_idx, img_index, file_name))
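The panoptic PNG stores segment ids across the three color channels; rgb2id folds them back into one integer per pixel as id = R + 256 * G + 256^2 * B, which is also why the script defines OFFSET = 256**3. A quick check, assuming panopticapi is installed:

```python
import numpy as np
from panopticapi.utils import rgb2id

# One RGB pixel with R=10, G=2, B=0 folds to 10 + 2*256 = 522.
px = np.array([[[10, 2, 0]]], dtype=np.uint8)
print(rgb2id(px))  # [[522]]
```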
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh
ADDED
@@ -0,0 +1,8 @@
#!/usr/bin/bash
thread_num=8
for((i=0;i<${thread_num};i++));do
{
    python3 make_entity_mask.py ${thread_num} ${i} train2017
}&
done
wait
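The script fans out thread_num background workers, one per shard index, and waits for all of them; each worker processes only the images where img_index % thread_num == thread_idx. A roughly equivalent Python driver (a sketch, not part of the repo) would be:

```python
import subprocess

# Launch one shard worker per index, mirroring make_entity_mask.sh,
# then block until every worker has exited.
thread_num = 8
procs = [subprocess.Popen(["python3", "make_entity_mask.py",
                           str(thread_num), str(i), "train2017"])
         for i in range(thread_num)]
for p in procs:
    p.wait()
```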
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile
ADDED
@@ -0,0 +1,9 @@
all:
	# install pycocotools locally
	python setup.py build_ext --inplace
	rm -rf build

install:
	# install pycocotools to the Python site-packages
	python setup.py build_ext install
	rm -rf build
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py
ADDED
@@ -0,0 +1 @@
__author__ = 'tylin'
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c
ADDED
The diff for this file is too large to render. See raw diff.
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx
ADDED
@@ -0,0 +1,308 @@
# distutils: language = c
# distutils: sources = ../common/maskApi.c

#**************************************************************************
# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at:  http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]
#**************************************************************************

__author__ = 'tsungyi'

import sys
PYTHON_VERSION = sys.version_info[0]

# import both Python-level and C-level symbols of Numpy
# the API uses Numpy to interface C and Python
import numpy as np
cimport numpy as np
from libc.stdlib cimport malloc, free

# initialize Numpy. must do.
np.import_array()

# import numpy C function
# we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible for memory management
cdef extern from "numpy/arrayobject.h":
    void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)

# Declare the prototype of the C functions in MaskApi.h
cdef extern from "maskApi.h":
    ctypedef unsigned int uint
    ctypedef unsigned long siz
    ctypedef unsigned char byte
    ctypedef double* BB
    ctypedef struct RLE:
        siz h,
        siz w,
        siz m,
        uint* cnts,
    void rlesInit( RLE **R, siz n )
    void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n )
    void rleDecode( const RLE *R, byte *mask, siz n )
    void rleMerge( const RLE *R, RLE *M, siz n, int intersect )
    void rleArea( const RLE *R, siz n, uint *a )
    void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o )
    void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o )
    void rleToBbox( const RLE *R, BB bb, siz n )
    void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n )
    void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w )
    char* rleToString( const RLE *R )
    void rleFrString( RLE *R, char *s, siz h, siz w )

# python class to wrap RLE array in C
# the class handles the memory allocation and deallocation
cdef class RLEs:
    cdef RLE *_R
    cdef siz _n

    def __cinit__(self, siz n =0):
        rlesInit(&self._R, n)
        self._n = n

    # free the RLE array here
    def __dealloc__(self):
        if self._R is not NULL:
            for i in range(self._n):
                free(self._R[i].cnts)
            free(self._R)
    def __getattr__(self, key):
        if key == 'n':
            return self._n
        raise AttributeError(key)

# python class to wrap Mask array in C
# the class handles the memory allocation and deallocation
cdef class Masks:
    cdef byte *_mask
    cdef siz _h
    cdef siz _w
    cdef siz _n

    def __cinit__(self, h, w, n):
        self._mask = <byte*> malloc(h*w*n* sizeof(byte))
        self._h = h
        self._w = w
        self._n = n
    # def __dealloc__(self):
        # the memory management of _mask has been passed to np.ndarray
        # it doesn't need to be freed here

    # called when passing into np.array() and return an np.ndarray in column-major order
    def __array__(self):
        cdef np.npy_intp shape[1]
        shape[0] = <np.npy_intp> self._h*self._w*self._n
        # Create a 1D array, and reshape it to fortran/Matlab column-major array
        ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F')
        # The _mask allocated by Masks is now handled by ndarray
        PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA)
        return ndarray

# internal conversion from Python RLEs object to compressed RLE format
def _toString(RLEs Rs):
    cdef siz n = Rs.n
    cdef bytes py_string
    cdef char* c_string
    objs = []
    for i in range(n):
        c_string = rleToString( <RLE*> &Rs._R[i] )
        py_string = c_string
        objs.append({
            'size': [Rs._R[i].h, Rs._R[i].w],
            'counts': py_string
        })
        free(c_string)
    return objs

# internal conversion from compressed RLE format to Python RLEs object
def _frString(rleObjs):
    cdef siz n = len(rleObjs)
    Rs = RLEs(n)
    cdef bytes py_string
    cdef char* c_string
    for i, obj in enumerate(rleObjs):
        if PYTHON_VERSION == 2:
            py_string = str(obj['counts']).encode('utf8')
        elif PYTHON_VERSION == 3:
            py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts']
        else:
            raise Exception('Python version must be 2 or 3')
        c_string = py_string
        rleFrString( <RLE*> &Rs._R[i], <char*> c_string, obj['size'][0], obj['size'][1] )
    return Rs

# encode mask to RLEs objects
# list of RLE string can be generated by RLEs member function
def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask):
    h, w, n = mask.shape[0], mask.shape[1], mask.shape[2]
    cdef RLEs Rs = RLEs(n)
    rleEncode(Rs._R,<byte*>mask.data,h,w,n)
    objs = _toString(Rs)
    return objs

# decode mask from compressed list of RLE string or RLEs object
def decode(rleObjs):
    cdef RLEs Rs = _frString(rleObjs)
    h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n
    masks = Masks(h, w, n)
    rleDecode(<RLE*>Rs._R, masks._mask, n)
    return np.array(masks)

def merge(rleObjs, intersect=0):
    cdef RLEs Rs = _frString(rleObjs)
    cdef RLEs R = RLEs(1)
    rleMerge(<RLE*>Rs._R, <RLE*> R._R, <siz> Rs._n, intersect)
    obj = _toString(R)[0]
    return obj

def area(rleObjs):
    cdef RLEs Rs = _frString(rleObjs)
    cdef uint* _a = <uint*> malloc(Rs._n* sizeof(uint))
    rleArea(Rs._R, Rs._n, _a)
    cdef np.npy_intp shape[1]
    shape[0] = <np.npy_intp> Rs._n
    a = np.array((Rs._n, ), dtype=np.uint8)
    a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a)
    PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA)
    return a

# iou computation. support function overload (RLEs-RLEs and bbox-bbox).
def iou( dt, gt, pyiscrowd ):
    def _preproc(objs):
        if len(objs) == 0:
            return objs
        if type(objs) == np.ndarray:
            if len(objs.shape) == 1:
                objs = objs.reshape((objs[0], 1))
            # check if it's Nx4 bbox
            if not len(objs.shape) == 2 or not objs.shape[1] == 4:
                raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension')
            objs = objs.astype(np.double)
        elif type(objs) == list:
            # check if list is in box format and convert it to np.ndarray
            isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs]))
            isrle = np.all(np.array([type(obj) == dict for obj in objs]))
            if isbox:
                objs = np.array(objs, dtype=np.double)
                if len(objs.shape) == 1:
                    objs = objs.reshape((1,objs.shape[0]))
            elif isrle:
                objs = _frString(objs)
            else:
                raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])')
        else:
            raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.')
        return objs
    def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
        rleIou( <RLE*> dt._R, <RLE*> gt._R, m, n, <byte*> iscrowd.data, <double*> _iou.data )
    def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
        bbIou( <BB> dt.data, <BB> gt.data, m, n, <byte*> iscrowd.data, <double*>_iou.data )
    def _len(obj):
        cdef siz N = 0
        if type(obj) == RLEs:
            N = obj.n
        elif len(obj)==0:
            pass
        elif type(obj) == np.ndarray:
            N = obj.shape[0]
        return N
    # convert iscrowd to numpy array
    cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8)
    # simple type checking
    cdef siz m, n
    dt = _preproc(dt)
    gt = _preproc(gt)
    m = _len(dt)
    n = _len(gt)
    if m == 0 or n == 0:
        return []
    if not type(dt) == type(gt):
        raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray')

    # define local variables
    cdef double* _iou = <double*> 0
    cdef np.npy_intp shape[1]
    # check type and assign iou function
    if type(dt) == RLEs:
        _iouFun = _rleIou
    elif type(dt) == np.ndarray:
        _iouFun = _bbIou
    else:
        raise Exception('input data type not allowed.')
    _iou = <double*> malloc(m*n* sizeof(double))
    iou = np.zeros((m*n, ), dtype=np.double)
    shape[0] = <np.npy_intp> m*n
    iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou)
    PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA)
    _iouFun(dt, gt, iscrowd, m, n, iou)
    return iou.reshape((m,n), order='F')

def toBbox( rleObjs ):
    cdef RLEs Rs = _frString(rleObjs)
    cdef siz n = Rs.n
    cdef BB _bb = <BB> malloc(4*n* sizeof(double))
    rleToBbox( <const RLE*> Rs._R, _bb, n )
    cdef np.npy_intp shape[1]
    shape[0] = <np.npy_intp> 4*n
    bb = np.array((1,4*n), dtype=np.double)
    bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4))
    PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA)
    return bb

def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ):
    cdef siz n = bb.shape[0]
    Rs = RLEs(n)
    rleFrBbox( <RLE*> Rs._R, <const BB> bb.data, h, w, n )
    objs = _toString(Rs)
    return objs

def frPoly( poly, siz h, siz w ):
    cdef np.ndarray[np.double_t, ndim=1] np_poly
    n = len(poly)
    Rs = RLEs(n)
    for i, p in enumerate(poly):
        np_poly = np.array(p, dtype=np.double, order='F')
        rleFrPoly( <RLE*>&Rs._R[i], <const double*> np_poly.data, int(len(p)/2), h, w )
    objs = _toString(Rs)
    return objs

def frUncompressedRLE(ucRles, siz h, siz w):
    cdef np.ndarray[np.uint32_t, ndim=1] cnts
    cdef RLE R
    cdef uint *data
    n = len(ucRles)
    objs = []
    for i in range(n):
        Rs = RLEs(1)
        cnts = np.array(ucRles[i]['counts'], dtype=np.uint32)
        # time for malloc can be saved here but it's fine
        data = <uint*> malloc(len(cnts)* sizeof(uint))
        for j in range(len(cnts)):
            data[j] = <uint> cnts[j]
        R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), <uint*> data)
        Rs._R[0] = R
        objs.append(_toString(Rs)[0])
    return objs

def frPyObjects(pyobj, h, w):
    # encode rle from a list of python objects
    if type(pyobj) == np.ndarray:
        objs = frBbox(pyobj, h, w)
    elif type(pyobj) == list and len(pyobj[0]) == 4:
        objs = frBbox(pyobj, h, w)
    elif type(pyobj) == list and len(pyobj[0]) > 4:
        objs = frPoly(pyobj, h, w)
    elif type(pyobj) == list and type(pyobj[0]) == dict \
            and 'counts' in pyobj[0] and 'size' in pyobj[0]:
        objs = frUncompressedRLE(pyobj, h, w)
    # encode rle from single python object
    elif type(pyobj) == list and len(pyobj) == 4:
        objs = frBbox([pyobj], h, w)[0]
    elif type(pyobj) == list and len(pyobj) > 4:
        objs = frPoly([pyobj], h, w)[0]
    elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj:
        objs = frUncompressedRLE([pyobj], h, w)[0]
    else:
        raise Exception('input type is not supported.')
    return objs
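The Python-facing entry points above are wrapped by pycocotools.mask. A short round trip through that API (this assumes a built pycocotools is on the path; note encode() requires a Fortran-ordered uint8 array of shape (h, w, n)):

```python
import numpy as np
import pycocotools.mask as mask_utils

# Encode a 2x2 square inside a 4x4 canvas, then query area/bbox and decode.
m = np.zeros((4, 4, 1), dtype=np.uint8, order='F')
m[1:3, 1:3, 0] = 1
rles = mask_utils.encode(m)
print(mask_utils.area(rles))    # [4]
print(mask_utils.toBbox(rles))  # [[1. 1. 2. 2.]]  (x, y, w, h)
back = mask_utils.decode(rles)
assert (back == m).all()
```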
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__author__ = 'tylin'
|
| 2 |
+
__version__ = '2.0'
|
| 3 |
+
# Interface for accessing the Microsoft COCO dataset.
|
| 4 |
+
|
| 5 |
+
# Microsoft COCO is a large image dataset designed for object detection,
|
| 6 |
+
# segmentation, and caption generation. pycocotools is a Python API that
|
| 7 |
+
# assists in loading, parsing and visualizing the annotations in COCO.
|
| 8 |
+
# Please visit http://mscoco.org/ for more information on COCO, including
|
| 9 |
+
# for the data, paper, and tutorials. The exact format of the annotations
|
| 10 |
+
# is also described on the COCO website. For example usage of the pycocotools
|
| 11 |
+
# please see pycocotools_demo.ipynb. In addition to this API, please download both
|
| 12 |
+
# the COCO images and annotations in order to run the demo.
|
| 13 |
+
|
| 14 |
+
# An alternative to using the API is to load the annotations directly
|
| 15 |
+
# into Python dictionary
|
| 16 |
+
# Using the API provides additional utility functions. Note that this API
|
| 17 |
+
# supports both *instance* and *caption* annotations. In the case of
|
| 18 |
+
# captions not all functions are defined (e.g. categories are undefined).
|
| 19 |
+
|
| 20 |
+
# The following API functions are defined:
|
| 21 |
+
# COCO - COCO api class that loads COCO annotation file and prepare data structures.
|
| 22 |
+
# decodeMask - Decode binary mask M encoded via run-length encoding.
|
| 23 |
+
# encodeMask - Encode binary mask M using run-length encoding.
|
| 24 |
+
# getAnnIds - Get ann ids that satisfy given filter conditions.
|
| 25 |
+
# getCatIds - Get cat ids that satisfy given filter conditions.
|
| 26 |
+
# getImgIds - Get img ids that satisfy given filter conditions.
|
| 27 |
+
# loadAnns - Load anns with the specified ids.
|
| 28 |
+
# loadCats - Load cats with the specified ids.
|
| 29 |
+
# loadImgs - Load imgs with the specified ids.
|
| 30 |
+
# annToMask - Convert segmentation in an annotation to binary mask.
|
| 31 |
+
# showAnns - Display the specified annotations.
|
| 32 |
+
# loadRes - Load algorithm results and create API for accessing them.
|
| 33 |
+
# download - Download COCO images from mscoco.org server.
|
| 34 |
+
# Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
|
| 35 |
+
# Help on each functions can be accessed by: "help COCO>function".
|
| 36 |
+
|
| 37 |
+
# See also COCO>decodeMask,
|
| 38 |
+
# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
|
| 39 |
+
# COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
|
| 40 |
+
# COCO>loadImgs, COCO>annToMask, COCO>showAnns
|
| 41 |
+
|
| 42 |
+
# Microsoft COCO Toolbox. version 2.0
|
| 43 |
+
# Data, paper, and tutorials available at: http://mscoco.org/
|
| 44 |
+
# Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
|
| 45 |
+
# Licensed under the Simplified BSD License [see bsd.txt]
|
| 46 |
+
|
| 47 |
+
import json
|
| 48 |
+
import time
|
| 49 |
+
import matplotlib.pyplot as plt
|
| 50 |
+
from matplotlib.collections import PatchCollection
|
| 51 |
+
from matplotlib.patches import Polygon
|
| 52 |
+
import numpy as np
|
| 53 |
+
import copy
|
| 54 |
+
import itertools
|
| 55 |
+
from . import mask as maskUtils
|
| 56 |
+
import os
|
| 57 |
+
from collections import defaultdict
|
| 58 |
+
import sys
|
| 59 |
+
PYTHON_VERSION = sys.version_info[0]
|
| 60 |
+
if PYTHON_VERSION == 2:
|
| 61 |
+
from urllib import urlretrieve
|
| 62 |
+
elif PYTHON_VERSION == 3:
|
| 63 |
+
from urllib.request import urlretrieve
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _isArrayLike(obj):
|
| 67 |
+
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class COCO:
|
| 71 |
+
def __init__(self, annotation_file=None, class_agnostic=False):
|
| 72 |
+
"""
|
| 73 |
+
Constructor of Microsoft COCO helper class for reading and visualizing annotations.
|
| 74 |
+
:param annotation_file (str): location of annotation file
|
| 75 |
+
:param image_folder (str): location to the folder that hosts images.
|
| 76 |
+
:return:
|
| 77 |
+
"""
|
| 78 |
+
# load dataset
|
| 79 |
+
self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
|
| 80 |
+
self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
|
| 81 |
+
if not annotation_file == None:
|
| 82 |
+
print('loading annotations into memory...')
|
| 83 |
+
tic = time.time()
|
| 84 |
+
dataset = json.load(open(annotation_file, 'r'))
|
| 85 |
+
assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
|
| 86 |
+
print('Done (t={:0.2f}s)'.format(time.time()- tic))
|
| 87 |
+
self.dataset = dataset
|
| 88 |
+
if class_agnostic:
|
| 89 |
+
self.dataset = self.to_agnostic(dataset)
|
| 90 |
+
else:
|
| 91 |
+
self.dataset = dataset
|
| 92 |
+
self.createIndex()
|
| 93 |
+
|
| 94 |
+
def to_agnostic(self,dataset):
|
| 95 |
+
# dataset["categories"] = ["supercategory": "thing", "id":1, "name": "thing"]
|
| 96 |
+
dataset["categories"] = [{"supercategory": "thing", "id":1, "name": "thing"}]
|
| 97 |
+
nums = len(dataset["annotations"])
|
| 98 |
+
for ii in range(nums):
|
| 99 |
+
dataset["annotations"][ii]["category_id"] = 1
|
| 100 |
+
return dataset
|
| 101 |
+
|
| 102 |
+
def createIndex(self):
|
| 103 |
+
# create index
|
| 104 |
+
print('creating index...')
|
| 105 |
+
anns, cats, imgs = {}, {}, {}
|
| 106 |
+
imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
|
| 107 |
+
if 'annotations' in self.dataset:
|
| 108 |
+
for ann in self.dataset['annotations']:
|
| 109 |
+
imgToAnns[ann['image_id']].append(ann)
|
| 110 |
+
anns[ann['id']] = ann
|
| 111 |
+
|
| 112 |
+
if 'images' in self.dataset:
|
| 113 |
+
for img in self.dataset['images']:
|
| 114 |
+
imgs[img['id']] = img
|
| 115 |
+
|
| 116 |
+
if 'categories' in self.dataset:
|
| 117 |
+
for cat in self.dataset['categories']:
|
| 118 |
+
cats[cat['id']] = cat
|
| 119 |
+
|
| 120 |
+
if 'annotations' in self.dataset and 'categories' in self.dataset:
|
| 121 |
+
for ann in self.dataset['annotations']:
|
| 122 |
+
catToImgs[ann['category_id']].append(ann['image_id'])
|
| 123 |
+
|
| 124 |
+
print('index created!')
|
| 125 |
+
|
| 126 |
+
# create class members
|
| 127 |
+
self.anns = anns
|
| 128 |
+
self.imgToAnns = imgToAnns
|
| 129 |
+
self.catToImgs = catToImgs
|
| 130 |
+
self.imgs = imgs
|
| 131 |
+
self.cats = cats
|
| 132 |
+
|
| 133 |
+
def info(self):
|
| 134 |
+
"""
|
| 135 |
+
Print information about the annotation file.
|
| 136 |
+
:return:
|
| 137 |
+
"""
|
| 138 |
+
for key, value in self.dataset['info'].items():
|
| 139 |
+
print('{}: {}'.format(key, value))
|
| 140 |
+
|
| 141 |
+
def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
|
| 142 |
+
"""
|
| 143 |
+
Get ann ids that satisfy given filter conditions. default skips that filter
|
| 144 |
+
:param imgIds (int array) : get anns for given imgs
|
| 145 |
+
catIds (int array) : get anns for given cats
|
| 146 |
+
areaRng (float array) : get anns for given area range (e.g. [0 inf])
|
| 147 |
+
iscrowd (boolean) : get anns for given crowd label (False or True)
|
| 148 |
+
:return: ids (int array) : integer array of ann ids
|
| 149 |
+
"""
|
| 150 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
| 151 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 152 |
+
|
| 153 |
+
if len(imgIds) == len(catIds) == len(areaRng) == 0:
|
| 154 |
+
anns = self.dataset['annotations']
|
| 155 |
+
else:
|
| 156 |
+
if not len(imgIds) == 0:
|
| 157 |
+
lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
|
| 158 |
+
anns = list(itertools.chain.from_iterable(lists))
|
| 159 |
+
else:
|
| 160 |
+
anns = self.dataset['annotations']
|
| 161 |
+
anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
|
| 162 |
+
anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
|
| 163 |
+
if not iscrowd == None:
|
| 164 |
+
ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
|
| 165 |
+
else:
|
| 166 |
+
ids = [ann['id'] for ann in anns]
|
| 167 |
+
return ids
|
| 168 |
+
|
| 169 |
+
def getCatIds(self, catNms=[], supNms=[], catIds=[]):
|
| 170 |
+
"""
|
| 171 |
+
filtering parameters. default skips that filter.
|
| 172 |
+
:param catNms (str array) : get cats for given cat names
|
| 173 |
+
:param supNms (str array) : get cats for given supercategory names
|
| 174 |
+
:param catIds (int array) : get cats for given cat ids
|
| 175 |
+
:return: ids (int array) : integer array of cat ids
|
| 176 |
+
"""
|
| 177 |
+
catNms = catNms if _isArrayLike(catNms) else [catNms]
|
| 178 |
+
supNms = supNms if _isArrayLike(supNms) else [supNms]
|
| 179 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 180 |
+
|
| 181 |
+
if len(catNms) == len(supNms) == len(catIds) == 0:
|
| 182 |
+
cats = self.dataset['categories']
|
| 183 |
+
else:
|
| 184 |
+
cats = self.dataset['categories']
|
| 185 |
+
cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
|
| 186 |
+
cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
|
| 187 |
+
cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
|
| 188 |
+
ids = [cat['id'] for cat in cats]
|
| 189 |
+
return ids
|
| 190 |
+
|
| 191 |
+
def getImgIds(self, imgIds=[], catIds=[]):
|
| 192 |
+
'''
|
| 193 |
+
Get img ids that satisfy given filter conditions.
|
| 194 |
+
:param imgIds (int array) : get imgs for given ids
|
| 195 |
+
:param catIds (int array) : get imgs with all given cats
|
| 196 |
+
:return: ids (int array) : integer array of img ids
|
| 197 |
+
'''
|
| 198 |
+
imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
|
| 199 |
+
catIds = catIds if _isArrayLike(catIds) else [catIds]
|
| 200 |
+
|
| 201 |
+
if len(imgIds) == len(catIds) == 0:
|
| 202 |
+
ids = self.imgs.keys()
|
| 203 |
+
else:
|
| 204 |
+
ids = set(imgIds)
|
| 205 |
+
for i, catId in enumerate(catIds):
|
| 206 |
+
if i == 0 and len(ids) == 0:
|
| 207 |
+
ids = set(self.catToImgs[catId])
|
| 208 |
+
else:
|
| 209 |
+
ids &= set(self.catToImgs[catId])
|
| 210 |
+
return list(ids)
|
| 211 |
+
|
| 212 |
+
def loadAnns(self, ids=[]):
|
| 213 |
+
"""
|
| 214 |
+
Load anns with the specified ids.
|
| 215 |
+
:param ids (int array) : integer ids specifying anns
|
| 216 |
+
:return: anns (object array) : loaded ann objects
|
| 217 |
+
"""
|
| 218 |
+
if _isArrayLike(ids):
|
| 219 |
+
return [self.anns[id] for id in ids]
|
| 220 |
+
elif type(ids) == int:
|
| 221 |
+
return [self.anns[ids]]
|
| 222 |
+
|
| 223 |
+
def loadCats(self, ids=[]):
|
| 224 |
+
"""
|
| 225 |
+
Load cats with the specified ids.
|
| 226 |
+
:param ids (int array) : integer ids specifying cats
|
| 227 |
+
:return: cats (object array) : loaded cat objects
|
| 228 |
+
"""
|
| 229 |
+
if _isArrayLike(ids):
|
| 230 |
+
return [self.cats[id] for id in ids]
|
| 231 |
+
elif type(ids) == int:
|
| 232 |
+
return [self.cats[ids]]
|
| 233 |
+
|
| 234 |
+
def loadImgs(self, ids=[]):
|
| 235 |
+
"""
|
| 236 |
+
Load anns with the specified ids.
|
| 237 |
+
:param ids (int array) : integer ids specifying img
|
| 238 |
+
:return: imgs (object array) : loaded img objects
|
| 239 |
+
"""
|
| 240 |
+
if _isArrayLike(ids):
|
| 241 |
+
return [self.imgs[id] for id in ids]
|
| 242 |
+
elif type(ids) == int:
|
| 243 |
+
return [self.imgs[ids]]
|
| 244 |
+
|
| 245 |
+
def showAnns(self, anns, draw_bbox=False):
|
| 246 |
+
"""
|
| 247 |
+
Display the specified annotations.
|
| 248 |
+
:param anns (array of object): annotations to display
|
| 249 |
+
:return: None
|
| 250 |
+
"""
|
| 251 |
+
if len(anns) == 0:
|
| 252 |
+
return 0
|
| 253 |
+
if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
|
| 254 |
+
datasetType = 'instances'
|
| 255 |
+
elif 'caption' in anns[0]:
|
| 256 |
+
datasetType = 'captions'
|
| 257 |
+
else:
|
| 258 |
+
raise Exception('datasetType not supported')
|
| 259 |
+
if datasetType == 'instances':
|
| 260 |
+
ax = plt.gca()
|
| 261 |
+
ax.set_autoscale_on(False)
|
| 262 |
+
polygons = []
|
| 263 |
+
color = []
|
| 264 |
+
for ann in anns:
|
| 265 |
+
c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
|
| 266 |
+
if 'segmentation' in ann:
|
| 267 |
+
if type(ann['segmentation']) == list:
|
| 268 |
+
# polygon
|
| 269 |
+
for seg in ann['segmentation']:
|
| 270 |
+
poly = np.array(seg).reshape((int(len(seg)/2), 2))
|
| 271 |
+
polygons.append(Polygon(poly))
|
| 272 |
+
color.append(c)
|
| 273 |
+
else:
|
| 274 |
+
# mask
|
| 275 |
+
t = self.imgs[ann['image_id']]
|
| 276 |
+
if type(ann['segmentation']['counts']) == list:
|
| 277 |
+
rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
|
| 278 |
+
else:
|
| 279 |
+
rle = [ann['segmentation']]
|
| 280 |
+
m = maskUtils.decode(rle)
|
| 281 |
+
img = np.ones( (m.shape[0], m.shape[1], 3) )
|
| 282 |
+
if ann['iscrowd'] == 1:
|
| 283 |
+
color_mask = np.array([2.0,166.0,101.0])/255
|
| 284 |
+
if ann['iscrowd'] == 0:
|
| 285 |
+
color_mask = np.random.random((1, 3)).tolist()[0]
|
| 286 |
+
for i in range(3):
|
| 287 |
+
img[:,:,i] = color_mask[i]
|
| 288 |
+
ax.imshow(np.dstack( (img, m*0.5) ))
|
| 289 |
+
if 'keypoints' in ann and type(ann['keypoints']) == list:
|
| 290 |
+
# turn skeleton into zero-based index
|
| 291 |
+
sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
|
| 292 |
+
kp = np.array(ann['keypoints'])
|
| 293 |
+
x = kp[0::3]
|
| 294 |
+
y = kp[1::3]
|
| 295 |
+
v = kp[2::3]
|
| 296 |
+
for sk in sks:
|
| 297 |
+
if np.all(v[sk]>0):
|
| 298 |
+
plt.plot(x[sk],y[sk], linewidth=3, color=c)
|
| 299 |
+
plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
|
| 300 |
+
plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
|
| 301 |
+
|
| 302 |
+
if draw_bbox:
|
| 303 |
+
[bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
|
| 304 |
+
poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
|
| 305 |
+
np_poly = np.array(poly).reshape((4,2))
|
| 306 |
+
polygons.append(Polygon(np_poly))
|
| 307 |
+
color.append(c)
|
| 308 |
+
|
| 309 |
+
p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
|
| 310 |
+
ax.add_collection(p)
|
| 311 |
+
p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
|
| 312 |
+
ax.add_collection(p)
|
| 313 |
+
elif datasetType == 'captions':
|
| 314 |
+
for ann in anns:
|
| 315 |
+
print(ann['caption'])
|
| 316 |
+
|
| 317 |
+
def loadRes(self, resFile):
|
| 318 |
+
"""
|
| 319 |
+
Load result file and return a result api object.
|
| 320 |
+
:param resFile (str) : file name of result file
|
| 321 |
+
:return: res (obj) : result api object
|
| 322 |
+
"""
|
| 323 |
+
res = COCO()
|
| 324 |
+
res.dataset['images'] = [img for img in self.dataset['images']]
|
| 325 |
+
|
| 326 |
+
print('Loading and preparing results...')
|
| 327 |
+
tic = time.time()
|
| 328 |
+
if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
|
| 329 |
+
anns = json.load(open(resFile))
|
| 330 |
+
elif type(resFile) == np.ndarray:
|
| 331 |
+
anns = self.loadNumpyAnnotations(resFile)
|
| 332 |
+
else:
|
| 333 |
+
anns = resFile
|
| 334 |
+
assert type(anns) == list, 'results in not an array of objects'
|
| 335 |
+
annsImgIds = [ann['image_id'] for ann in anns]
|
| 336 |
+
assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
|
| 337 |
+
'Results do not correspond to current coco set'
|
| 338 |
+
if 'caption' in anns[0]:
|
| 339 |
+
imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
|
| 340 |
+
res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
|
| 341 |
+
for id, ann in enumerate(anns):
|
| 342 |
+
ann['id'] = id+1
|
| 343 |
+
elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
|
| 344 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 345 |
+
for id, ann in enumerate(anns):
|
| 346 |
+
bb = ann['bbox']
|
| 347 |
+
x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
|
| 348 |
+
if not 'segmentation' in ann:
|
| 349 |
+
ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
|
| 350 |
+
ann['area'] = bb[2]*bb[3]
|
| 351 |
+
ann['id'] = id+1
|
| 352 |
+
ann['iscrowd'] = 0
|
| 353 |
+
elif 'segmentation' in anns[0]:
|
| 354 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 355 |
+
for id, ann in enumerate(anns):
|
| 356 |
+
# now only support compressed RLE format as segmentation results
|
| 357 |
+
ann['area'] = maskUtils.area(ann['segmentation'])
|
| 358 |
+
if not 'bbox' in ann:
|
| 359 |
+
ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
|
| 360 |
+
ann['id'] = id+1
|
| 361 |
+
ann['iscrowd'] = 0
|
| 362 |
+
elif 'keypoints' in anns[0]:
|
| 363 |
+
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
|
| 364 |
+
for id, ann in enumerate(anns):
|
| 365 |
+
s = ann['keypoints']
|
| 366 |
+
x = s[0::3]
|
| 367 |
+
y = s[1::3]
|
| 368 |
+
x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
|
| 369 |
+
ann['area'] = (x1-x0)*(y1-y0)
|
| 370 |
+
ann['id'] = id + 1
|
| 371 |
+
ann['bbox'] = [x0,y0,x1-x0,y1-y0]
|
| 372 |
+
print('DONE (t={:0.2f}s)'.format(time.time()- tic))
|
| 373 |
+
|
| 374 |
+
res.dataset['annotations'] = anns
|
| 375 |
+
res.createIndex()
|
| 376 |
+
return res
|
| 377 |
+
|
| 378 |
+
def download(self, tarDir = None, imgIds = [] ):
|
| 379 |
+
'''
|
| 380 |
+
Download COCO images from mscoco.org server.
|
| 381 |
+
:param tarDir (str): COCO results directory name
|
| 382 |
+
imgIds (list): images to be downloaded
|
| 383 |
+
:return:
|
| 384 |
+
'''
|
| 385 |
+
if tarDir is None:
|
| 386 |
+
print('Please specify target directory')
|
| 387 |
+
return -1
|
| 388 |
+
if len(imgIds) == 0:
|
| 389 |
+
imgs = self.imgs.values()
|
| 390 |
+
else:
|
| 391 |
+
imgs = self.loadImgs(imgIds)
|
| 392 |
+
N = len(imgs)
|
| 393 |
+
if not os.path.exists(tarDir):
|
| 394 |
+
os.makedirs(tarDir)
|
| 395 |
+
for i, img in enumerate(imgs):
|
| 396 |
+
tic = time.time()
|
| 397 |
+
fname = os.path.join(tarDir, img['file_name'])
|
| 398 |
+
if not os.path.exists(fname):
|
| 399 |
+
urlretrieve(img['coco_url'], fname)
|
| 400 |
+
print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
|
| 401 |
+
|
| 402 |
+
def loadNumpyAnnotations(self, data):
|
| 403 |
+
"""
|
| 404 |
+
Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
|
| 405 |
+
:param data (numpy.ndarray)
|
| 406 |
+
:return: annotations (python nested list)
|
| 407 |
+
"""
|
| 408 |
+
print('Converting ndarray to lists...')
|
| 409 |
+
assert(type(data) == np.ndarray)
|
| 410 |
+
print(data.shape)
|
| 411 |
+
assert(data.shape[1] == 7)
|
| 412 |
+
N = data.shape[0]
|
| 413 |
+
ann = []
|
| 414 |
+
for i in range(N):
|
| 415 |
+
if i % 1000000 == 0:
|
| 416 |
+
print('{}/{}'.format(i,N))
|
| 417 |
+
ann += [{
|
| 418 |
+
'image_id' : int(data[i, 0]),
|
| 419 |
+
'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
|
| 420 |
+
'score' : data[i, 5],
|
| 421 |
+
'category_id': int(data[i, 6]),
|
| 422 |
+
}]
|
| 423 |
+
return ann
|
| 424 |
+
|
| 425 |
+
    def annToRLE(self, ann):
        """
        Convert annotation which can be polygons, uncompressed RLE to RLE.
        :return: RLE (compressed run-length encoding)
        """
        t = self.imgs[ann['image_id']]
        h, w = t['height'], t['width']
        segm = ann['segmentation']
        if type(segm) == list:
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(segm, h, w)
            rle = maskUtils.merge(rles)
        elif type(segm['counts']) == list:
            # uncompressed RLE
            rle = maskUtils.frPyObjects(segm, h, w)
        else:
            # rle
            rle = ann['segmentation']
        return rle
    def annToMask(self, ann):
        """
        Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
        :return: binary mask (numpy 2D array)
        """
        rle = self.annToRLE(ann)
        m = maskUtils.decode(rle)
        return m
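
annToRLE normalizes polygons and uncompressed RLE to compressed RLE; annToMask then decodes it. A sketch, again with coco_gt from earlier:

    ann = coco_gt.loadAnns(coco_gt.getAnnIds(imgIds=coco_gt.getImgIds()[:1]))[0]
    m = coco_gt.annToMask(ann)   # HxW numpy array, 1 inside the object, 0 elsewhere
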
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py
ADDED
@@ -0,0 +1,534 @@
__author__ = 'tsungyi'

import numpy as np
import datetime
import time
from collections import defaultdict
from . import mask as maskUtils
import copy

class COCOeval:
    # Interface for evaluating detection on the Microsoft COCO dataset.
    #
    # The usage for CocoEval is as follows:
    #  cocoGt=..., cocoDt=...        # load dataset and results
    #  E = CocoEval(cocoGt,cocoDt);  # initialize CocoEval object
    #  E.params.recThrs = ...;       # set parameters as desired
    #  E.evaluate();                 # run per image evaluation
    #  E.accumulate();               # accumulate per image results
    #  E.summarize();                # display summary metrics of results
    # For example usage see evalDemo.m and http://mscoco.org/.
    #
    # The evaluation parameters are as follows (defaults in brackets):
    #  imgIds  - [all] N img ids to use for evaluation
    #  catIds  - [all] K cat ids to use for evaluation
    #  iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
    #  recThrs - [0:.01:1] R=101 recall thresholds for evaluation
    #  areaRng - [...] A=4 object area ranges for evaluation
    #  maxDets - [1 10 100] M=3 thresholds on max detections per image
    #  iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
    #  iouType replaced the now DEPRECATED useSegm parameter.
    #  useCats - [1] if true use category labels for evaluation
    # Note: if useCats=0 category labels are ignored as in proposal scoring.
    # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
    #
    # evaluate(): evaluates detections on every image and every category and
    # concats the results into the "evalImgs" with fields:
    #  dtIds     - [1xD] id for each of the D detections (dt)
    #  gtIds     - [1xG] id for each of the G ground truths (gt)
    #  dtMatches - [TxD] matching gt id at each IoU or 0
    #  gtMatches - [TxG] matching dt id at each IoU or 0
    #  dtScores  - [1xD] confidence of each dt
    #  gtIgnore  - [1xG] ignore flag for each gt
    #  dtIgnore  - [TxD] ignore flag for each dt at each IoU
    #
    # accumulate(): accumulates the per-image, per-category evaluation
    # results in "evalImgs" into the dictionary "eval" with fields:
    #  params    - parameters used for evaluation
    #  date      - date evaluation was performed
    #  counts    - [T,R,K,A,M] parameter dimensions (see above)
    #  precision - [TxRxKxAxM] precision for every evaluation setting
    #  recall    - [TxKxAxM] max recall for every evaluation setting
    # Note: precision and recall==-1 for settings with no gt objects.
    #
    # See also coco, mask, pycocoDemo, pycocoEvalDemo
    #
    # Microsoft COCO Toolbox.      version 2.0
    # Data, paper, and tutorials available at: http://mscoco.org/
    # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
    # Licensed under the Simplified BSD License [see coco/license.txt]
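
The usage comments above translate into the following runnable sketch (the annotation/result file names are placeholders):

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    coco_gt = COCO('instances_val.json')
    coco_dt = coco_gt.loadRes('bbox_results.json')
    E = COCOeval(coco_gt, coco_dt, iouType='bbox')
    E.evaluate()    # per-image, per-category matching
    E.accumulate()  # fill the [T,R,K,A,M] precision/recall tables
    E.summarize()   # print the 12 standard AP/AR metrics
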
    def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
        '''
        Initialize CocoEval using coco APIs for gt and dt
        :param cocoGt: coco object with ground truth annotations
        :param cocoDt: coco object with detection results
        :return: None
        '''
        if not iouType:
            print('iouType not specified. use default iouType segm')
        self.cocoGt = cocoGt                  # ground truth COCO API
        self.cocoDt = cocoDt                  # detections COCO API
        self.evalImgs = defaultdict(list)     # per-image per-category evaluation results [KxAxI] elements
        self.eval = {}                        # accumulated evaluation results
        self._gts = defaultdict(list)         # gt for evaluation
        self._dts = defaultdict(list)         # dt for evaluation
        self.params = Params(iouType=iouType) # parameters
        self._paramsEval = {}                 # parameters for evaluation
        self.stats = []                       # result summarization
        self.ious = {}                        # ious between all gts and dts
        if not cocoGt is None:
            self.params.imgIds = sorted(cocoGt.getImgIds())
            self.params.catIds = sorted(cocoGt.getCatIds())
    def _prepare(self):
        '''
        Prepare ._gts and ._dts for evaluation based on params
        :return: None
        '''
        def _toMask(anns, coco):
            # modify ann['segmentation'] by reference
            for ann in anns:
                rle = coco.annToRLE(ann)
                ann['segmentation'] = rle
        p = self.params
        if p.useCats:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
        else:
            gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
            dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))

        # convert ground truth to mask if iouType == 'segm'
        if p.iouType == 'segm':
            _toMask(gts, self.cocoGt)
            _toMask(dts, self.cocoDt)
        # set ignore flag
        for gt in gts:
            gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
            gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
            if p.iouType == 'keypoints':
                gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
        self._gts = defaultdict(list)       # gt for evaluation
        self._dts = defaultdict(list)       # dt for evaluation
        for gt in gts:
            self._gts[gt['image_id'], gt['category_id']].append(gt)
        for dt in dts:
            self._dts[dt['image_id'], dt['category_id']].append(dt)
        self.evalImgs = defaultdict(list)   # per-image per-category evaluation results
        self.eval = {}                      # accumulated evaluation results
    def evaluate(self):
        '''
        Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
        :return: None
        '''
        tic = time.time()
        print('Running per image evaluation...')
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if not p.useSegm is None:
            p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
        print('Evaluate annotation type *{}*'.format(p.iouType))
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params = p

        self._prepare()
        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == 'segm' or p.iouType == 'bbox':
            computeIoU = self.computeIoU
        elif p.iouType == 'keypoints':
            computeIoU = self.computeOks
        self.ious = {(imgId, catId): computeIoU(imgId, catId) \
                        for imgId in p.imgIds
                        for catId in catIds}

        evaluateImg = self.evaluateImg
        maxDet = p.maxDets[-1]
        self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
                            for catId in catIds
                            for areaRng in p.areaRng
                            for imgId in p.imgIds
                        ]
        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic))
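
evaluate() flattens evalImgs in category-major, then area-range, then image order; accumulate() below depends on exactly this layout. The index arithmetic, spelled out as a sketch (not part of the API):

    def eval_img_index(k, a, i, A, I):
        # position of the (catIds[k], areaRng[a], imgIds[i]) result in self.evalImgs,
        # matching Nk = k0*A0*I0 and Na = a0*I0 in accumulate()
        return k * A * I + a * I + i
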
    def computeIoU(self, imgId, catId):
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return []
        inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in inds]
        if len(dt) > p.maxDets[-1]:
            dt = dt[0:p.maxDets[-1]]

        if p.iouType == 'segm':
            g = [g['segmentation'] for g in gt]
            d = [d['segmentation'] for d in dt]
        elif p.iouType == 'bbox':
            g = [g['bbox'] for g in gt]
            d = [d['bbox'] for d in dt]
        else:
            raise Exception('unknown iouType for iou computation')

        # compute iou between each dt and gt region
        iscrowd = [int(o['iscrowd']) for o in gt]
        ious = maskUtils.iou(d, g, iscrowd)
        return ious
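
maskUtils.iou accepts [x,y,w,h] boxes (or RLEs) plus the per-gt iscrowd flags; a toy check:

    import numpy as np
    from pycocotools import mask as maskUtils

    d = [[0, 0, 10, 10]]             # one detection box, xywh
    g = [[5, 0, 10, 10]]             # one ground-truth box
    print(maskUtils.iou(d, g, [0]))  # [[0.3333...]]: intersection 50, union 150
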
    def computeOks(self, imgId, catId):
        p = self.params
        # dimension here should be Nxm
        gts = self._gts[imgId, catId]
        dts = self._dts[imgId, catId]
        inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
        dts = [dts[i] for i in inds]
        if len(dts) > p.maxDets[-1]:
            dts = dts[0:p.maxDets[-1]]
        # if len(gts) == 0 and len(dts) == 0:
        if len(gts) == 0 or len(dts) == 0:
            return []
        ious = np.zeros((len(dts), len(gts)))
        sigmas = p.kpt_oks_sigmas
        vars = (sigmas * 2)**2
        k = len(sigmas)
        # compute oks between each detection and ground truth object
        for j, gt in enumerate(gts):
            # create bounds for ignore regions (double the gt bbox)
            g = np.array(gt['keypoints'])
            xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
            k1 = np.count_nonzero(vg > 0)
            bb = gt['bbox']
            x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
            y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
            for i, dt in enumerate(dts):
                d = np.array(dt['keypoints'])
                xd = d[0::3]; yd = d[1::3]
                if k1 > 0:
                    # measure the per-keypoint distance if keypoints visible
                    dx = xd - xg
                    dy = yd - yg
                else:
                    # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
                    z = np.zeros((k))
                    dx = np.max((z, x0-xd), axis=0) + np.max((z, xd-x1), axis=0)
                    dy = np.max((z, y0-yd), axis=0) + np.max((z, yd-y1), axis=0)
                e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
                if k1 > 0:
                    e = e[vg > 0]
                ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
        return ious
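
For reference, the loop above implements the standard OKS: per visible keypoint, exp(-d_i^2 / (2 * area * (2*sigma_i)^2)), averaged over the visible keypoints (note vars = (2*sigmas)**2). A one-keypoint numeric check:

    import numpy as np
    sigma, area, d = 0.25, 100.0, 5.0
    print(np.exp(-d**2 / ((2*sigma)**2 * area * 2)))  # ~0.6065, same as exp(-e) above
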
    def evaluateImg(self, imgId, catId, aRng, maxDet):
        '''
        perform evaluation for single category and image
        :return: dict (single image results)
        '''
        p = self.params
        if p.useCats:
            gt = self._gts[imgId, catId]
            dt = self._dts[imgId, catId]
        else:
            gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]]
            dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]]
        if len(gt) == 0 and len(dt) == 0:
            return None

        for g in gt:
            if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]):
                g['_ignore'] = 1
            else:
                g['_ignore'] = 0

        # sort dt highest score first, sort gt ignore last
        gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
        gt = [gt[i] for i in gtind]
        dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
        dt = [dt[i] for i in dtind[0:maxDet]]
        iscrowd = [int(o['iscrowd']) for o in gt]
        # load computed ious
        ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]

        T = len(p.iouThrs)
        G = len(gt)
        D = len(dt)
        gtm = np.zeros((T, G))
        dtm = np.zeros((T, D))
        gtIg = np.array([g['_ignore'] for g in gt])
        dtIg = np.zeros((T, D))
        if not len(ious) == 0:
            for tind, t in enumerate(p.iouThrs):
                for dind, d in enumerate(dt):
                    # information about best match so far (m=-1 -> unmatched)
                    iou = min([t, 1-1e-10])
                    m = -1
                    for gind, g in enumerate(gt):
                        # if this gt already matched, and not a crowd, continue
                        if gtm[tind, gind] > 0 and not iscrowd[gind]:
                            continue
                        # if dt matched to reg gt, and on ignore gt, stop
                        if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1:
                            break
                        # continue to next gt unless better match made
                        if ious[dind, gind] < iou:
                            continue
                        # if match successful and best so far, store appropriately
                        iou = ious[dind, gind]
                        m = gind
                    # if match made store id of match for both dt and gt
                    if m == -1:
                        continue
                    dtIg[tind, dind] = gtIg[m]
                    dtm[tind, dind] = gt[m]['id']
                    gtm[tind, m] = d['id']
        # set unmatched detections outside of area range to ignore
        a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] for d in dt]).reshape((1, len(dt)))
        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0)))
        # store results for given image and category
        return {
                'image_id':     imgId,
                'category_id':  catId,
                'aRng':         aRng,
                'maxDet':       maxDet,
                'dtIds':        [d['id'] for d in dt],
                'gtIds':        [g['id'] for g in gt],
                'dtMatches':    dtm,
                'gtMatches':    gtm,
                'dtScores':     [d['score'] for d in dt],
                'gtIgnore':     gtIg,
                'dtIgnore':     dtIg,
            }
    def accumulate(self, p=None):
        '''
        Accumulate per image evaluation results and store the result in self.eval
        :param p: input params for evaluation
        :return: None
        '''
        print('Accumulating evaluation results...')
        tic = time.time()
        if not self.evalImgs:
            print('Please run evaluate() first')
        # allows input customized parameters
        if p is None:
            p = self.params
        p.catIds = p.catIds if p.useCats == 1 else [-1]
        T = len(p.iouThrs)
        R = len(p.recThrs)
        K = len(p.catIds) if p.useCats else 1
        A = len(p.areaRng)
        M = len(p.maxDets)
        precision = -np.ones((T, R, K, A, M))  # -1 for the precision of absent categories
        recall = -np.ones((T, K, A, M))
        scores = -np.ones((T, R, K, A, M))

        # create dictionary for future indexing
        _pe = self._paramsEval
        catIds = _pe.catIds if _pe.useCats else [-1]
        setK = set(catIds)
        setA = set(map(tuple, _pe.areaRng))
        setM = set(_pe.maxDets)
        setI = set(_pe.imgIds)
        # get inds to evaluate
        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
        i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
        I0 = len(_pe.imgIds)
        A0 = len(_pe.areaRng)
        # retrieve E at each category, area range, and max number of detections
        for k, k0 in enumerate(k_list):
            Nk = k0*A0*I0
            for a, a0 in enumerate(a_list):
                Na = a0*I0
                for m, maxDet in enumerate(m_list):
                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
                    E = [e for e in E if not e is None]
                    if len(E) == 0:
                        continue
                    dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])

                    # different sorting method generates slightly different results.
                    # mergesort is used to be consistent as Matlab implementation.
                    inds = np.argsort(-dtScores, kind='mergesort')
                    dtScoresSorted = dtScores[inds]

                    dtm = np.concatenate([e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:, inds]
                    dtIg = np.concatenate([e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:, inds]
                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
                    npig = np.count_nonzero(gtIg == 0)
                    if npig == 0:
                        continue
                    tps = np.logical_and(dtm, np.logical_not(dtIg))
                    fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg))

                    # np.float was removed from modern NumPy; np.float64 is the equivalent
                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
                        tp = np.array(tp)
                        fp = np.array(fp)
                        nd = len(tp)
                        rc = tp / npig
                        pr = tp / (fp+tp+np.spacing(1))
                        q = np.zeros((R,))
                        ss = np.zeros((R,))

                        if nd:
                            recall[t, k, a, m] = rc[-1]
                        else:
                            recall[t, k, a, m] = 0

                        # numpy is slow without cython optimization for accessing elements
                        # use python array gets significant speed improvement
                        pr = pr.tolist(); q = q.tolist()

                        for i in range(nd-1, 0, -1):
                            if pr[i] > pr[i-1]:
                                pr[i-1] = pr[i]

                        inds = np.searchsorted(rc, p.recThrs, side='left')
                        try:
                            for ri, pi in enumerate(inds):
                                q[ri] = pr[pi]
                                ss[ri] = dtScoresSorted[pi]
                        except:
                            pass
                        precision[t, :, k, a, m] = np.array(q)
                        scores[t, :, k, a, m] = np.array(ss)
        self.eval = {
            'params': p,
            'counts': [T, R, K, A, M],
            'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'precision': precision,
            'recall': recall,
            'scores': scores,
        }
        toc = time.time()
        print('DONE (t={:0.2f}s).'.format(toc-tic))
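
With the tables filled, the overall AP@[.5:.95] is just the mean over valid precision entries at area 'all' and the largest maxDets; a sketch reproducing _summarize(1) below, with E from the earlier example:

    import numpy as np
    p = E.params
    a = p.areaRngLbl.index('all')
    m = len(p.maxDets) - 1
    s = E.eval['precision'][:, :, :, a, m]
    print('AP@[.5:.95]:', np.mean(s[s > -1]))  # -1 cells (no gt) are excluded
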
    def summarize(self):
        '''
        Compute and display summary metrics for evaluation results.
        Note this function can *only* be applied on the default parameter setting
        '''
        def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100):
            p = self.params
            iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
            titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
            typeStr = '(AP)' if ap == 1 else '(AR)'
            iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
                if iouThr is None else '{:0.2f}'.format(iouThr)

            aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
            mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
            if ap == 1:
                # dimension of precision: [TxRxKxAxM]
                s = self.eval['precision']
                # IoU
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:, :, :, aind, mind]
            else:
                # dimension of recall: [TxKxAxM]
                s = self.eval['recall']
                if iouThr is not None:
                    t = np.where(iouThr == p.iouThrs)[0]
                    s = s[t]
                s = s[:, :, aind, mind]
            if len(s[s > -1]) == 0:
                mean_s = -1
            else:
                mean_s = np.mean(s[s > -1])
            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
            return mean_s
        def _summarizeDets():
            stats = np.zeros((12,))
            stats[0] = _summarize(1)
            stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
            stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
            stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
            stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
            stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
            stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
            stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
            stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
            stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
            stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
            return stats
        def _summarizeKps():
            stats = np.zeros((10,))
            stats[0] = _summarize(1, maxDets=20)
            stats[1] = _summarize(1, maxDets=20, iouThr=.5)
            stats[2] = _summarize(1, maxDets=20, iouThr=.75)
            stats[3] = _summarize(1, maxDets=20, areaRng='medium')
            stats[4] = _summarize(1, maxDets=20, areaRng='large')
            stats[5] = _summarize(0, maxDets=20)
            stats[6] = _summarize(0, maxDets=20, iouThr=.5)
            stats[7] = _summarize(0, maxDets=20, iouThr=.75)
            stats[8] = _summarize(0, maxDets=20, areaRng='medium')
            stats[9] = _summarize(0, maxDets=20, areaRng='large')
            return stats
        if not self.eval:
            raise Exception('Please run accumulate() first')
        iouType = self.params.iouType
        if iouType == 'segm' or iouType == 'bbox':
            summarize = _summarizeDets
        elif iouType == 'keypoints':
            summarize = _summarizeKps
        self.stats = summarize()

    def __str__(self):
        self.summarize()
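
_summarizeDets fills self.stats in the fixed order shown above; a sketch that labels the entries after summarize(), for the detection iouTypes:

    names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl',
             'AR1', 'AR10', 'AR100', 'ARs', 'ARm', 'ARl']
    print(dict(zip(names, E.stats)))
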
class Params:
    '''
    Params for coco evaluation api
    '''
    def setDetParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [1, 10, 100]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'small', 'medium', 'large']
        self.useCats = 1

    def setKpParams(self):
        self.imgIds = []
        self.catIds = []
        # np.arange causes trouble. the data point on arange is slightly larger than the true value
        self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
        self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
        self.maxDets = [20]
        self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
        self.areaRngLbl = ['all', 'medium', 'large']
        self.useCats = 1
        self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89])/10.0

    def __init__(self, iouType='segm'):
        if iouType == 'segm' or iouType == 'bbox':
            self.setDetParams()
        elif iouType == 'keypoints':
            self.setKpParams()
        else:
            raise Exception('iouType not supported')
        self.iouType = iouType
        # useSegm is deprecated
        self.useSegm = None
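
Params fields can be narrowed before evaluate(); a sketch restricting evaluation to one class and an image subset, with coco_gt/coco_dt as in the earlier sketch:

    E = COCOeval(coco_gt, coco_dt, iouType='bbox')
    E.params.catIds = coco_gt.getCatIds(catNms=['person'])
    E.params.imgIds = coco_gt.getImgIds()[:100]
    E.evaluate(); E.accumulate(); E.summarize()
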