drozdgk committed
Commit 352cafd · 1 Parent(s): daaac94

chore: vendor third_party (remove submodules, ignore artifacts)

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitignore +16 -0
  2. MaskClustering/third_party/Entity +0 -1
  3. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml +40 -0
  4. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml +43 -0
  5. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml +43 -0
  6. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml +40 -0
  7. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml +40 -0
  8. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml +42 -0
  9. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml +40 -0
  10. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml +40 -0
  11. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml +51 -0
  12. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml +50 -0
  13. MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml +51 -0
  14. MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py +172 -0
  15. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py +5 -0
  16. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py +298 -0
  17. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py +2 -0
  18. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py +464 -0
  19. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py +723 -0
  20. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py +102 -0
  21. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py +0 -0
  22. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py +112 -0
  23. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py +4 -0
  24. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py +52 -0
  25. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py +111 -0
  26. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py +54 -0
  27. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py +26 -0
  28. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py +489 -0
  29. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py +100 -0
  30. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py +2 -0
  31. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py +52 -0
  32. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py +191 -0
  33. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py +0 -0
  34. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py +523 -0
  35. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py +2 -0
  36. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py +303 -0
  37. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py +71 -0
  38. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py +53 -0
  39. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py +0 -0
  40. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py +59 -0
  41. MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py +190 -0
  42. MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py +123 -0
  43. MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py +119 -0
  44. MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh +8 -0
  45. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile +9 -0
  46. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py +1 -0
  47. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c +0 -0
  48. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx +308 -0
  49. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py +453 -0
  50. MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py +534 -0
.gitignore CHANGED
@@ -154,3 +154,19 @@ temp/
 **/*.bin
 data/
 **/*.pth
+
+# macOS junk
+.DS_Store
+**/.DS_Store
+
+# Don't commit build artifacts / compiled binaries from third_party
+MaskClustering/third_party/**/__pycache__/
+MaskClustering/third_party/**/*.pyc
+MaskClustering/third_party/**/*.pyo
+MaskClustering/third_party/**/build/
+MaskClustering/third_party/**/dist/
+MaskClustering/third_party/**/*.o
+MaskClustering/third_party/**/*.so
+
+# HF Hub limit: keep large docs assets out of git
+MaskClustering/third_party/Entity/Entityv2/figures/teaser_mosaic_low.png
MaskClustering/third_party/Entity DELETED
@@ -1 +0,0 @@
-Subproject commit 6e7e13ac91ef508088e1b848167c01f19b00b512
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_baseline.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.0
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 10000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
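A minimal sketch (not part of the commit) of how a config like the one above is consumed; it mirrors setup_cfg() in demo_result_and_vis.py, which is added later in this same commit. It assumes detectron2 and the vendored entityseg package are importable and the relative config path matches your working directory (the path below is illustrative).

```python
from detectron2.config import get_cfg
from entityseg import add_entity_config  # registers the entity-specific config keys

cfg = get_cfg()
add_entity_config(cfg)
cfg.merge_from_file("configs/entity_baseline.yaml")  # hypothetical relative path
cfg.merge_from_list(["MODEL.WEIGHTS", "pretrained_model/R-50.pkl"])
cfg.freeze()
print(cfg.MODEL.META_ARCHITECTURE)  # expected: "EntityFPN"
```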
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b0_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b0"
+  WEIGHTS: "pretrained_model/mit_b0_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_mit_b5_1x.yaml ADDED
@@ -0,0 +1,43 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_mit_fpn_backbone"
+    FREEZE_AT: -1
+  MIT_BACKBONE:
+    NAME: "b5"
+  WEIGHTS: "pretrained_model/mit_b5_trans.pth"
+  FPN:
+    IN_FEATURES: ["mit1", "mit2", "mit3", "mit4"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 80
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 8
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (120000, 160000)
+  MAX_ITER: 180000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r101_dcnv2_3x.yaml ADDED
@@ -0,0 +1,42 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 101
+    DEFORM_ON_PER_STAGE: [False, True, True, True]
+    DEFORM_MODULATED: True
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-101.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_1x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_r50_3x.yaml ADDED
@@ -0,0 +1,40 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_resnet_fpn_backbone"
+  RESNETS:
+    DEPTH: 50
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+  WEIGHTS: "pretrained_model/R-50.pkl"
+  FPN:
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.01
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_lw7_3x.yaml ADDED
@@ -0,0 +1,50 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 192
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_large_patch4_window7_224_22k_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (180000, 250000)
+  MAX_ITER: 270000
+  CHECKPOINT_PERIOD: 40000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/configs/entity_swin_t_1x.yaml ADDED
@@ -0,0 +1,51 @@
+MODEL:
+  META_ARCHITECTURE: "EntityFPN"
+  MASK_ON: False
+  BACKBONE:
+    NAME: "build_retinanet_swin_fpn_backbone"
+    FREEZE_AT: -1
+  SWINT:
+    EMBED_DIM: 96
+    PATCH_SIZE: 4
+    OUT_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    DEPTHS: [2, 2, 6, 2]
+    NUM_HEADS: [3, 6, 12, 24]
+    WINDOW_SIZE: 7
+    MLP_RATIO: 4
+    DROP_PATH_RATE: 0.2
+    APE: False
+  WEIGHTS: "pretrained_model/swin_tiny_patch4_window7_224_trans.pth"
+  FPN:
+    IN_FEATURES: ["stage2", "stage3", "stage4", "stage5"]
+    TOP_LEVELS: 2
+  FCOS:
+    NUM_CLASSES: 1
+  CONDINST:
+    CLASS_AGNOSTIC: True
+    TRAIN_MAX_PROPOSALS_PER_IMAGE: 120
+    MASK_BRANCH:
+      SEMANTIC_LOSS_ON: False
+      IN_FEATURES: ["p3", "p4", "p5"]
+    MASK_HEAD:
+      CLUSTER_WEIGHT: 0.5
+      DYNAMIC: ["111", "110", "101", "100", "011", "010", "001"]
+      DYNAMIC_WEIGHT: [1.0, 1.0, 1.0, 1.0, 0.25, 0.25, 0.25]
+DATASETS:
+  TRAIN: ("coco_2017_train_entity",)
+  TEST: ("coco_2017_val_entity",)
+SOLVER:
+  OPTIMIZER: "adamw"
+  WARMUP_ITERS: 1500
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  WEIGHT_DECAY: 0.05
+  STEPS: (60000, 80000)
+  MAX_ITER: 90000
+  CHECKPOINT_PERIOD: 20000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+INPUT:
+  MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
+VERSION: 2
+TEST:
+  CLASS_AGNOSTIC: True
MaskClustering/third_party/Entity/Entity/EntitySeg/demo_result_and_vis.py ADDED
@@ -0,0 +1,172 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+import argparse
+import glob
+import multiprocessing as mp
+import os
+import time
+import cv2
+import tqdm
+import numpy as np
+import copy
+
+from detectron2.config import get_cfg
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.logger import setup_logger
+from detectron2.engine import default_setup
+
+from entityseg import *
+
+from predictor import VisualizationDemo
+import pdb
+
+# constants
+WINDOW_NAME = "Image Segmentation"
+
+def make_colors():
+    from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+    colors = []
+    for cate in COCO_CATEGORIES:
+        colors.append(cate["color"])
+    return colors
+
+def mask_to_boundary(mask, dilation_ratio=0.0008):
+    """
+    Convert binary mask to boundary mask.
+    :param mask (numpy array, uint8): binary mask
+    :param dilation_ratio (float): ratio to calculate dilation = dilation_ratio * image_diagonal
+    :return: boundary mask (numpy array)
+    """
+    h, w = mask.shape
+    img_diag = np.sqrt(h ** 2 + w ** 2)
+    dilation = int(round(dilation_ratio * img_diag))
+    if dilation < 1:
+        dilation = 1
+    # Pad image so mask truncated by the image border is also considered as boundary.
+    new_mask = cv2.copyMakeBorder(mask, 1, 1, 1, 1, cv2.BORDER_CONSTANT, value=0)
+    kernel = np.ones((3, 3), dtype=np.uint8)
+    new_mask_erode = cv2.erode(new_mask, kernel, iterations=dilation)
+    mask_erode = new_mask_erode[1 : h + 1, 1 : w + 1]
+    # G_d intersects G in the paper.
+    return mask - mask_erode
+
+
+def setup_cfg(args):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    add_entity_config(cfg)
+    cfg.merge_from_file(args.config_file)
+    cfg.merge_from_list(args.opts)
+    default_setup(cfg, args)
+    cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold
+    cfg.freeze()
+    return cfg
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models")
+    parser.add_argument(
+        "--config-file",
+        default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml",
+        metavar="FILE",
+        help="path to config file",
+    )
+    parser.add_argument(
+        "--input",
+        nargs="+",
+        help="A list of space separated input images; "
+        "or a single glob pattern such as 'directory/*.jpg'",
+    )
+    parser.add_argument(
+        "--output",
+        help="A file or directory to save output visualizations. "
+        "If not given, will show output in an OpenCV window.",
+    )
+
+    parser.add_argument(
+        "--confidence-threshold",
+        type=float,
+        default=0.2,
+        help="Minimum score for instance predictions to be shown",
+    )
+
+    parser.add_argument(
+        "opts",
+        help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. "
+        "See config references at "
+        "https://detectron2.readthedocs.io/modules/config.html#config-references",
+        default=None,
+        nargs=argparse.REMAINDER,
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn", force=True)
+    args = get_parser().parse_args()
+    setup_logger(name="fvcore")
+    logger = setup_logger()
+    logger.info("Arguments: " + str(args))
+
+    if not os.path.exists(args.output):
+        os.makedirs(args.output)
+
+    cfg = setup_cfg(args)
+
+    demo = VisualizationDemo(cfg)
+    colors = make_colors()
+
+    if args.input:
+        if len(args.input) == 1:
+            args.input = glob.glob(os.path.expanduser(args.input[0]))
+            assert args.input, "The input path(s) was not found"
+        for path in tqdm.tqdm(args.input, disable=not args.output):
+            # use PIL, to be consistent with evaluation
+            img = read_image(path, format="BGR")
+            start_time = time.time()
+            data = demo.run_on_image_wo_vis(img)
+            logger.info(
+                "{}: {} in {:.2f}s".format(
+                    path,
+                    "detected {} instances".format(len(data[0])),
+                    time.time() - start_time,
+                )
+            )
+
+            if os.path.isdir(args.output):
+                assert os.path.isdir(args.output), args.output
+                out_filename = os.path.join(args.output, os.path.basename(path))
+            else:
+                assert len(args.input) == 1, "Please specify a directory with args.output"
+                out_filename = args.output
+            ## save inference result, [0] original score by detection head, [1] mask rescoring score, [2] mask_id
+            ori_scores = data[0]
+            scores = data[1]
+            mask_id = data[2]
+            np.savez(out_filename.split(".")[0]+".npz", ori_scores=ori_scores, scores=scores, mask_id=mask_id)
+
+            ## save visualization
+            img_for_paste = copy.deepcopy(img)
+            color_mask = copy.deepcopy(img)
+            masks_edge = np.zeros(img.shape[:2], dtype=np.uint8)
+            alpha = 0.4
+            count = 0
+            for index, score in enumerate(scores):
+                if score <= args.confidence_threshold:
+                    break
+                color_mask[mask_id==count] = colors[count]
+                boundary = mask_to_boundary((mask_id==count).astype(np.uint8))
+                masks_edge[boundary>0] = 1
+                count += 1
+            img_wm = cv2.addWeighted(img_for_paste, alpha, color_mask, 1-alpha, 0)
+            img_wm[masks_edge==1] = 0
+            fvis = np.concatenate((img, img_wm))
+            cv2.imwrite(out_filename.split(".")[0]+".jpg",fvis)
+
+
+
+
+
+
+
+
+
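A small, self-contained usage sketch (my own toy input, not part of the commit) for the mask_to_boundary() helper defined in demo_result_and_vis.py above: the function erodes the mask and subtracts the result, leaving a thin boundary ring whose width scales with the image diagonal. It assumes that function is imported or in scope.

```python
import numpy as np
import cv2  # required by mask_to_boundary

# Build a filled 32x32 square inside a 64x64 binary mask.
mask = np.zeros((64, 64), dtype=np.uint8)
cv2.rectangle(mask, (16, 16), (47, 47), color=1, thickness=-1)

# With dilation_ratio=0.02 and a ~90px diagonal, the erosion runs 2 iterations,
# so the returned boundary is roughly a 2-pixel-wide ring.
boundary = mask_to_boundary(mask, dilation_ratio=0.02)
print(mask.sum(), boundary.sum())  # the boundary keeps far fewer pixels than the full mask
```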
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .arch import EntityFPN
+from .data import *
+from .config import add_entity_config
+from .evaluator.entity_evaluation import COCOEvaluator_ClassAgnostic
+from .backbone import build_retinanet_swin_fpn_backbone, build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/arch.py ADDED
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+import logging
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from detectron2.structures import ImageList
+from detectron2.modeling.backbone import build_backbone
+from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
+from detectron2.modeling.proposal_generator import build_proposal_generator
+from detectron2.modeling.roi_heads import build_roi_heads
+from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+
+from .det_head.detection import build_det_head
+from .det_head.utils.comm import aligned_bilinear
+
+from .mask_head.dynamic_mask_head import build_dynamic_mask_head
+from .mask_head.mask_branch import build_mask_branch
+
+from .panopticfcn_tools.panopticfcn_head import build_kernel_head
+
+from detectron2.structures import Instances, Boxes
+import random
+import pdb
+import copy
+logger = logging.getLogger(__name__)
+
+__all__ = ["ItemFPN"]
+@META_ARCH_REGISTRY.register()
+class EntityFPN(nn.Module):
+    """
+    Implement the paper :paper:`PanopticFPN`.
+    """
+
+    def __init__(self, cfg):
+        super().__init__()
+        self.device = torch.device(cfg.MODEL.DEVICE)
+
+        self.backbone = build_backbone(cfg)
+        backbone_shape = self.backbone.output_shape()
+        self.det_head = build_det_head(cfg, backbone_shape)
+
+        ## mask
+        self.mask_head = build_dynamic_mask_head(cfg)
+        self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
+        self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
+        self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS
+        self.only_class_agnostic = cfg.MODEL.CONDINST.CLASS_AGNOSTIC
+
+        in_channels = self.det_head.in_channels_to_top_module
+
+        self.controller = build_kernel_head(cfg, self.mask_head.num_gen_params)
+        self.train_max_proposals_per_image = cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE
+
+        self.use_mask_rescore_infer = cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE
+
+        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
+        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
+        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
+
+        self.pixel_mean = pixel_mean
+        self.pixel_std = pixel_std
+        self.to(self.device)
+
+    def forward(self, batched_inputs):
+        """
+        Args:
+            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
+                Each item in the list contains the inputs for one image.
+
+                For now, each item in the list is a dict that contains:
+
+                * "image": Tensor, image in (C, H, W) format.
+                * "instances": Instances
+                * "sem_seg": semantic segmentation ground truth.
+                * Other information that's included in the original dicts, such as:
+                  "height", "width" (int): the output resolution of the model, used in inference.
+                  See :meth:`postprocess` for details.
+
+        Returns:
+            list[dict]:
+                each dict is the results for one image. The dict contains the following keys:
+
+                * "instances": see :meth:`GeneralizedRCNN.forward` for its format.
+                * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format.
+                * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`.
+                  See the return value of
+                  :func:`combine_semantic_and_instance_outputs` for its format.
+        """
+
+        # for x in batched_inputs:
+        #     print(x["file_name"])
+        images = self.preprocess_image(batched_inputs)
+        features = self.backbone(images.tensor)
+
+        if "instances" in batched_inputs[0] and self.training:
+            B = len(batched_inputs)
+            for i in range(B):
+                if self.only_class_agnostic:
+                    batched_inputs[i]["instances"].gt_classes[:] = 0
+
+                instance_map = batched_inputs[i]["instance_map"]
+                num_instances = int(torch.max(instance_map)+1)
+                instanceid = batched_inputs[i]["instances"].instanceid
+                gt_bitmasks_pad = F.one_hot(instance_map.long(), num_instances)[...,instanceid].permute((2,0,1))
+
+                pad_h, pad_w = images.tensor.size(-2), images.tensor.size(-1)
+                no_pad_h, no_pad_w = gt_bitmasks_pad.shape[1:]
+
+                padding_size = [0, pad_w - no_pad_w, 0, pad_h-no_pad_h]
+                gt_bitmasks_pad = F.pad(gt_bitmasks_pad, padding_size, value=0)
+
+                start = int(self.mask_out_stride // 2)
+                bitmask_full = gt_bitmasks_pad.clone()
+                bitmask = gt_bitmasks_pad[:,start::self.mask_out_stride, start::self.mask_out_stride]
+
+                N = bitmask.shape[0]
+                batched_inputs[i]["instances"].gt_bitmasks = bitmask.int()
+                batched_inputs[i]["instances"].gt_bitmasks_full = bitmask_full.int()
+
+            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        mask_feats = self.mask_branch(features, gt_instances)
+        proposals, proposal_losses = self.det_head(images, features, gt_instances, self.controller)
+
+        if self.training:
+            max_num_proposals = self.train_max_proposals_per_image * len(batched_inputs)
+            actual_num_proposals = len(proposals["instances"])
+            if actual_num_proposals >= max_num_proposals:
+                select = random.sample(list(range(actual_num_proposals)), max_num_proposals)
+                proposals["instances"] = proposals["instances"][select]
+
+            loss_masks = self._forward_mask_heads_train(proposals, mask_feats, gt_instances)
+            losses = {}
+            losses.update(proposal_losses)
+            losses.update(loss_masks)
+            return losses
+        else:
+            pred_instances_w_masks = self._forward_mask_heads_test(proposals, mask_feats)
+            padded_im_h, padded_im_w = images.tensor.size()[-2:]
+            processed_results = []
+            for im_id, (input_per_image, image_size) in enumerate(zip(batched_inputs, images.image_sizes)):
+                height = input_per_image.get("height", image_size[0])
+                width = input_per_image.get("width", image_size[1])
+
+                instances_per_im = pred_instances_w_masks[pred_instances_w_masks.im_inds == im_id]
+                instances_per_im = self.postprocess(
+                    instances_per_im, height, width,
+                    padded_im_h, padded_im_w
+                )
+
+                processed_results.append({
+                    "instances": instances_per_im
+                })
+
+            return processed_results
+
+    def _forward_mask_heads_train(self, proposals, mask_feats, gt_instances):
+        # prepare the inputs for mask heads
+        pred_instances = proposals["instances"]
+
+        if 0 <= self.max_proposals < len(pred_instances):
+            inds = torch.randperm(len(pred_instances), device=mask_feats.device).long()
+            logger.info("clipping proposals from {} to {}".format(
+                len(pred_instances), self.max_proposals
+            ))
+            pred_instances = pred_instances[inds[:self.max_proposals]]
+
+        pred_instances.mask_head_params = pred_instances.top_feats
+
+        loss_masks = self.mask_head(
+            mask_feats, self.mask_branch.out_stride,
+            pred_instances, gt_instances
+        )
+        return loss_masks
+
+    def _forward_mask_heads_test(self, proposals, mask_feats):
+        # prepare the inputs for mask heads
+        for im_id, per_im in enumerate(proposals):
+            per_im.im_inds = per_im.locations.new_ones(len(per_im), dtype=torch.long) * im_id
+        pred_instances = Instances.cat(proposals)
+        pred_instances.mask_head_params = pred_instances.top_feat
+
+        pred_instances_w_masks = self.mask_head(mask_feats, self.mask_branch.out_stride, pred_instances)
+
+        return pred_instances_w_masks
+
+    def preprocess_image(self, batched_inputs):
+        """
+        Normalize, pad and batch the input images.
+        """
+        images = [x["image"].to(self.device) for x in batched_inputs]
+        images = [self.normalizer(x) for x in images]
+        images = ImageList.from_tensors(images, self.backbone.size_divisibility)
+        return images
+
+    def postprocess(self, results, output_height, output_width, padded_im_h, padded_im_w, mask_threshold=0.5):
+        """
+        Resize the output instances.
+        The input images are often resized when entering an object detector.
+        As a result, we often need the outputs of the detector in a different
+        resolution from its inputs.
+        This function will resize the raw outputs of an R-CNN detector
+        to produce outputs according to the desired output resolution.
+        Args:
+            results (Instances): the raw outputs from the detector.
+                `results.image_size` contains the input image resolution the detector sees.
+                This object might be modified in-place.
+            output_height, output_width: the desired output resolution.
+        Returns:
+            Instances: the resized output from the model, based on the output resolution
+        """
+        scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0])
+        resized_im_h, resized_im_w = results.image_size
+        results = Instances((output_height, output_width), **results.get_fields())
+
+        if results.has("pred_boxes"):
+            output_boxes = results.pred_boxes
+        elif results.has("proposal_boxes"):
+            output_boxes = results.proposal_boxes
+
+        output_boxes.scale(scale_x, scale_y)
+        output_boxes.clip(results.image_size)
+        results = results[output_boxes.nonempty()]
+
+        if results.has("pred_global_masks"):
+            mask_h, mask_w = results.pred_global_masks.size()[-2:]
+            factor_h = padded_im_h // mask_h
+            factor_w = padded_im_w // mask_w
+            assert factor_h == factor_w
+            factor = factor_h
+            pred_global_masks = aligned_bilinear(
+                results.pred_global_masks, factor
+            )
+            pred_global_masks = pred_global_masks[:, :, :resized_im_h, :resized_im_w]
+            pred_global_masks = F.interpolate(
+                pred_global_masks,
+                size=(output_height, output_width),
+                mode="bilinear", align_corners=False
+            )
+            pred_global_masks = pred_global_masks[:, 0, :, :]
+            results.pred_masks = (pred_global_masks > mask_threshold).float()
+            results.pred_masks_score = pred_global_masks
+
+        # from high score to low score
+        origin_masks = results.pred_masks
+        num_instances, H, W = origin_masks.shape
+        filter_masks = []
+
+        # initialize background
+        mask_0 = torch.zeros((H, W)).cuda() + 0.001
+        filter_masks.insert(0, mask_0)
+        score = 0.002
+        for index in range(num_instances):
+            mask = origin_masks[num_instances-index-1]
+            mask[mask==1] = score
+            filter_masks.insert(0, mask)
+            score = score + 0.001
+
+        filter_masks = torch.stack(filter_masks, dim=0)
+        _, instance_ids = torch.max(filter_masks, dim=0)
+        unique_instance_ids = torch.unique(instance_ids)
+
+        ori_scores = results.scores.clone()
+        has_mask_valid = []
+        for instance_id in unique_instance_ids:
+            if instance_id == num_instances:
+                continue
+            mask = (instance_ids==instance_id).float()
+            finds_y, finds_x = torch.nonzero(mask==1, as_tuple=True)
+            if len(finds_y) == 0:
+                continue
+            x1 = torch.min(finds_x)
+            x2 = torch.max(finds_x)
+            y1 = torch.min(finds_y)
+            y2 = torch.max(finds_y)
+
+            if x2-x1==0 or y2-y1==0:
+                continue
+            has_mask_valid.append(int(instance_id))
+
+            ## mask rescoring would obtain higher performance
+            if self.use_mask_rescore_infer:
+                mask_score = results.pred_masks_score[instance_id]
+                seg_scores = (mask_score * mask).sum() / mask.sum()
+                results.scores[instance_id] = results.scores[instance_id] * seg_scores
+
+            results.pred_masks[instance_id] = mask
+            results.pred_boxes.tensor[instance_id][0] = x1
+            results.pred_boxes.tensor[instance_id][1] = y1
+            results.pred_boxes.tensor[instance_id][2] = x2
+            results.pred_boxes.tensor[instance_id][3] = y2
+
+        results.ori_scores = ori_scores
+        results = results[has_mask_valid]
+        return results
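A toy illustration (my own example, not from the commit) of the mask-overlap resolution idea used in EntityFPN.postprocess above: the background plus the per-instance masks are stacked with weights that decrease with score rank, and an argmax over that stack assigns every pixel to exactly one instance, with higher-scoring instances winning any overlap. The weights below mirror the 0.001 increments in the original code.

```python
import torch

num_instances, H, W = 3, 4, 4
# Binary masks already sorted from highest to lowest detection score.
masks = torch.zeros(num_instances, H, W)
masks[0, :2, :2] = 1    # highest-scoring instance
masks[1, 1:3, 1:3] = 1  # overlaps instance 0
masks[2, 3:, 3:] = 1

# Background gets a small constant; lower-ranked instances get smaller weights,
# so where masks overlap the higher-scoring instance takes the pixel.
stack = [torch.full((H, W), 0.001)]
weight = 0.002
for i in range(num_instances - 1, -1, -1):
    stack.insert(0, masks[i] * weight)
    weight += 0.001

stack = torch.stack(stack, dim=0)         # (num_instances + 1, H, W)
_, pixel_owner = torch.max(stack, dim=0)  # index num_instances means background
print(pixel_owner)
```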
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/__init__.py ADDED
@@ -0,0 +1,2 @@
+from .swin import build_retinanet_swin_fpn_backbone
+from .mixvision import build_retinanet_mit_fpn_backbone
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/mixvision.py ADDED
@@ -0,0 +1,464 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from functools import partial
5
+
6
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
7
+ from timm.models.registry import register_model
8
+ from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
9
+ import math
10
+
11
+ from detectron2.layers import ShapeSpec
12
+ from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
13
+
14
+ class Mlp(nn.Module):
15
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
16
+ super().__init__()
17
+ out_features = out_features or in_features
18
+ hidden_features = hidden_features or in_features
19
+ self.fc1 = nn.Linear(in_features, hidden_features)
20
+ self.dwconv = DWConv(hidden_features)
21
+ self.act = act_layer()
22
+ self.fc2 = nn.Linear(hidden_features, out_features)
23
+ self.drop = nn.Dropout(drop)
24
+
25
+ self.apply(self._init_weights)
26
+
27
+ def _init_weights(self, m):
28
+ if isinstance(m, nn.Linear):
29
+ trunc_normal_(m.weight, std=.02)
30
+ if isinstance(m, nn.Linear) and m.bias is not None:
31
+ nn.init.constant_(m.bias, 0)
32
+ elif isinstance(m, nn.LayerNorm):
33
+ nn.init.constant_(m.bias, 0)
34
+ nn.init.constant_(m.weight, 1.0)
35
+ elif isinstance(m, nn.Conv2d):
36
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
37
+ fan_out //= m.groups
38
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
39
+ if m.bias is not None:
40
+ m.bias.data.zero_()
41
+
42
+ def forward(self, x, H, W):
43
+ x = self.fc1(x)
44
+ x = self.dwconv(x, H, W)
45
+ x = self.act(x)
46
+ x = self.drop(x)
47
+ x = self.fc2(x)
48
+ x = self.drop(x)
49
+ return x
50
+
51
+
52
+ class Attention(nn.Module):
53
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
54
+ super().__init__()
55
+ assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
56
+
57
+ self.dim = dim
58
+ self.num_heads = num_heads
59
+ head_dim = dim // num_heads
60
+ self.scale = qk_scale or head_dim ** -0.5
61
+
62
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
63
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
64
+ self.attn_drop = nn.Dropout(attn_drop)
65
+ self.proj = nn.Linear(dim, dim)
66
+ self.proj_drop = nn.Dropout(proj_drop)
67
+
68
+ self.sr_ratio = sr_ratio
69
+ if sr_ratio > 1:
70
+ self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
71
+ self.norm = nn.LayerNorm(dim)
72
+
73
+ self.apply(self._init_weights)
74
+
75
+ def _init_weights(self, m):
76
+ if isinstance(m, nn.Linear):
77
+ trunc_normal_(m.weight, std=.02)
78
+ if isinstance(m, nn.Linear) and m.bias is not None:
79
+ nn.init.constant_(m.bias, 0)
80
+ elif isinstance(m, nn.LayerNorm):
81
+ nn.init.constant_(m.bias, 0)
82
+ nn.init.constant_(m.weight, 1.0)
83
+ elif isinstance(m, nn.Conv2d):
84
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
85
+ fan_out //= m.groups
86
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
87
+ if m.bias is not None:
88
+ m.bias.data.zero_()
89
+
90
+ def forward(self, x, H, W):
91
+ B, N, C = x.shape
92
+ q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
93
+
94
+ if self.sr_ratio > 1:
95
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
96
+ x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
97
+ x_ = self.norm(x_)
98
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
99
+ else:
100
+ kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
101
+ k, v = kv[0], kv[1]
102
+
103
+ attn = (q @ k.transpose(-2, -1)) * self.scale
104
+ attn = attn.softmax(dim=-1)
105
+ attn = self.attn_drop(attn)
106
+
107
+ x = (attn @ v).transpose(1, 2).contiguous().reshape(B, N, C)
108
+ x = self.proj(x)
109
+ x = self.proj_drop(x)
110
+ return x
111
+
112
+ class Block(nn.Module):
113
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
114
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
115
+ super().__init__()
116
+ self.norm1 = norm_layer(dim)
117
+ self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
118
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
119
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
120
+ self.norm2 = norm_layer(dim)
121
+ mlp_hidden_dim = int(dim * mlp_ratio)
122
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
123
+
124
+ self.apply(self._init_weights)
125
+
126
+ def _init_weights(self, m):
127
+ if isinstance(m, nn.Linear):
128
+ trunc_normal_(m.weight, std=.02)
129
+ if isinstance(m, nn.Linear) and m.bias is not None:
130
+ nn.init.constant_(m.bias, 0)
131
+ elif isinstance(m, nn.LayerNorm):
132
+ nn.init.constant_(m.bias, 0)
133
+ nn.init.constant_(m.weight, 1.0)
134
+ elif isinstance(m, nn.Conv2d):
135
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136
+ fan_out //= m.groups
137
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
138
+ if m.bias is not None:
139
+ m.bias.data.zero_()
140
+
141
+ def forward(self, x, H, W):
142
+ x = x + self.drop_path(self.attn(self.norm1(x), H, W))
143
+ x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
144
+
145
+ return x
146
+
147
+
148
+ class OverlapPatchEmbed(nn.Module):
149
+ """ Image to Patch Embedding
150
+ """
151
+
152
+ def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
153
+ super().__init__()
154
+ img_size = to_2tuple(img_size)
155
+ patch_size = to_2tuple(patch_size)
156
+
157
+ self.img_size = img_size
158
+ self.patch_size = patch_size
159
+ self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
160
+ self.num_patches = self.H * self.W
161
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
162
+ padding=(patch_size[0] // 2, patch_size[1] // 2))
163
+ self.norm = nn.LayerNorm(embed_dim)
164
+
165
+ self.apply(self._init_weights)
166
+
167
+ def _init_weights(self, m):
168
+ if isinstance(m, nn.Linear):
169
+ trunc_normal_(m.weight, std=.02)
170
+ if isinstance(m, nn.Linear) and m.bias is not None:
171
+ nn.init.constant_(m.bias, 0)
172
+ elif isinstance(m, nn.LayerNorm):
173
+ nn.init.constant_(m.bias, 0)
174
+ nn.init.constant_(m.weight, 1.0)
175
+ elif isinstance(m, nn.Conv2d):
176
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
177
+ fan_out //= m.groups
178
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
179
+ if m.bias is not None:
180
+ m.bias.data.zero_()
181
+
182
+ def forward(self, x):
183
+ x = self.proj(x)
184
+ _, _, H, W = x.shape
185
+ x = x.flatten(2).transpose(1, 2)
186
+ x = self.norm(x)
187
+
188
+ return x, H, W
189
+
190
+
191
+ class MixVisionTransformer(Backbone):
192
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
193
+ num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
194
+ attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
195
+ depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
196
+ super().__init__()
197
+ self.num_classes = num_classes
198
+ self.depths = depths
199
+
200
+ # patch_embed
201
+ self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
202
+ embed_dim=embed_dims[0])
203
+ self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
204
+ embed_dim=embed_dims[1])
205
+ self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
206
+ embed_dim=embed_dims[2])
207
+ self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
208
+ embed_dim=embed_dims[3])
209
+
210
+ # transformer encoder
211
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
212
+ cur = 0
213
+ self.block1 = nn.ModuleList([Block(
214
+ dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
215
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
216
+ sr_ratio=sr_ratios[0])
217
+ for i in range(depths[0])])
218
+ self.norm1 = norm_layer(embed_dims[0])
219
+
220
+ cur += depths[0]
221
+ self.block2 = nn.ModuleList([Block(
222
+ dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
223
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
224
+ sr_ratio=sr_ratios[1])
225
+ for i in range(depths[1])])
226
+ self.norm2 = norm_layer(embed_dims[1])
227
+
228
+ cur += depths[1]
229
+ self.block3 = nn.ModuleList([Block(
230
+ dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
231
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
232
+ sr_ratio=sr_ratios[2])
233
+ for i in range(depths[2])])
234
+ self.norm3 = norm_layer(embed_dims[2])
235
+
236
+ cur += depths[2]
237
+ self.block4 = nn.ModuleList([Block(
238
+ dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
239
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
240
+ sr_ratio=sr_ratios[3])
241
+ for i in range(depths[3])])
242
+ self.norm4 = norm_layer(embed_dims[3])
243
+
244
+ # classification head
245
+ # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
246
+
247
+ self.apply(self._init_weights)
248
+
249
+
250
+ # freeze
251
+ for p in self.patch_embed1.parameters():
252
+ p.requires_grad = False
253
+ for p in self.block1.parameters():
254
+ p.requires_grad = False
255
+ for p in self.norm1.parameters():
256
+ p.requires_grad = False
257
+
258
+ outs = self.forward(torch.rand(1,3,224,224).float())
259
+ self.output_shapes = dict()
260
+ self._size_divisibility = 0
261
+ for i, f in enumerate(outs):
262
+ self.output_shapes[f] = ShapeSpec(
263
+ channels=outs[f].shape[1], stride=224//outs[f].shape[2]
264
+ )
265
+ if i == (len(outs)-1):
266
+ self._size_divisibility = 224//outs[f].shape[2]
267
+
268
+ self.train()
269
+
270
+ def output_shape(self):
271
+ return self.output_shapes
272
+
273
+ def _init_weights(self, m):
274
+ if isinstance(m, nn.Linear):
275
+ trunc_normal_(m.weight, std=.02)
276
+ if isinstance(m, nn.Linear) and m.bias is not None:
277
+ nn.init.constant_(m.bias, 0)
278
+ elif isinstance(m, nn.LayerNorm):
279
+ nn.init.constant_(m.bias, 0)
280
+ nn.init.constant_(m.weight, 1.0)
281
+ elif isinstance(m, nn.Conv2d):
282
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
283
+ fan_out //= m.groups
284
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
285
+ if m.bias is not None:
286
+ m.bias.data.zero_()
287
+
288
+ def reset_drop_path(self, drop_path_rate):
289
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
290
+ cur = 0
291
+ for i in range(self.depths[0]):
292
+ self.block1[i].drop_path.drop_prob = dpr[cur + i]
293
+
294
+ cur += self.depths[0]
295
+ for i in range(self.depths[1]):
296
+ self.block2[i].drop_path.drop_prob = dpr[cur + i]
297
+
298
+ cur += self.depths[1]
299
+ for i in range(self.depths[2]):
300
+ self.block3[i].drop_path.drop_prob = dpr[cur + i]
301
+
302
+ cur += self.depths[2]
303
+ for i in range(self.depths[3]):
304
+ self.block4[i].drop_path.drop_prob = dpr[cur + i]
305
+
306
+ def freeze_patch_emb(self):
307
+ self.patch_embed1.requires_grad = False
308
+
309
+ @torch.jit.ignore
310
+ def no_weight_decay(self):
311
+ return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
312
+
313
+ def get_classifier(self):
314
+ return self.head
315
+
316
+ def reset_classifier(self, num_classes, global_pool=''):
317
+ self.num_classes = num_classes
318
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
319
+
320
+ def forward_features(self, x):
321
+ B = x.shape[0]
322
+ outs = dict()
323
+
324
+ # stage 1
325
+ x, H, W = self.patch_embed1(x)
326
+ for i, blk in enumerate(self.block1):
327
+ x = blk(x, H, W)
328
+ x = self.norm1(x)
329
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
330
+ outs["mit1"] = x
331
+
332
+ # stage 2
333
+ x, H, W = self.patch_embed2(x)
334
+ for i, blk in enumerate(self.block2):
335
+ x = blk(x, H, W)
336
+ x = self.norm2(x)
337
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
338
+ outs["mit2"] = x
339
+
340
+ # stage 3
341
+ x, H, W = self.patch_embed3(x)
342
+ for i, blk in enumerate(self.block3):
343
+ x = blk(x, H, W)
344
+ x = self.norm3(x)
345
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
346
+ outs["mit3"] = x
347
+
348
+ # stage 4
349
+ x, H, W = self.patch_embed4(x)
350
+ for i, blk in enumerate(self.block4):
351
+ x = blk(x, H, W)
352
+ x = self.norm4(x)
353
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
354
+ outs["mit4"] = x
355
+
356
+ return outs
357
+
358
+ def forward(self, x):
359
+ x = self.forward_features(x)
360
+ # x = self.head(x)
361
+
362
+ return x
363
+
364
+
365
+ class DWConv(nn.Module):
366
+ def __init__(self, dim=768):
367
+ super(DWConv, self).__init__()
368
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
369
+
370
+ def forward(self, x, H, W):
371
+ B, N, C = x.shape
372
+ x = x.transpose(1, 2).contiguous().view(B, C, H, W)
373
+ x = self.dwconv(x)
374
+ x = x.flatten(2).transpose(1, 2)
375
+
376
+ return x
377
+
378
+
379
+
380
+ class mit_b0(MixVisionTransformer):
381
+ def __init__(self, **kwargs):
382
+ super(mit_b0, self).__init__(
383
+ patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
384
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
385
+ drop_rate=0.0, drop_path_rate=0.1)
386
+
387
+
388
+ class mit_b1(MixVisionTransformer):
389
+ def __init__(self, **kwargs):
390
+ super(mit_b1, self).__init__(
391
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
392
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
393
+ drop_rate=0.0, drop_path_rate=0.1)
394
+
395
+
396
+ class mit_b2(MixVisionTransformer):
397
+ def __init__(self, **kwargs):
398
+ super(mit_b2, self).__init__(
399
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
400
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
401
+ drop_rate=0.0, drop_path_rate=0.1)
402
+
403
+
404
+ class mit_b3(MixVisionTransformer):
405
+ def __init__(self, **kwargs):
406
+ super(mit_b3, self).__init__(
407
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
408
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
409
+ drop_rate=0.0, drop_path_rate=0.1)
410
+
411
+
412
+ class mit_b4(MixVisionTransformer):
413
+ def __init__(self, **kwargs):
414
+ super(mit_b4, self).__init__(
415
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
416
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
417
+ drop_rate=0.0, drop_path_rate=0.1)
418
+
419
+
420
+ class mit_b5(MixVisionTransformer):
421
+ def __init__(self, **kwargs):
422
+ super(mit_b5, self).__init__(
423
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
424
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
425
+ drop_rate=0.0, drop_path_rate=0.1)
426
+
427
+ @BACKBONE_REGISTRY.register()
428
+ def build_mit_backbone(cfg, input_shape):
429
+ if cfg.MODEL.MIT_BACKBONE.NAME == "b0":
430
+ return mit_b0()
431
+ elif cfg.MODEL.MIT_BACKBONE.NAME == "b1":
432
+ return mit_b1()
433
+ elif cfg.MODEL.MIT_BACKBONE.NAME == "b2":
434
+ return mit_b2()
435
+ elif cfg.MODEL.MIT_BACKBONE.NAME == "b3":
436
+ return mit_b3()
437
+ elif cfg.MODEL.MIT_BACKBONE.NAME == "b4":
438
+ return mit_b4()
439
+ elif cfg.MODEL.MIT_BACKBONE.NAME == "b5":
440
+ return mit_b5()
441
+
442
+ @BACKBONE_REGISTRY.register()
443
+ def build_retinanet_mit_fpn_backbone(cfg, input_shape: ShapeSpec):
444
+ """
445
+ Args:
446
+ cfg: a detectron2 CfgNode
447
+ Returns:
448
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
449
+ """
450
+ bottom_up = build_mit_backbone(cfg, input_shape)
451
+ in_features = cfg.MODEL.FPN.IN_FEATURES
452
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
453
+ in_channels_top = out_channels
454
+ top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
455
+ backbone = FPN(
456
+ bottom_up=bottom_up,
457
+ in_features=in_features,
458
+ out_channels=out_channels,
459
+ norm=cfg.MODEL.FPN.NORM,
460
+ top_block=top_block,
461
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
462
+ )
463
+ return backbone
464
+
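The MiT variants above differ only in embed_dims and depths; forward_features returns a dict of four feature maps at strides 4/8/16/32, which is what the FPN wrapper below consumes. A minimal sketch (not part of the diff) of inspecting those outputs, assuming the vendored module is importable as entityseg.backbone.mixvision and that torch and timm are installed:

import torch
from entityseg.backbone.mixvision import mit_b0  # import path is an assumption

model = mit_b0().eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))  # forward() just returns forward_features()

for name, f in feats.items():
    print(name, tuple(f.shape))
# Expected for a 224x224 input (strides 4/8/16/32):
# mit1 (1, 32, 56, 56), mit2 (1, 64, 28, 28), mit3 (1, 160, 14, 14), mit4 (1, 256, 7, 7)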
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/backbone/swin.py ADDED
@@ -0,0 +1,723 @@
1
+ # --------------------------------------------------------
2
+ # Swin Transformer
3
+ # modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py
4
+ # --------------------------------------------------------
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torch.utils.checkpoint as checkpoint
10
+ import numpy as np
11
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
12
+
13
+ from detectron2.modeling.backbone import Backbone
14
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
15
+ from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7
16
+ from detectron2.layers import ShapeSpec
+ import fvcore.nn.weight_init as weight_init  # used by LastLevelP6 below
17
+
18
+
19
+ class Mlp(nn.Module):
20
+ """ Multilayer perceptron."""
21
+
22
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.fc1 = nn.Linear(in_features, hidden_features)
27
+ self.act = act_layer()
28
+ self.fc2 = nn.Linear(hidden_features, out_features)
29
+ self.drop = nn.Dropout(drop)
30
+
31
+ def forward(self, x):
32
+ x = self.fc1(x)
33
+ x = self.act(x)
34
+ x = self.drop(x)
35
+ x = self.fc2(x)
36
+ x = self.drop(x)
37
+ return x
38
+
39
+
40
+ def window_partition(x, window_size):
41
+ """
42
+ Args:
43
+ x: (B, H, W, C)
44
+ window_size (int): window size
45
+ Returns:
46
+ windows: (num_windows*B, window_size, window_size, C)
47
+ """
48
+ B, H, W, C = x.shape
49
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
50
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
51
+ return windows
52
+
53
+
54
+ def window_reverse(windows, window_size, H, W):
55
+ """
56
+ Args:
57
+ windows: (num_windows*B, window_size, window_size, C)
58
+ window_size (int): Window size
59
+ H (int): Height of image
60
+ W (int): Width of image
61
+ Returns:
62
+ x: (B, H, W, C)
63
+ """
64
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
65
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
66
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
67
+ return x
68
+
69
+
70
+ class WindowAttention(nn.Module):
71
+ """ Window based multi-head self attention (W-MSA) module with relative position bias.
72
+ It supports both of shifted and non-shifted window.
73
+ Args:
74
+ dim (int): Number of input channels.
75
+ window_size (tuple[int]): The height and width of the window.
76
+ num_heads (int): Number of attention heads.
77
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
78
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
79
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
80
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
81
+ """
82
+
83
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
84
+
85
+ super().__init__()
86
+ self.dim = dim
87
+ self.window_size = window_size # Wh, Ww
88
+ self.num_heads = num_heads
89
+ head_dim = dim // num_heads
90
+ self.scale = qk_scale or head_dim ** -0.5
91
+
92
+ # define a parameter table of relative position bias
93
+ self.relative_position_bias_table = nn.Parameter(
94
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
95
+
96
+ # get pair-wise relative position index for each token inside the window
97
+ coords_h = torch.arange(self.window_size[0])
98
+ coords_w = torch.arange(self.window_size[1])
99
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
100
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
101
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
102
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
103
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
104
+ relative_coords[:, :, 1] += self.window_size[1] - 1
105
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
106
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
107
+ self.register_buffer("relative_position_index", relative_position_index)
108
+
109
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
110
+ self.attn_drop = nn.Dropout(attn_drop)
111
+ self.proj = nn.Linear(dim, dim)
112
+ self.proj_drop = nn.Dropout(proj_drop)
113
+
114
+ trunc_normal_(self.relative_position_bias_table, std=.02)
115
+ self.softmax = nn.Softmax(dim=-1)
116
+
117
+ def forward(self, x, mask=None):
118
+ """ Forward function.
119
+ Args:
120
+ x: input features with shape of (num_windows*B, N, C)
121
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
122
+ """
123
+ B_, N, C = x.shape
124
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
125
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
126
+
127
+ q = q * self.scale
128
+ attn = (q @ k.transpose(-2, -1))
129
+
130
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
131
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
132
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
133
+ attn = attn + relative_position_bias.unsqueeze(0)
134
+
135
+ if mask is not None:
136
+ nW = mask.shape[0]
137
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
138
+ attn = attn.view(-1, self.num_heads, N, N)
139
+ attn = self.softmax(attn)
140
+ else:
141
+ attn = self.softmax(attn)
142
+
143
+ attn = self.attn_drop(attn)
144
+
145
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
146
+ x = self.proj(x)
147
+ x = self.proj_drop(x)
148
+ return x
149
+
150
+
151
+ class SwinTransformerBlock(nn.Module):
152
+ """ Swin Transformer Block.
153
+ Args:
154
+ dim (int): Number of input channels.
155
+ num_heads (int): Number of attention heads.
156
+ window_size (int): Window size.
157
+ shift_size (int): Shift size for SW-MSA.
158
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
159
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
160
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
161
+ drop (float, optional): Dropout rate. Default: 0.0
162
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
163
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
164
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
165
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
166
+ """
167
+
168
+ def __init__(self, dim, num_heads, window_size=7, shift_size=0,
169
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
170
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
171
+ super().__init__()
172
+ self.dim = dim
173
+ self.num_heads = num_heads
174
+ self.window_size = window_size
175
+ self.shift_size = shift_size
176
+ self.mlp_ratio = mlp_ratio
177
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
178
+
179
+ self.norm1 = norm_layer(dim)
180
+ self.attn = WindowAttention(
181
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
182
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
183
+
184
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
185
+ self.norm2 = norm_layer(dim)
186
+ mlp_hidden_dim = int(dim * mlp_ratio)
187
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
188
+
189
+ self.H = None
190
+ self.W = None
191
+
192
+ def forward(self, x, mask_matrix):
193
+ """ Forward function.
194
+ Args:
195
+ x: Input feature, tensor size (B, H*W, C).
196
+ H, W: Spatial resolution of the input feature.
197
+ mask_matrix: Attention mask for cyclic shift.
198
+ """
199
+ B, L, C = x.shape
200
+ H, W = self.H, self.W
201
+ assert L == H * W, "input feature has wrong size"
202
+
203
+ shortcut = x
204
+ x = self.norm1(x)
205
+ x = x.view(B, H, W, C)
206
+
207
+ # pad feature maps to multiples of window size
208
+ pad_l = pad_t = 0
209
+ pad_r = (self.window_size - W % self.window_size) % self.window_size
210
+ pad_b = (self.window_size - H % self.window_size) % self.window_size
211
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
212
+ _, Hp, Wp, _ = x.shape
213
+
214
+ # cyclic shift
215
+ if self.shift_size > 0:
216
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
217
+ attn_mask = mask_matrix
218
+ else:
219
+ shifted_x = x
220
+ attn_mask = None
221
+
222
+ # partition windows
223
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
224
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
225
+
226
+ # W-MSA/SW-MSA
227
+ attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
228
+
229
+ # merge windows
230
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
231
+ shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C
232
+
233
+ # reverse cyclic shift
234
+ if self.shift_size > 0:
235
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
236
+ else:
237
+ x = shifted_x
238
+
239
+ if pad_r > 0 or pad_b > 0:
240
+ x = x[:, :H, :W, :].contiguous()
241
+
242
+ x = x.view(B, H * W, C)
243
+
244
+ # FFN
245
+ x = shortcut + self.drop_path(x)
246
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
247
+
248
+ return x
249
+
250
+
251
+ class PatchMerging(nn.Module):
252
+ """ Patch Merging Layer
253
+ Args:
254
+ dim (int): Number of input channels.
255
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
256
+ """
257
+ def __init__(self, dim, norm_layer=nn.LayerNorm):
258
+ super().__init__()
259
+ self.dim = dim
260
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
261
+ self.norm = norm_layer(4 * dim)
262
+
263
+ def forward(self, x, H, W):
264
+ """ Forward function.
265
+ Args:
266
+ x: Input feature, tensor size (B, H*W, C).
267
+ H, W: Spatial resolution of the input feature.
268
+ """
269
+ B, L, C = x.shape
270
+ assert L == H * W, "input feature has wrong size"
271
+
272
+ x = x.view(B, H, W, C)
273
+
274
+ # padding
275
+ pad_input = (H % 2 == 1) or (W % 2 == 1)
276
+ if pad_input:
277
+ x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2))
278
+
279
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
280
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
281
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
282
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
283
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
284
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
285
+
286
+ x = self.norm(x)
287
+ x = self.reduction(x)
288
+
289
+ return x
290
+
291
+
292
+ class BasicLayer(nn.Module):
293
+ """ A basic Swin Transformer layer for one stage.
294
+ Args:
295
+ dim (int): Number of feature channels
296
+ depth (int): Depths of this stage.
297
+ num_heads (int): Number of attention head.
298
+ window_size (int): Local window size. Default: 7.
299
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
300
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
301
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
302
+ drop (float, optional): Dropout rate. Default: 0.0
303
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
304
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
305
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
306
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
307
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
308
+ """
309
+
310
+ def __init__(self,
311
+ dim,
312
+ depth,
313
+ num_heads,
314
+ window_size=7,
315
+ mlp_ratio=4.,
316
+ qkv_bias=True,
317
+ qk_scale=None,
318
+ drop=0.,
319
+ attn_drop=0.,
320
+ drop_path=0.,
321
+ norm_layer=nn.LayerNorm,
322
+ downsample=None,
323
+ use_checkpoint=False):
324
+ super().__init__()
325
+ self.window_size = window_size
326
+ self.shift_size = window_size // 2
327
+ self.depth = depth
328
+ self.use_checkpoint = use_checkpoint
329
+
330
+ # build blocks
331
+ self.blocks = nn.ModuleList([
332
+ SwinTransformerBlock(
333
+ dim=dim,
334
+ num_heads=num_heads,
335
+ window_size=window_size,
336
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
337
+ mlp_ratio=mlp_ratio,
338
+ qkv_bias=qkv_bias,
339
+ qk_scale=qk_scale,
340
+ drop=drop,
341
+ attn_drop=attn_drop,
342
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
343
+ norm_layer=norm_layer)
344
+ for i in range(depth)])
345
+
346
+ # patch merging layer
347
+ if downsample is not None:
348
+ self.downsample = downsample(dim=dim, norm_layer=norm_layer)
349
+ else:
350
+ self.downsample = None
351
+
352
+ def forward(self, x, H, W):
353
+ """ Forward function.
354
+ Args:
355
+ x: Input feature, tensor size (B, H*W, C).
356
+ H, W: Spatial resolution of the input feature.
357
+ """
358
+
359
+ # calculate attention mask for SW-MSA
360
+ Hp = int(np.ceil(H / self.window_size)) * self.window_size
361
+ Wp = int(np.ceil(W / self.window_size)) * self.window_size
362
+ img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1
363
+ h_slices = (slice(0, -self.window_size),
364
+ slice(-self.window_size, -self.shift_size),
365
+ slice(-self.shift_size, None))
366
+ w_slices = (slice(0, -self.window_size),
367
+ slice(-self.window_size, -self.shift_size),
368
+ slice(-self.shift_size, None))
369
+ cnt = 0
370
+ for h in h_slices:
371
+ for w in w_slices:
372
+ img_mask[:, h, w, :] = cnt
373
+ cnt += 1
374
+
375
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
376
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
377
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
378
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
379
+
380
+ for blk in self.blocks:
381
+ blk.H, blk.W = H, W
382
+ if self.use_checkpoint:
383
+ x = checkpoint.checkpoint(blk, x, attn_mask)
384
+ else:
385
+ x = blk(x, attn_mask)
386
+ if self.downsample is not None:
387
+ x_down = self.downsample(x, H, W)
388
+ Wh, Ww = (H + 1) // 2, (W + 1) // 2
389
+ return x, H, W, x_down, Wh, Ww
390
+ else:
391
+ return x, H, W, x, H, W
392
+
393
+
394
+ class PatchEmbed(nn.Module):
395
+ """ Image to Patch Embedding
396
+ Args:
397
+ patch_size (int): Patch token size. Default: 4.
398
+ in_chans (int): Number of input image channels. Default: 3.
399
+ embed_dim (int): Number of linear projection output channels. Default: 96.
400
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
401
+ """
402
+
403
+ def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
404
+ super().__init__()
405
+ patch_size = to_2tuple(patch_size)
406
+ self.patch_size = patch_size
407
+
408
+ self.in_chans = in_chans
409
+ self.embed_dim = embed_dim
410
+
411
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
412
+ if norm_layer is not None:
413
+ self.norm = norm_layer(embed_dim)
414
+ else:
415
+ self.norm = None
416
+
417
+ def forward(self, x):
418
+ """Forward function."""
419
+ # padding
420
+ _, _, H, W = x.size()
421
+ if W % self.patch_size[1] != 0:
422
+ x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1]))
423
+ if H % self.patch_size[0] != 0:
424
+ x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0]))
425
+
426
+ x = self.proj(x) # B C Wh Ww
427
+ if self.norm is not None:
428
+ Wh, Ww = x.size(2), x.size(3)
429
+ x = x.flatten(2).transpose(1, 2)
430
+ x = self.norm(x)
431
+ x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
432
+
433
+ return x
434
+
435
+
436
+ class SwinTransformer(Backbone):
437
+ """ Swin Transformer backbone.
438
+ A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
439
+ https://arxiv.org/pdf/2103.14030
440
+ Args:
441
+ pretrain_img_size (int): Input image size for training the pretrained model,
442
+ used in absolute position embedding. Default 224.
443
+ patch_size (int | tuple(int)): Patch size. Default: 4.
444
+ in_chans (int): Number of input image channels. Default: 3.
445
+ embed_dim (int): Number of linear projection output channels. Default: 96.
446
+ depths (tuple[int]): Depths of each Swin Transformer stage.
447
+ num_heads (tuple[int]): Number of attention head of each stage.
448
+ window_size (int): Window size. Default: 7.
449
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
450
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
451
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
452
+ drop_rate (float): Dropout rate.
453
+ attn_drop_rate (float): Attention dropout rate. Default: 0.
454
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2.
455
+ norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
456
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
457
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True.
458
+ out_indices (Sequence[int]): Output from which stages.
459
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
460
+ -1 means not freezing any parameters.
461
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
462
+ """
463
+
464
+ def __init__(self,
465
+ pretrain_img_size=224,
466
+ patch_size=4,
467
+ in_chans=3,
468
+ embed_dim=96,
469
+ depths=[2, 2, 6, 2],
470
+ num_heads=[3, 6, 12, 24],
471
+ window_size=7,
472
+ mlp_ratio=4.,
473
+ qkv_bias=True,
474
+ qk_scale=None,
475
+ drop_rate=0.,
476
+ attn_drop_rate=0.,
477
+ drop_path_rate=0.2,
478
+ norm_layer=nn.LayerNorm,
479
+ ape=False,
480
+ patch_norm=True,
481
+ frozen_stages=-1,
482
+ use_checkpoint=False,
483
+ out_features=None):
484
+ super(SwinTransformer, self).__init__()
485
+
486
+ self.pretrain_img_size = pretrain_img_size
487
+ self.num_layers = len(depths)
488
+ self.embed_dim = embed_dim
489
+ self.ape = ape
490
+ self.patch_norm = patch_norm
491
+ self.frozen_stages = frozen_stages
492
+
493
+ self.out_features = out_features
494
+
495
+ # split image into non-overlapping patches
496
+ self.patch_embed = PatchEmbed(
497
+ patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
498
+ norm_layer=norm_layer if self.patch_norm else None)
499
+
500
+ # absolute position embedding
501
+ if self.ape:
502
+ pretrain_img_size = to_2tuple(pretrain_img_size)
503
+ patch_size = to_2tuple(patch_size)
504
+ patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]
505
+
506
+ self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
507
+ trunc_normal_(self.absolute_pos_embed, std=.02)
508
+
509
+ self.pos_drop = nn.Dropout(p=drop_rate)
510
+
511
+ # stochastic depth
512
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
513
+
514
+ self._out_feature_strides = {}
515
+ self._out_feature_channels = {}
516
+
517
+ # build layers
518
+ self.layers = nn.ModuleList()
519
+ for i_layer in range(self.num_layers):
520
+ layer = BasicLayer(
521
+ dim=int(embed_dim * 2 ** i_layer),
522
+ depth=depths[i_layer],
523
+ num_heads=num_heads[i_layer],
524
+ window_size=window_size,
525
+ mlp_ratio=mlp_ratio,
526
+ qkv_bias=qkv_bias,
527
+ qk_scale=qk_scale,
528
+ drop=drop_rate,
529
+ attn_drop=attn_drop_rate,
530
+ drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
531
+ norm_layer=norm_layer,
532
+ downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
533
+ use_checkpoint=use_checkpoint)
534
+ self.layers.append(layer)
535
+
536
+ stage = f'stage{i_layer+2}'
537
+ if stage in self.out_features:
538
+ self._out_feature_channels[stage] = embed_dim * 2 ** i_layer
539
+ self._out_feature_strides[stage] = 4 * 2 ** i_layer
540
+
541
+ num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
542
+ self.num_features = num_features
543
+
544
+ # add a norm layer for each output
545
+ for i_layer in range(self.num_layers):
546
+ stage = f'stage{i_layer+2}'
547
+ if stage in self.out_features:
548
+ layer = norm_layer(num_features[i_layer])
549
+ layer_name = f'norm{i_layer}'
550
+ self.add_module(layer_name, layer)
551
+
552
+ self._freeze_stages()
553
+
554
+ def _freeze_stages(self):
555
+ if self.frozen_stages >= 0:
556
+ self.patch_embed.eval()
557
+ for param in self.patch_embed.parameters():
558
+ param.requires_grad = False
559
+
560
+ if self.frozen_stages >= 1 and self.ape:
561
+ self.absolute_pos_embed.requires_grad = False
562
+
563
+ if self.frozen_stages >= 2:
564
+ self.pos_drop.eval()
565
+ for i in range(0, self.frozen_stages - 1):
566
+ m = self.layers[i]
567
+ m.eval()
568
+ for param in m.parameters():
569
+ param.requires_grad = False
570
+
571
+ def init_weights(self, pretrained=None):
572
+ """Initialize the weights in backbone.
573
+ Args:
574
+ pretrained (str, optional): Path to pre-trained weights.
575
+ Defaults to None.
576
+ """
577
+
578
+ def _init_weights(m):
579
+ if isinstance(m, nn.Linear):
580
+ trunc_normal_(m.weight, std=.02)
581
+ if isinstance(m, nn.Linear) and m.bias is not None:
582
+ nn.init.constant_(m.bias, 0)
583
+ elif isinstance(m, nn.LayerNorm):
584
+ nn.init.constant_(m.bias, 0)
585
+ nn.init.constant_(m.weight, 1.0)
586
+
587
+ self.apply(_init_weights)
588
+
589
+ def forward(self, x):
590
+ """Forward function."""
591
+ x = self.patch_embed(x)
592
+
593
+ Wh, Ww = x.size(2), x.size(3)
594
+ if self.ape:
595
+ # interpolate the position embedding to the corresponding size
596
+ absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
597
+ x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C
598
+ else:
599
+ x = x.flatten(2).transpose(1, 2)
600
+ x = self.pos_drop(x)
601
+
602
+ outs = {}
603
+ for i in range(self.num_layers):
604
+ layer = self.layers[i]
605
+ x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
606
+ name = f'stage{i+2}'
607
+ if name in self.out_features:
608
+ norm_layer = getattr(self, f'norm{i}')
609
+ x_out = norm_layer(x_out)
610
+ out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
611
+ outs[name] = out
612
+
613
+ return outs
614
+
615
+ def train(self, mode=True):
616
+ """Convert the model into training mode while keep layers freezed."""
617
+ super(SwinTransformer, self).train(mode)
618
+ self._freeze_stages()
619
+
620
+ def output_shape(self):
621
+ return {
622
+ name: ShapeSpec(
623
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
624
+ )
625
+ for name in self.out_features
626
+ }
627
+
628
+ @BACKBONE_REGISTRY.register()
629
+ def build_swin_backbone(cfg, input_shape):
630
+ """
631
+ Create a SwinT instance from config.
632
+ Returns:
633
+ SwinTransformer: a :class:`SwinTransformer` instance.
634
+ """
635
+ out_features = cfg.MODEL.SWINT.OUT_FEATURES
636
+
637
+ return SwinTransformer(
638
+ patch_size=cfg.MODEL.SWINT.PATCH_SIZE,
639
+ in_chans=input_shape.channels,
640
+ embed_dim=cfg.MODEL.SWINT.EMBED_DIM,
641
+ depths=cfg.MODEL.SWINT.DEPTHS,
642
+ num_heads=cfg.MODEL.SWINT.NUM_HEADS,
643
+ window_size=cfg.MODEL.SWINT.WINDOW_SIZE,
644
+ mlp_ratio=cfg.MODEL.SWINT.MLP_RATIO,
645
+ qkv_bias=True,
646
+ qk_scale=None,
647
+ drop_rate=0.,
648
+ attn_drop_rate=0.,
649
+ drop_path_rate=cfg.MODEL.SWINT.DROP_PATH_RATE,
650
+ norm_layer=nn.LayerNorm,
651
+ ape=cfg.MODEL.SWINT.APE,
652
+ patch_norm=True,
653
+ frozen_stages=cfg.MODEL.BACKBONE.FREEZE_AT,
654
+ out_features=out_features
655
+ )
656
+
657
+
658
+ @BACKBONE_REGISTRY.register()
659
+ def build_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
660
+ """
661
+ Args:
662
+ cfg: a detectron2 CfgNode
663
+ Returns:
664
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
665
+ """
666
+ bottom_up = build_swin_backbone(cfg, input_shape)
667
+ in_features = cfg.MODEL.FPN.IN_FEATURES
668
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
669
+ backbone = FPN(
670
+ bottom_up=bottom_up,
671
+ in_features=in_features,
672
+ out_channels=out_channels,
673
+ norm=cfg.MODEL.FPN.NORM,
674
+ top_block=LastLevelMaxPool(),
675
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
676
+ )
677
+ return backbone
678
+
679
+ class LastLevelP6(nn.Module):
680
+ """
681
+ This module is used in FCOS to generate extra layers
682
+ """
683
+
684
+ def __init__(self, in_channels, out_channels, in_features="res5"):
685
+ super().__init__()
686
+ self.num_levels = 1
687
+ self.in_feature = in_features
688
+ self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
689
+ for module in [self.p6]:
690
+ weight_init.c2_xavier_fill(module)
691
+
692
+ def forward(self, x):
693
+ p6 = self.p6(x)
694
+ return [p6]
695
+
696
+ @BACKBONE_REGISTRY.register()
697
+ def build_retinanet_swin_fpn_backbone(cfg, input_shape: ShapeSpec):
698
+ """
699
+ Args:
700
+ cfg: a detectron2 CfgNode
701
+ Returns:
702
+ backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
703
+ """
704
+ bottom_up = build_swin_backbone(cfg, input_shape)
705
+ in_features = cfg.MODEL.FPN.IN_FEATURES
706
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
707
+ top_levels = cfg.MODEL.FPN.TOP_LEVELS
708
+ in_channels_top = out_channels
709
+ if top_levels == 2:
710
+ top_block = LastLevelP6P7(in_channels_top, out_channels, "p5")
711
+ if top_levels == 1:
712
+ top_block = LastLevelP6(in_channels_top, out_channels, "p5")
713
+ elif top_levels == 0:
714
+ top_block = None
715
+ backbone = FPN(
716
+ bottom_up=bottom_up,
717
+ in_features=in_features,
718
+ out_channels=out_channels,
719
+ norm=cfg.MODEL.FPN.NORM,
720
+ top_block=top_block,
721
+ fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
722
+ )
723
+ return backbone
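window_partition and window_reverse above are exact inverses as long as H and W are multiples of the window size (the transformer block pads to guarantee this). A small round-trip check (not part of the diff), assuming the vendored module is importable as entityseg.backbone.swin:

import torch
from entityseg.backbone.swin import window_partition, window_reverse  # assumed import path

B, H, W, C, ws = 2, 14, 14, 96, 7
x = torch.randn(B, H, W, C)

windows = window_partition(x, ws)             # (B * H/ws * W/ws, ws, ws, C) -> (8, 7, 7, 96)
restored = window_reverse(windows, ws, H, W)  # back to (2, 14, 14, 96)

assert windows.shape == (B * (H // ws) * (W // ws), ws, ws, C)
assert torch.equal(restored, x)  # partitioning followed by reversing is lossless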
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/config.py ADDED
@@ -0,0 +1,102 @@
1
+ from detectron2.config import CfgNode as CN
2
+
3
+ def add_entity_config(cfg):
4
+ """
5
+ Add config options for EntitySeg.
6
+ """
7
+ ## FCOS Hyper-Parameters
8
+ cfg.MODEL.FCOS = CN()
9
+
10
+ # Head input features and per-level assignment ranges (FCOS is anchor-free)
11
+ cfg.MODEL.FCOS.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"]
12
+ cfg.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
13
+ cfg.MODEL.FCOS.NUM_CLASSES = 1
14
+ cfg.MODEL.FCOS.SIZES_OF_INTEREST = [[-1, 64], [64,128], [128,256], [256,512], [512, 100000000]]
15
+
16
+ # tower
17
+ cfg.MODEL.FCOS.NUM_CLS_CONVS = 4
18
+ cfg.MODEL.FCOS.NUM_BOX_CONVS = 4
19
+ cfg.MODEL.FCOS.NUM_SHARE_CONVS = 0
20
+ cfg.MODEL.FCOS.CENTER_SAMPLE = True
21
+ cfg.MODEL.FCOS.POS_RADIUS = 1.5
22
+ cfg.MODEL.FCOS.LOC_LOSS_TYPE = 'giou'
23
+ cfg.MODEL.FCOS.USE_RELU = True
24
+ cfg.MODEL.FCOS.USE_DEFORMABLE = False
25
+ cfg.MODEL.FCOS.USE_SCALE = True
26
+ cfg.MODEL.FCOS.TOP_LEVELS = 2
27
+ cfg.MODEL.FCOS.NORM = "GN"
28
+
29
+ # loss
30
+ cfg.MODEL.FCOS.PRIOR_PROB = 0.01
31
+ cfg.MODEL.FCOS.LOSS_ALPHA = 0.25
32
+ cfg.MODEL.FCOS.LOSS_GAMMA = 2.0
33
+ cfg.MODEL.FCOS.FB_RATIO = 4.0
34
+ cfg.MODEL.FCOS.CENTER_SAMPLE = True
35
+ cfg.MODEL.FCOS.YIELD_PROPOSAL = False
36
+
37
+ # inference
38
+ cfg.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.05
39
+ cfg.MODEL.FCOS.INFERENCE_TH_TEST = 0.05
40
+ cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN = 1000
41
+ cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST = 1000
42
+ cfg.MODEL.FCOS.NMS_TH = 0.6
43
+ cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN = 100
44
+ cfg.MODEL.FCOS.POST_NMS_TOPK_TEST = 100
45
+ cfg.MODEL.FCOS.THRESH_WITH_CTR = False
46
+
47
+
48
+ ## CONDINST Hyper-Parameters
49
+ cfg.MODEL.CONDINST = CN()
50
+ # the downsampling ratio of the final instance masks to the input image
51
+ cfg.MODEL.CONDINST.MASK_OUT_STRIDE = 4
52
+ cfg.MODEL.CONDINST.MAX_PROPOSALS = 500
53
+ cfg.MODEL.CONDINST.TRAIN_MAX_PROPOSALS_PER_IMAGE = 120
54
+ cfg.MODEL.CONDINST.LOW_LEVEL_DIMENSION = 16
55
+ cfg.MODEL.CONDINST.CLASS_AGNOSTIC = False
56
+
57
+ cfg.MODEL.CONDINST.MASK_HEAD = CN()
58
+ cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS = 8
59
+ cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS = 3
60
+ cfg.MODEL.CONDINST.MASK_HEAD.USE_FP16 = False
61
+ cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS = False
62
+ cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT = 1.0
63
+ cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC = ["111", "110"]
64
+ cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT = [1.0, 1.0]
65
+
66
+ cfg.MODEL.CONDINST.MASK_BRANCH = CN()
67
+ cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS = 8
68
+ cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES = ["p3", "p4", "p5"]
69
+ cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS = 128
70
+ cfg.MODEL.CONDINST.MASK_BRANCH.NORM = "BN"
71
+ cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS = 4
72
+ cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON = False
73
+ cfg.MODEL.CONDINST.MASK_BRANCH.USE_MASK_RESCORE = False
74
+ ## kernel head
75
+ cfg.MODEL.KERNEL_HEAD = CN()
76
+ cfg.MODEL.KERNEL_HEAD.NUM_CONVS = 3
77
+ cfg.MODEL.KERNEL_HEAD.DEFORM = False
78
+ cfg.MODEL.KERNEL_HEAD.COORD = True
79
+ cfg.MODEL.KERNEL_HEAD.CONVS_DIM = 256
80
+ cfg.MODEL.KERNEL_HEAD.NORM = "GN"
81
+
82
+ ## swin transformer backbone
83
+ cfg.MODEL.SWINT = CN()
84
+ cfg.MODEL.SWINT.EMBED_DIM = 96
85
+ cfg.MODEL.SWINT.PATCH_SIZE = 4
86
+ cfg.MODEL.SWINT.OUT_FEATURES = ["stage2", "stage3", "stage4", "stage5"]
87
+ cfg.MODEL.SWINT.DEPTHS = [2, 2, 6, 2]
88
+ cfg.MODEL.SWINT.NUM_HEADS = [3, 6, 12, 24]
89
+ cfg.MODEL.SWINT.WINDOW_SIZE = 7
90
+ cfg.MODEL.SWINT.MLP_RATIO = 4
91
+ cfg.MODEL.SWINT.DROP_PATH_RATE = 0.2
92
+ cfg.MODEL.SWINT.APE = False
93
+
94
+ ## addition: number of extra FPN top levels
95
+ cfg.MODEL.FPN.TOP_LEVELS = 2
96
+
97
+ ## mit former
98
+ cfg.MODEL.MIT_BACKBONE = CN()
99
+ cfg.MODEL.MIT_BACKBONE.NAME = "b0"
100
+
101
+ cfg.SOLVER.OPTIMIZER = "sgd"
102
+ cfg.TEST.CLASS_AGNOSTIC = True
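add_entity_config has to run before merging a YAML file, otherwise detectron2 rejects the FCOS/CONDINST/SWINT keys as unknown. A minimal sketch (not part of the diff) of the expected call order; the config path is shown only for illustration and the import path is an assumption:

from detectron2.config import get_cfg
from entityseg.config import add_entity_config  # assumed import path

cfg = get_cfg()                # detectron2 defaults
add_entity_config(cfg)         # register the extra keys defined above
cfg.merge_from_file("configs/entity_swin_t_1x.yaml")  # illustrative path
cfg.freeze()

print(cfg.MODEL.FCOS.NUM_CLASSES)    # 1 -> class-agnostic entity detection
print(cfg.MODEL.SWINT.OUT_FEATURES)  # ['stage2', 'stage3', 'stage4', 'stage5']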
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/__init__.py ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/detection.py ADDED
@@ -0,0 +1,112 @@
1
+ import math
2
+ from typing import List, Dict
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from detectron2.structures import ImageList
8
+ from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
9
+ from detectron2.modeling.backbone import build_backbone
10
+ from detectron2.layers import ShapeSpec
11
+ from detectron2.modeling.postprocessing import detector_postprocess
12
+
13
+ from .layers import DFConv2d, IOULoss
14
+ # from .outputs_has_ignore import FCOSOutputs
15
+ from .outputs import FCOSOutputs
16
+ from .tower import FCOSHead
17
+
18
+ import pdb
19
+ import cv2
20
+
21
+ INF = 100000000
22
+
23
+ class FCOS(nn.Module):
24
+ def __init__(self, cfg, backbone_shape):
25
+ super().__init__()
26
+
27
+ self.device = torch.device(cfg.MODEL.DEVICE)
28
+ self.in_features = cfg.MODEL.FCOS.IN_FEATURES
29
+ self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
30
+ self.yield_proposal = cfg.MODEL.FCOS.YIELD_PROPOSAL
31
+
32
+ feature_shapes = [backbone_shape[f] for f in self.in_features]
33
+ self.fcos_head = FCOSHead(cfg, feature_shapes)
34
+ self.in_channels_to_top_module = self.fcos_head.in_channels_to_top_module
35
+ self.fcos_outputs = FCOSOutputs(cfg)
36
+ self.to(self.device)
37
+
38
+ def forward_head(self, features, top_module=None):
39
+ features = [features[f] for f in self.in_features]
40
+ pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats = self.fcos_head(features, top_module, self.yield_proposal)
41
+ return pred_class_logits, pred_deltas, pred_centerness, bbox_towers, top_feats
42
+
43
+ def forward(self, images, backbone_features, gt_instances, top_module=None):
44
+ """
45
+ Arguments:
46
+ images (ImageList): preprocessed images from the top-level architecture
47
+ gt_instances (list[Instances]): ground-truth instances for each image (training only)
48
+ Returns:
49
+ result (list[Instances] or dict[Tensor]): the output from the model.
50
+ During training, it returns a dict[Tensor] which contains the losses.
51
+ During testing, it returns a list[Instances] containing additional fields
52
+ such as `scores`, `pred_boxes` and `pred_classes`.
53
+ """
54
+ features = [backbone_features[f] for f in self.in_features]
55
+ locations = self.compute_locations(features)
56
+ logits_pred, reg_pred, ctrness_pred, bbox_towers, top_feats = self.fcos_head(features, top_module)
57
+
58
+ results = {}
59
+ if self.yield_proposal:
60
+ results["features"] = {
61
+ f: b for f, b in zip(self.in_features, bbox_towers)
62
+ }
63
+
64
+ if self.training:
65
+ results, losses = self.fcos_outputs.losses(
66
+ logits_pred, reg_pred, ctrness_pred,
67
+ locations, gt_instances, top_feats
68
+ )
69
+
70
+ if self.yield_proposal:
71
+ with torch.no_grad():
72
+ results["proposals"] = self.fcos_outputs.predict_proposals(
73
+ logits_pred, reg_pred, ctrness_pred,
74
+ locations, images.image_sizes, top_feats
75
+ )
76
+ return results, losses
77
+ else:
78
+ results = self.fcos_outputs.predict_proposals(
79
+ logits_pred, reg_pred, ctrness_pred,
80
+ locations, images.image_sizes, top_feats
81
+ )
82
+
83
+ return results, {}
84
+
85
+ def compute_locations(self, features):
86
+ locations = []
87
+ for level, feature in enumerate(features):
88
+ h, w = feature.size()[-2:]
89
+ locations_per_level = self.compute_locations_per_level(
90
+ h, w, self.fpn_strides[level],
91
+ feature.device
92
+ )
93
+ locations.append(locations_per_level)
94
+ return locations
95
+
96
+ def compute_locations_per_level(self, h, w, stride, device):
97
+ shifts_x = torch.arange(
98
+ 0, w * stride, step=stride,
99
+ dtype=torch.float32, device=device
100
+ )
101
+ shifts_y = torch.arange(
102
+ 0, h * stride, step=stride,
103
+ dtype=torch.float32, device=device
104
+ )
105
+ shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
106
+ shift_x = shift_x.reshape(-1)
107
+ shift_y = shift_y.reshape(-1)
108
+ locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
109
+ return locations
110
+
111
+ def build_det_head(cfg, backbone_shape):
112
+ return FCOS(cfg, backbone_shape)
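compute_locations_per_level maps every feature-map cell (i, j) at stride s to the image point (j*s + s//2, i*s + s//2), roughly the center of its receptive cell. A standalone copy of that computation (not part of the diff), small enough to verify by hand:

import torch

def locations_per_level(h, w, stride, device="cpu"):
    # mirrors FCOS.compute_locations_per_level above
    shifts_x = torch.arange(0, w * stride, step=stride, dtype=torch.float32, device=device)
    shifts_y = torch.arange(0, h * stride, step=stride, dtype=torch.float32, device=device)
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
    return torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1) + stride // 2

print(locations_per_level(2, 2, 8))
# tensor([[ 4.,  4.],
#         [12.,  4.],
#         [ 4., 12.],
#         [12., 12.]])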
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .deform_conv import DFConv2d
2
+ from .iou_loss import IOULoss
3
+ from .ml_nms import ml_nms
4
+ from .conv_with_kaiming_uniform import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/conv_with_kaiming_uniform.py ADDED
@@ -0,0 +1,52 @@
1
+ from torch import nn
2
+
3
+ from detectron2.layers import Conv2d
4
+ from .deform_conv import DFConv2d
5
+ from detectron2.layers.batch_norm import get_norm
6
+
7
+
8
+ def conv_with_kaiming_uniform(
9
+ norm=None, activation=None,
10
+ use_deformable=False, use_sep=False):
11
+ def make_conv(
12
+ in_channels, out_channels, kernel_size, stride=1, dilation=1
13
+ ):
14
+ if use_deformable:
15
+ conv_func = DFConv2d
16
+ else:
17
+ conv_func = Conv2d
18
+ if use_sep:
19
+ assert in_channels == out_channels
20
+ groups = in_channels
21
+ else:
22
+ groups = 1
23
+ conv = conv_func(
24
+ in_channels,
25
+ out_channels,
26
+ kernel_size=kernel_size,
27
+ stride=stride,
28
+ padding=dilation * (kernel_size - 1) // 2,
29
+ dilation=dilation,
30
+ groups=groups,
31
+ bias=(norm is None)
32
+ )
33
+ if not use_deformable:
34
+ # Caffe2 implementation uses XavierFill, which in fact
35
+ # corresponds to kaiming_uniform_ in PyTorch
36
+ nn.init.kaiming_uniform_(conv.weight, a=1)
37
+ if norm is None:
38
+ nn.init.constant_(conv.bias, 0)
39
+ module = [conv,]
40
+ if norm is not None and len(norm) > 0:
41
+ if norm == "GN":
42
+ norm_module = nn.GroupNorm(32, out_channels)
43
+ else:
44
+ norm_module = get_norm(norm, out_channels)
45
+ module.append(norm_module)
46
+ if activation is not None:
47
+ module.append(nn.ReLU(inplace=True))
48
+ if len(module) > 1:
49
+ return nn.Sequential(*module)
50
+ return conv
51
+
52
+ return make_conv
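conv_with_kaiming_uniform is a factory: it returns a make_conv builder whose options (norm, activation, deformable, depthwise) are fixed up front, which is how the mask branch and tower code consume it. A usage sketch (not part of the diff), assuming the package layout above; note that with norm="GN" the group count is hard-coded to 32, so out_channels should be divisible by 32:

import torch
from entityseg.det_head.layers import conv_with_kaiming_uniform  # assumed import path

make_conv = conv_with_kaiming_uniform(norm="GN", activation=True)
block = make_conv(in_channels=256, out_channels=128, kernel_size=3)
# block is nn.Sequential(Conv2d(..., bias=False), GroupNorm(32, 128), ReLU(inplace=True))

y = block(torch.randn(1, 256, 64, 64))
print(y.shape)  # torch.Size([1, 128, 64, 64]); padding keeps the spatial size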
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/deform_conv.py ADDED
@@ -0,0 +1,111 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from detectron2.layers import Conv2d
5
+
6
+ class _NewEmptyTensorOp(torch.autograd.Function):
7
+ @staticmethod
8
+ def forward(ctx, x, new_shape):
9
+ ctx.shape = x.shape
10
+ return x.new_empty(new_shape)
11
+
12
+ @staticmethod
13
+ def backward(ctx, grad):
14
+ shape = ctx.shape
15
+ return _NewEmptyTensorOp.apply(grad, shape), None
16
+
17
+
18
+ class DFConv2d(nn.Module):
19
+ """Deformable convolutional layer"""
20
+ def __init__(
21
+ self,
22
+ in_channels,
23
+ out_channels,
24
+ with_modulated_dcn=True,
25
+ kernel_size=3,
26
+ stride=1,
27
+ groups=1,
28
+ dilation=1,
29
+ deformable_groups=1,
30
+ bias=False,
31
+ padding=None
32
+ ):
33
+ super(DFConv2d, self).__init__()
34
+ if isinstance(kernel_size, (list, tuple)):
35
+ assert isinstance(stride, (list, tuple))
36
+ assert isinstance(dilation, (list, tuple))
37
+ assert len(kernel_size) == 2
38
+ assert len(stride) == 2
39
+ assert len(dilation) == 2
40
+ padding = (
41
+ dilation[0] * (kernel_size[0] - 1) // 2,
42
+ dilation[1] * (kernel_size[1] - 1) // 2
43
+ )
44
+ offset_base_channels = kernel_size[0] * kernel_size[1]
45
+ else:
46
+ padding = dilation * (kernel_size - 1) // 2
47
+ offset_base_channels = kernel_size * kernel_size
48
+ if with_modulated_dcn:
49
+ from detectron2.layers import ModulatedDeformConv  # importing from this module itself would fail
50
+ offset_channels = offset_base_channels * 3 # default: 27
51
+ conv_block = ModulatedDeformConv
52
+ else:
53
+ from detectron2.layers import DeformConv
54
+ offset_channels = offset_base_channels * 2 # default: 18
55
+ conv_block = DeformConv
56
+ self.offset = Conv2d(
57
+ in_channels,
58
+ deformable_groups * offset_channels,
59
+ kernel_size=kernel_size,
60
+ stride=stride,
61
+ padding=padding,
62
+ groups=1,
63
+ dilation=dilation
64
+ )
65
+ for l in [self.offset, ]:
66
+ nn.init.kaiming_uniform_(l.weight, a=1)
67
+ torch.nn.init.constant_(l.bias, 0.)
68
+ self.conv = conv_block(
69
+ in_channels,
70
+ out_channels,
71
+ kernel_size=kernel_size,
72
+ stride=stride,
73
+ padding=padding,
74
+ dilation=dilation,
75
+ groups=groups,
76
+ deformable_groups=deformable_groups,
77
+ bias=bias
78
+ )
79
+ self.with_modulated_dcn = with_modulated_dcn
80
+ self.kernel_size = kernel_size
81
+ self.stride = stride
82
+ self.padding = padding
83
+ self.dilation = dilation
84
+ self.offset_split = offset_base_channels * deformable_groups * 2
85
+
86
+ def forward(self, x, return_offset=False):
87
+ if x.numel() > 0:
88
+ if not self.with_modulated_dcn:
89
+ offset_mask = self.offset(x)
90
+ x = self.conv(x, offset_mask)
91
+ else:
92
+ offset_mask = self.offset(x)
93
+ offset = offset_mask[:, :self.offset_split, :, :]
94
+ mask = offset_mask[:, self.offset_split:, :, :].sigmoid()
95
+ x = self.conv(x, offset, mask)
96
+ if return_offset:
97
+ return x, offset_mask
98
+ return x
99
+ # get output shape
100
+ output_shape = [
101
+ (i + 2 * p - (di * (k - 1) + 1)) // d + 1
102
+ for i, p, di, k, d in zip(
103
+ x.shape[-2:],
104
+ self.padding,
105
+ self.dilation,
106
+ self.kernel_size,
107
+ self.stride
108
+ )
109
+ ]
110
+ output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape
111
+ return _NewEmptyTensorOp.apply(x, output_shape)
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/iou_loss.py ADDED
@@ -0,0 +1,54 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
+ class IOULoss(nn.Module):
6
+ def __init__(self, loc_loss_type='iou'):
7
+ super(IOULoss, self).__init__()
8
+ self.loc_loss_type = loc_loss_type
9
+
10
+ def forward(self, pred, target, weight=None):
11
+ pred_left = pred[:, 0]
12
+ pred_top = pred[:, 1]
13
+ pred_right = pred[:, 2]
14
+ pred_bottom = pred[:, 3]
15
+
16
+ target_left = target[:, 0]
17
+ target_top = target[:, 1]
18
+ target_right = target[:, 2]
19
+ target_bottom = target[:, 3]
20
+
21
+ target_area = (target_left + target_right) * \
22
+ (target_top + target_bottom)
23
+ pred_area = (pred_left + pred_right) * \
24
+ (pred_top + pred_bottom)
25
+
26
+ w_intersect = torch.min(pred_left, target_left) + \
27
+ torch.min(pred_right, target_right)
28
+ h_intersect = torch.min(pred_bottom, target_bottom) + \
29
+ torch.min(pred_top, target_top)
30
+
31
+ g_w_intersect = torch.max(pred_left, target_left) + \
32
+ torch.max(pred_right, target_right)
33
+ g_h_intersect = torch.max(pred_bottom, target_bottom) + \
34
+ torch.max(pred_top, target_top)
35
+ ac_union = g_w_intersect * g_h_intersect
36
+
37
+ area_intersect = w_intersect * h_intersect
38
+ area_union = target_area + pred_area - area_intersect
39
+
40
+ ious = (area_intersect + 1.0) / (area_union + 1.0)
41
+ gious = ious - (ac_union - area_union) / ac_union
42
+ if self.loc_loss_type == 'iou':
43
+ losses = -torch.log(ious)
44
+ elif self.loc_loss_type == 'linear_iou':
45
+ losses = 1 - ious
46
+ elif self.loc_loss_type == 'giou':
47
+ losses = 1 - gious
48
+ else:
49
+ raise NotImplementedError
50
+
51
+ if weight is not None:
52
+ return (losses * weight).sum()
53
+ else:
54
+ return losses.sum()
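IOULoss works on FCOS-style (l, t, r, b) distances from a location to the four box sides, not on (x1, y1, x2, y2) corners, and it returns a sum over locations. A small worked example (not part of the diff), assuming the import path below: a perfect prediction gives IoU = GIoU = 1 and hence zero loss, while a box with a quarter of the area centred at the same point gives IoU = 100/400 = 0.25.

import torch
from entityseg.det_head.layers import IOULoss  # assumed import path

target    = torch.tensor([[10., 10., 10., 10.]])  # (l, t, r, b) distances -> a 20x20 box
pred_good = target.clone()
pred_bad  = torch.tensor([[5., 5., 5., 5.]])      # 10x10 box, same centre

giou = IOULoss(loc_loss_type="giou")
print(giou(pred_good, target).item())  # ~0.0
print(giou(pred_bad, target).item())   # ~0.75, i.e. 1 - IoU, since the enclosing box equals the union here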
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/layers/ml_nms.py ADDED
@@ -0,0 +1,26 @@
1
+ from detectron2.layers import batched_nms
2
+
3
+
4
+ def ml_nms(boxlist, nms_thresh, max_proposals=-1,
5
+ score_field="scores", label_field="labels"):
6
+ """
7
+ Performs non-maximum suppression on a boxlist, with scores specified
8
+ in a boxlist field via score_field.
9
+
10
+ Args:
11
+ boxlist (detectron2.structures.Boxes):
12
+ nms_thresh (float):
13
+ max_proposals (int): if > 0, then only the top max_proposals are kept
14
+ after non-maximum suppression
15
+ score_field (str):
16
+ """
17
+ if nms_thresh <= 0:
18
+ return boxlist
19
+ boxes = boxlist.pred_boxes.tensor
20
+ scores = boxlist.scores
21
+ labels = boxlist.pred_classes
22
+ keep = batched_nms(boxes, scores, labels, nms_thresh)
23
+ if max_proposals > 0:
24
+ keep = keep[: max_proposals]
25
+ boxlist = boxlist[keep]
26
+ return boxlist
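ml_nms expects a detectron2 Instances object carrying pred_boxes, scores and pred_classes, and suppresses overlaps per class via batched_nms. A small sketch (not part of the diff), assuming the import path below; the first two boxes overlap with IoU of roughly 0.82, so only the higher-scoring one survives:

import torch
from detectron2.structures import Boxes, Instances
from entityseg.det_head.layers import ml_nms  # assumed import path

inst = Instances((100, 100))
inst.pred_boxes = Boxes(torch.tensor([[10., 10., 50., 50.],
                                      [12., 12., 52., 52.],
                                      [60., 60., 90., 90.]]))
inst.scores = torch.tensor([0.9, 0.8, 0.7])
inst.pred_classes = torch.zeros(3, dtype=torch.int64)  # single (entity) class

kept = ml_nms(inst, nms_thresh=0.5)
print(len(kept), kept.scores)  # 2 tensor([0.9000, 0.7000])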
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/outputs.py ADDED
@@ -0,0 +1,489 @@
1
+ import logging
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+
6
+ from detectron2.layers import cat
7
+ from detectron2.structures import Instances, Boxes
8
+ from detectron2.utils.comm import get_world_size
9
+ from fvcore.nn import sigmoid_focal_loss_jit
10
+
11
+ from .utils import reduce_sum
12
+ from .layers import ml_nms, IOULoss
13
+ import pdb
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ INF = 100000000
18
+
19
+ def compute_ctrness_targets(reg_targets):
20
+ if len(reg_targets) == 0:
21
+ return reg_targets.new_zeros(len(reg_targets))
22
+ left_right = reg_targets[:, [0, 2]]
23
+ top_bottom = reg_targets[:, [1, 3]]
24
+ ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
25
+ (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
26
+ return torch.sqrt(ctrness)
27
+
28
+ class FCOSOutputs(nn.Module):
29
+ def __init__(self, cfg):
30
+ super(FCOSOutputs, self).__init__()
31
+
32
+ self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
33
+ self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA
34
+ self.center_sample = cfg.MODEL.FCOS.CENTER_SAMPLE
35
+ self.radius = cfg.MODEL.FCOS.POS_RADIUS
36
+ self.pre_nms_thresh_train = cfg.MODEL.FCOS.INFERENCE_TH_TRAIN
37
+ self.pre_nms_topk_train = cfg.MODEL.FCOS.PRE_NMS_TOPK_TRAIN
38
+ self.post_nms_topk_train = cfg.MODEL.FCOS.POST_NMS_TOPK_TRAIN
39
+ self.loc_loss_func = IOULoss(cfg.MODEL.FCOS.LOC_LOSS_TYPE)
40
+
41
+ self.pre_nms_thresh_test = cfg.MODEL.FCOS.INFERENCE_TH_TEST
42
+ self.pre_nms_topk_test = cfg.MODEL.FCOS.PRE_NMS_TOPK_TEST
43
+ self.post_nms_topk_test = cfg.MODEL.FCOS.POST_NMS_TOPK_TEST
44
+ self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
45
+ self.thresh_with_ctr = cfg.MODEL.FCOS.THRESH_WITH_CTR
46
+
47
+ self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
48
+ self.strides = cfg.MODEL.FCOS.FPN_STRIDES
49
+
50
+ self.sizes_of_interest = cfg.MODEL.FCOS.SIZES_OF_INTEREST
51
+
52
+ def _transpose(self, training_targets, num_loc_list):
53
+ '''
54
+ Transpose image-first training targets into level-first ones.
55
+ :return: level first training targets
56
+ '''
57
+ for im_i in range(len(training_targets)):
58
+ training_targets[im_i] = torch.split(
59
+ training_targets[im_i], num_loc_list, dim=0
60
+ )
61
+
62
+ targets_level_first = []
63
+ for targets_per_level in zip(*training_targets):
64
+ targets_level_first.append(
65
+ torch.cat(targets_per_level, dim=0)
66
+ )
67
+ return targets_level_first
68
+
69
+ def _get_ground_truth(self, locations, gt_instances):
70
+ num_loc_list = [len(loc) for loc in locations]
71
+
72
+ # compute locations to size ranges
73
+ loc_to_size_range = []
74
+ for l, loc_per_level in enumerate(locations):
75
+ loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
76
+ loc_to_size_range.append(
77
+ loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)
78
+ )
79
+
80
+ loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
81
+ locations = torch.cat(locations, dim=0)
82
+
83
+ training_targets = self.compute_targets_for_locations(
84
+ locations, gt_instances, loc_to_size_range, num_loc_list
85
+ )
86
+
87
+ training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
88
+ training_targets["im_inds"] = [locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))]
89
+
90
+ # transpose im first training_targets to level first ones
91
+ training_targets = {
92
+ k: self._transpose(v, num_loc_list) for k, v in training_targets.items()
93
+ }
94
+
95
+ training_targets["fpn_levels"] = [
96
+ loc.new_ones(len(loc), dtype=torch.long) * level
97
+ for level, loc in enumerate(training_targets["locations"])
98
+ ]
99
+
100
+ # we normalize reg_targets by FPN's strides here
101
+ reg_targets = training_targets["reg_targets"]
102
+ for l in range(len(reg_targets)):
103
+ reg_targets[l] = reg_targets[l] / float(self.strides[l])
104
+
105
+ return training_targets
106
+
107
+ def get_sample_region(self, boxes, strides, num_loc_list, loc_xs, loc_ys, bitmasks=None, radius=1):
108
+ # pdb.set_trace()
109
+ if bitmasks is not None:
110
+ _, h, w = bitmasks.size()
111
+
112
+ ys = torch.arange(0, h, dtype=torch.float32, device=bitmasks.device)
113
+ xs = torch.arange(0, w, dtype=torch.float32, device=bitmasks.device)
114
+
115
+ m00 = bitmasks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6)
116
+ m10 = (bitmasks * xs).sum(dim=-1).sum(dim=-1)
117
+ m01 = (bitmasks * ys[:, None]).sum(dim=-1).sum(dim=-1)
118
+ center_x = m10 / m00
119
+ center_y = m01 / m00
120
+ center_x = center_x.float()
121
+ center_y = center_y.float()
122
+ else:
123
+ center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5
124
+ center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
125
+ # pdb.set_trace()
126
+ num_gts = boxes.shape[0]
127
+ K = len(loc_xs)
128
+ boxes = boxes[None].expand(K, num_gts, 4)
129
+ center_x = center_x[None].expand(K, num_gts)
130
+ center_y = center_y[None].expand(K, num_gts)
131
+ center_gt = boxes.new_zeros(boxes.shape)
132
+ # no gt
133
+ if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
134
+ return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
135
+ beg = 0
136
+ for level, num_loc in enumerate(num_loc_list):
137
+ end = beg + num_loc
138
+ stride = strides[level] * radius
139
+ xmin = center_x[beg:end] - stride
140
+ ymin = center_y[beg:end] - stride
141
+ xmax = center_x[beg:end] + stride
142
+ ymax = center_y[beg:end] + stride
143
+ # limit sample region in gt
144
+ center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
145
+ center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
146
+ center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
147
+ center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
148
+ beg = end
149
+ left = loc_xs[:, None] - center_gt[..., 0]
150
+ right = center_gt[..., 2] - loc_xs[:, None]
151
+ top = loc_ys[:, None] - center_gt[..., 1]
152
+ bottom = center_gt[..., 3] - loc_ys[:, None]
153
+ center_bbox = torch.stack((left, top, right, bottom), -1)
154
+ inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
155
+ return inside_gt_bbox_mask
156
+
157
+ def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
158
+ labels = []
159
+ reg_targets = []
160
+ target_inds = []
161
+ xs, ys = locations[:, 0], locations[:, 1]
162
+
163
+ num_targets = 0
164
+ for im_i in range(len(targets)):
165
+ targets_per_im = targets[im_i]
166
+ bboxes = targets_per_im.gt_boxes.tensor
167
+ labels_per_im = targets_per_im.gt_classes
168
+
169
+ # no gt
170
+ if bboxes.numel() == 0:
171
+ labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
172
+ reg_targets.append(locations.new_zeros((locations.size(0), 4)))
173
+ target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
174
+ continue
175
+
176
+ area = targets_per_im.gt_boxes.area()
177
+
178
+ l = xs[:, None] - bboxes[:, 0][None]
179
+ t = ys[:, None] - bboxes[:, 1][None]
180
+ r = bboxes[:, 2][None] - xs[:, None]
181
+ b = bboxes[:, 3][None] - ys[:, None]
182
+ reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
183
+
184
+ if self.center_sample:
185
+ if targets_per_im.has("gt_bitmasks_full"):
186
+ bitmasks = targets_per_im.gt_bitmasks_full
187
+ else:
188
+ bitmasks = None
189
+ is_in_boxes = self.get_sample_region(
190
+ bboxes, self.strides, num_loc_list, xs, ys,
191
+ bitmasks=bitmasks, radius=self.radius
192
+ )
193
+ else:
194
+ is_in_boxes = reg_targets_per_im.min(dim=2)[0] > 0
195
+
196
+ max_reg_targets_per_im = reg_targets_per_im.max(dim=2)[0]
197
+ # limit the regression range for each location
198
+ is_cared_in_the_level = \
199
+ (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
200
+ (max_reg_targets_per_im <= size_ranges[:, [1]])
201
+
202
+ locations_to_gt_area = area[None].repeat(len(locations), 1)
203
+ locations_to_gt_area[is_in_boxes == 0] = INF
204
+ locations_to_gt_area[is_cared_in_the_level == 0] = INF
205
+
206
+ # if there are still more than one objects for a location,
207
+ # we choose the one with minimal area
208
+ locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
209
+
210
+ reg_targets_per_im = reg_targets_per_im[range(len(locations)), locations_to_gt_inds]
211
+ target_inds_per_im = locations_to_gt_inds + num_targets
212
+ num_targets += len(targets_per_im)
213
+
214
+ labels_per_im = labels_per_im[locations_to_gt_inds]
215
+ labels_per_im[locations_to_min_area == INF] = self.num_classes
216
+
217
+ labels.append(labels_per_im)
218
+ reg_targets.append(reg_targets_per_im)
219
+ target_inds.append(target_inds_per_im)
220
+
221
+ return {
222
+ "labels": labels,
223
+ "reg_targets": reg_targets,
224
+ "target_inds": target_inds
225
+ }
226
+
227
+ def losses(self, logits_pred, reg_pred, ctrness_pred, locations, gt_instances, top_feats=None):
228
+ """
229
+ Return the losses from a set of FCOS predictions and their associated ground-truth.
230
+
231
+ Returns:
232
+ dict[loss name -> loss value]: A dict mapping from loss name to loss value.
233
+ """
234
+
235
+ training_targets = self._get_ground_truth(locations, gt_instances)
236
+
237
+ # Collect all logits and regression predictions over feature maps
238
+ # and images to arrive at the same shape as the labels and targets
239
+ # The final ordering is L, N, H, W from slowest to fastest axis.
240
+
241
+ instances = Instances((0, 0))
242
+ instances.labels = cat([
243
+ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
244
+ x.reshape(-1) for x in training_targets["labels"]
245
+ ], dim=0)
246
+ instances.gt_inds = cat([
247
+ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
248
+ x.reshape(-1) for x in training_targets["target_inds"]
249
+ ], dim=0)
250
+ instances.im_inds = cat([
251
+ x.reshape(-1) for x in training_targets["im_inds"]
252
+ ], dim=0)
253
+ instances.reg_targets = cat([
254
+ # Reshape: (N, Hi, Wi, 4) -> (N*Hi*Wi, 4)
255
+ x.reshape(-1, 4) for x in training_targets["reg_targets"]
256
+ ], dim=0,)
257
+ instances.locations = cat([
258
+ x.reshape(-1, 2) for x in training_targets["locations"]
259
+ ], dim=0)
260
+ instances.fpn_levels = cat([
261
+ x.reshape(-1) for x in training_targets["fpn_levels"]
262
+ ], dim=0)
263
+
264
+ instances.logits_pred = cat([
265
+ # Reshape: (N, C, Hi, Wi) -> (N, Hi, Wi, C) -> (N*Hi*Wi, C)
266
+ x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits_pred
267
+ ], dim=0,)
268
+ instances.reg_pred = cat([
269
+ # Reshape: (N, B, Hi, Wi) -> (N, Hi, Wi, B) -> (N*Hi*Wi, B)
270
+ x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred
271
+ ], dim=0,)
272
+ instances.ctrness_pred = cat([
273
+ # Reshape: (N, 1, Hi, Wi) -> (N*Hi*Wi,)
274
+ x.permute(0, 2, 3, 1).reshape(-1) for x in ctrness_pred
275
+ ], dim=0,)
276
+
277
+ if len(top_feats) > 0:
278
+ instances.top_feats = cat([
279
+ # Reshape: (N, -1, Hi, Wi) -> (N*Hi*Wi, -1)
280
+ x.permute(0, 2, 3, 1).reshape(-1, x.size(1)) for x in top_feats
281
+ ], dim=0,)
282
+
283
+ return self.fcos_losses(instances)
284
+
285
+ def fcos_losses(self, instances):
286
+ num_classes = instances.logits_pred.size(1)
287
+ assert num_classes == self.num_classes
288
+
289
+ labels = instances.labels.flatten()
290
+
291
+ pos_inds = torch.nonzero(labels != num_classes).squeeze(1)
292
+ num_pos_local = pos_inds.numel()
293
+ num_gpus = get_world_size()
294
+ total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
295
+ num_pos_avg = max(total_num_pos / num_gpus, 1.0)
296
+
297
+ # prepare one_hot
298
+ class_target = torch.zeros_like(instances.logits_pred)
299
+ class_target[pos_inds, labels[pos_inds]] = 1
300
+
301
+ class_loss = sigmoid_focal_loss_jit(
302
+ instances.logits_pred,
303
+ class_target,
304
+ alpha=self.focal_loss_alpha,
305
+ gamma=self.focal_loss_gamma,
306
+ reduction="sum",
307
+ ) / num_pos_avg
308
+
309
+ instances = instances[pos_inds]
310
+ instances.pos_inds = pos_inds
311
+
312
+ ctrness_targets = compute_ctrness_targets(instances.reg_targets)
313
+ ctrness_targets_sum = ctrness_targets.sum()
314
+ loss_denorm = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
315
+ instances.gt_ctrs = ctrness_targets
316
+
317
+ if pos_inds.numel() > 0:
318
+ reg_loss = self.loc_loss_func(
319
+ instances.reg_pred,
320
+ instances.reg_targets,
321
+ ctrness_targets
322
+ ) / loss_denorm
323
+
324
+ ctrness_loss = F.binary_cross_entropy_with_logits(
325
+ instances.ctrness_pred,
326
+ ctrness_targets,
327
+ reduction="sum"
328
+ ) / num_pos_avg
329
+ else:
330
+ reg_loss = instances.reg_pred.sum() * 0
331
+ ctrness_loss = instances.ctrness_pred.sum() * 0
332
+
333
+ losses = {
334
+ "loss_fcos_cls": class_loss,
335
+ "loss_fcos_loc": reg_loss,
336
+ "loss_fcos_ctr": ctrness_loss
337
+ }
338
+ extras = {
339
+ "instances": instances,
340
+ "loss_denorm": loss_denorm
341
+ }
342
+ return extras, losses
343
+
344
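+ # Decode per-level predictions into box proposals: regression outputs are de-normalized
+ # by the FPN stride, filtered per level, then merged and NMS-ed across levels.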
+ def predict_proposals(
345
+ self, logits_pred, reg_pred, ctrness_pred,
346
+ locations, image_sizes, top_feats=None
347
+ ):
348
+ if self.training:
349
+ self.pre_nms_thresh = self.pre_nms_thresh_train
350
+ self.pre_nms_topk = self.pre_nms_topk_train
351
+ self.post_nms_topk = self.post_nms_topk_train
352
+ else:
353
+ self.pre_nms_thresh = self.pre_nms_thresh_test
354
+ self.pre_nms_topk = self.pre_nms_topk_test
355
+ self.post_nms_topk = self.post_nms_topk_test
356
+
357
+ sampled_boxes = []
358
+
359
+ bundle = {
360
+ "l": locations, "o": logits_pred,
361
+ "r": reg_pred, "c": ctrness_pred,
362
+ "s": self.strides,
363
+ }
364
+
365
+ if len(top_feats) > 0:
366
+ bundle["t"] = top_feats
367
+
368
+ for i, per_bundle in enumerate(zip(*bundle.values())):
369
+ # get per-level bundle
370
+ per_bundle = dict(zip(bundle.keys(), per_bundle))
371
+ # recall that during training, we normalize regression targets with FPN's stride.
372
+ # we denormalize them here.
373
+ l = per_bundle["l"]
374
+ o = per_bundle["o"]
375
+ r = per_bundle["r"] * per_bundle["s"]
376
+ c = per_bundle["c"]
377
+ t = per_bundle["t"] if "t" in bundle else None
378
+
379
+ sampled_boxes.append(
380
+ self.forward_for_single_feature_map(
381
+ l, o, r, c, image_sizes, t
382
+ )
383
+ )
384
+
385
+ for per_im_sampled_boxes in sampled_boxes[-1]:
386
+ per_im_sampled_boxes.fpn_levels = l.new_ones(
387
+ len(per_im_sampled_boxes), dtype=torch.long
388
+ ) * i
389
+
390
+ boxlists = list(zip(*sampled_boxes))
391
+ boxlists = [Instances.cat(boxlist) for boxlist in boxlists]
392
+ boxlists = self.select_over_all_levels(boxlists)
393
+
394
+ return boxlists
395
+
396
+ def forward_for_single_feature_map(
397
+ self, locations, logits_pred, reg_pred,
398
+ ctrness_pred, image_sizes, top_feat=None
399
+ ):
400
+ N, C, H, W = logits_pred.shape
401
+
402
+ # put in the same format as locations
403
+ logits_pred = logits_pred.view(N, C, H, W).permute(0, 2, 3, 1)
404
+ logits_pred = logits_pred.reshape(N, -1, C).sigmoid()
405
+ box_regression = reg_pred.view(N, 4, H, W).permute(0, 2, 3, 1)
406
+ box_regression = box_regression.reshape(N, -1, 4)
407
+ ctrness_pred = ctrness_pred.view(N, 1, H, W).permute(0, 2, 3, 1)
408
+ ctrness_pred = ctrness_pred.reshape(N, -1).sigmoid()
409
+ if top_feat is not None:
410
+ top_feat = top_feat.view(N, -1, H, W).permute(0, 2, 3, 1)
411
+ top_feat = top_feat.reshape(N, H * W, -1)
412
+
413
+ # if self.thresh_with_ctr is True, we multiply the classification
414
+ # scores with centerness scores before applying the threshold.
415
+ if self.thresh_with_ctr:
416
+ logits_pred = logits_pred * ctrness_pred[:, :, None]
417
+ candidate_inds = logits_pred > self.pre_nms_thresh
418
+ pre_nms_top_n = candidate_inds.view(N, -1).sum(1)
419
+ pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_topk)
420
+
421
+ if not self.thresh_with_ctr:
422
+ logits_pred = logits_pred * ctrness_pred[:, :, None]
423
+
424
+ results = []
425
+ for i in range(N):
426
+ per_box_cls = logits_pred[i]
427
+ per_candidate_inds = candidate_inds[i]
428
+ per_box_cls = per_box_cls[per_candidate_inds]
429
+
430
+ per_candidate_nonzeros = per_candidate_inds.nonzero()
431
+ per_box_loc = per_candidate_nonzeros[:, 0]
432
+ per_class = per_candidate_nonzeros[:, 1]
433
+
434
+ per_box_regression = box_regression[i]
435
+ per_box_regression = per_box_regression[per_box_loc]
436
+ per_locations = locations[per_box_loc]
437
+ if top_feat is not None:
438
+ per_top_feat = top_feat[i]
439
+ per_top_feat = per_top_feat[per_box_loc]
440
+
441
+ per_pre_nms_top_n = pre_nms_top_n[i]
442
+
443
+ if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
444
+ per_box_cls, top_k_indices = \
445
+ per_box_cls.topk(per_pre_nms_top_n, sorted=False)
446
+ per_class = per_class[top_k_indices]
447
+ per_box_regression = per_box_regression[top_k_indices]
448
+ per_locations = per_locations[top_k_indices]
449
+ if top_feat is not None:
450
+ per_top_feat = per_top_feat[top_k_indices]
451
+
452
+ detections = torch.stack([
453
+ per_locations[:, 0] - per_box_regression[:, 0],
454
+ per_locations[:, 1] - per_box_regression[:, 1],
455
+ per_locations[:, 0] + per_box_regression[:, 2],
456
+ per_locations[:, 1] + per_box_regression[:, 3],
457
+ ], dim=1)
458
+
459
+ boxlist = Instances(image_sizes[i])
460
+ boxlist.pred_boxes = Boxes(detections)
461
+ boxlist.scores = torch.sqrt(per_box_cls)
462
+ boxlist.pred_classes = per_class
463
+ boxlist.locations = per_locations
464
+ if top_feat is not None:
465
+ boxlist.top_feat = per_top_feat
466
+ results.append(boxlist)
467
+
468
+ return results
469
+
470
+ def select_over_all_levels(self, boxlists):
471
+ num_images = len(boxlists)
472
+ results = []
473
+ for i in range(num_images):
474
+ # multiclass nms
475
+ result = ml_nms(boxlists[i], self.nms_thresh)
476
+ number_of_detections = len(result)
477
+
478
+ # Limit to max_per_image detections **over all classes**
479
+ if number_of_detections > self.post_nms_topk > 0:
480
+ cls_scores = result.scores
481
+ image_thresh, _ = torch.kthvalue(
482
+ cls_scores.cpu(),
483
+ number_of_detections - self.post_nms_topk + 1
484
+ )
485
+ keep = cls_scores >= image_thresh.item()
486
+ keep = torch.nonzero(keep).squeeze(1)
487
+ result = result[keep]
488
+ results.append(result)
489
+ return results
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/tower.py ADDED
@@ -0,0 +1,100 @@
1
+ import math
2
+ from typing import List, Dict
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from detectron2.layers import ShapeSpec
8
+
9
+ from .layers import DFConv2d, IOULoss
10
+
11
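+ # Learnable scalar multiplier; FCOSHead keeps one Scale per FPN level to rescale box regression outputs.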
+ class Scale(nn.Module):
12
+ def __init__(self, init_value=1.0):
13
+ super(Scale, self).__init__()
14
+ self.scale = nn.Parameter(torch.FloatTensor([init_value]))
15
+
16
+ def forward(self, input):
17
+ return input * self.scale
18
+
19
+ class FCOSHead(nn.Module):
20
+ def __init__(self, cfg, input_shape: List[ShapeSpec]):
21
+ """
22
+ Arguments:
23
+ in_channels (int): number of channels of the input feature
24
+ """
25
+ super().__init__()
26
+ self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES
27
+ self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
28
+ head_configs = {"cls": (cfg.MODEL.FCOS.NUM_CLS_CONVS, False),
29
+ "bbox": (cfg.MODEL.FCOS.NUM_BOX_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE),
30
+ "share": (cfg.MODEL.FCOS.NUM_SHARE_CONVS, cfg.MODEL.FCOS.USE_DEFORMABLE)}
31
+ norm = None if cfg.MODEL.FCOS.NORM == "none" else cfg.MODEL.FCOS.NORM
32
+
33
+ in_channels = [s.channels for s in input_shape]
34
+ assert len(set(in_channels)) == 1, "Each level must have the same channel!"
35
+ in_channels = in_channels[0]
36
+
37
+ self.in_channels_to_top_module = in_channels
38
+
39
+ for head in head_configs:
40
+ tower = []
41
+ num_convs, use_deformable = head_configs[head]
42
+ if use_deformable:
43
+ conv_func = DFConv2d
44
+ else:
45
+ conv_func = nn.Conv2d
46
+ for i in range(num_convs):
47
+ tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
48
+ if norm == "GN":
49
+ tower.append(nn.GroupNorm(32, in_channels))
50
+ tower.append(nn.ReLU())
51
+ self.add_module('{}_tower'.format(head), nn.Sequential(*tower))
52
+
53
+ self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
54
+ self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1, bias=False)
55
+ self.ctrness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=False)
56
+
57
+ if cfg.MODEL.FCOS.USE_SCALE:
58
+ self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in self.fpn_strides])
59
+ else:
60
+ self.scales = None
61
+
62
+ for modules in [self.cls_tower, self.bbox_tower, self.share_tower, self.cls_logits]:
63
+ for l in modules.modules():
64
+ if isinstance(l, nn.Conv2d):
65
+ torch.nn.init.normal_(l.weight, std=0.01)
66
+ torch.nn.init.constant_(l.bias, 0)
67
+
68
+ for modules in [self.bbox_pred, self.ctrness]:
69
+ for l in modules.modules():
70
+ if isinstance(l, nn.Conv2d):
71
+ torch.nn.init.normal_(l.weight, std=0.01)
72
+
73
+ # initialize the bias for focal loss
74
+ prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
75
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
76
+ torch.nn.init.constant_(self.cls_logits.bias, bias_value)
77
+
78
+ def forward(self, x, top_module=None):
79
+ logits = []
80
+ bbox_reg = []
81
+ ctrness = []
82
+ top_feats = []
83
+ bbox_towers = []
84
+ for l, feature in enumerate(x):
85
+ feature = self.share_tower(feature)
86
+ cls_tower = self.cls_tower(feature)
87
+ bbox_tower = self.bbox_tower(feature)
88
+
89
+ logits.append(self.cls_logits(cls_tower))
90
+ ctrness.append(self.ctrness(bbox_tower))
91
+ reg = self.bbox_pred(bbox_tower)
92
+ if self.scales is not None:
93
+ reg = self.scales[l](reg)
94
+ # Note that we use relu, as in the improved FCOS, instead of exp.
95
+ bbox_reg.append(F.relu(reg))
96
+
97
+ if top_module is not None:
98
+ top_feats.append(top_module(bbox_tower))
99
+
100
+ return logits, bbox_reg, ctrness, bbox_towers, top_feats
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .comm import reduce_sum
2
+ from .measures import *
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/comm.py ADDED
@@ -0,0 +1,52 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.distributed as dist
4
+ from detectron2.utils.comm import get_world_size
5
+
6
+
7
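+ # Sum a tensor across all distributed workers; a no-op when running on a single process.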
+ def reduce_sum(tensor):
8
+ world_size = get_world_size()
9
+ if world_size < 2:
10
+ return tensor
11
+ tensor = tensor.clone()
12
+ dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
13
+ return tensor
14
+
15
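+ # Bilinear upsampling by an integer factor that keeps output pixels aligned with the input grid
+ # (pad, interpolate with align_corners=True, then crop).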
+ def aligned_bilinear(tensor, factor):
16
+ assert tensor.dim() == 4
17
+ assert factor >= 1
18
+ assert int(factor) == factor
19
+
20
+ if factor == 1:
21
+ return tensor
22
+
23
+ h, w = tensor.size()[2:]
24
+ tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate")
25
+ oh = factor * h + 1
26
+ ow = factor * w + 1
27
+ tensor = F.interpolate(
28
+ tensor, size=(oh, ow),
29
+ mode='bilinear',
30
+ align_corners=True
31
+ )
32
+ tensor = F.pad(
33
+ tensor, pad=(factor // 2, 0, factor // 2, 0),
34
+ mode="replicate"
35
+ )
36
+ return tensor[:, :, :oh - 1, :ow - 1]
37
+
38
+
39
+ def compute_locations(h, w, stride, device):
40
+ shifts_x = torch.arange(
41
+ 0, w * stride, step=stride,
42
+ dtype=torch.float32, device=device
43
+ )
44
+ shifts_y = torch.arange(
45
+ 0, h * stride, step=stride,
46
+ dtype=torch.float32, device=device
47
+ )
48
+ shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
49
+ shift_x = shift_x.reshape(-1)
50
+ shift_y = shift_y.reshape(-1)
51
+ locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
52
+ return locations
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/det_head/utils/measures.py ADDED
@@ -0,0 +1,191 @@
1
+ # coding: utf-8
2
+ # Adapted from https://github.com/ShichenLiu/CondenseNet/blob/master/utils.py
3
+ from __future__ import absolute_import
4
+ from __future__ import unicode_literals
5
+ from __future__ import print_function
6
+ from __future__ import division
7
+
8
+ import operator
9
+
10
+ from functools import reduce
11
+
12
+
13
+ def get_num_gen(gen):
14
+ return sum(1 for x in gen)
15
+
16
+ def is_pruned(layer):
17
+ try:
18
+ layer.mask
19
+ return True
20
+ except AttributeError:
21
+ return False
22
+
23
+
24
+ def is_leaf(model):
25
+ return get_num_gen(model.children()) == 0
26
+
27
+
28
+ def get_layer_info(layer):
29
+ layer_str = str(layer)
30
+ type_name = layer_str[:layer_str.find('(')].strip()
31
+ return type_name
32
+
33
+
34
+ def get_layer_param(model):
35
+ return sum([reduce(operator.mul, i.size(), 1) for i in model.parameters()])
36
+
37
+
38
+ ### The input batch size should be 1 to call this function
39
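+ # Accumulate the FLOPs and parameter count of a single layer into the global
+ # count_ops / count_params, dispatching on the layer's class name.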
+ def measure_layer(layer, *args):
40
+ global count_ops, count_params
41
+
42
+ for x in args:
43
+ delta_ops = 0
44
+ delta_params = 0
45
+ multi_add = 1
46
+ type_name = get_layer_info(layer)
47
+
48
+ ### ops_conv
49
+ if type_name in ['Conv2d']:
50
+ out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0] - layer.kernel_size[0]) /
51
+ layer.stride[0] + 1)
52
+ out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1] - layer.kernel_size[1]) /
53
+ layer.stride[1] + 1)
54
+ delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
55
+ delta_params = get_layer_param(layer)
56
+
57
+ elif type_name in ['ConvTranspose2d']:
58
+ _, _, in_h, in_w = x.size()
59
+ out_h = int((in_h-1)*layer.stride[0] - 2 * layer.padding[0] + layer.kernel_size[0] + layer.output_padding[0])
60
+ out_w = int((in_w-1)*layer.stride[1] - 2 * layer.padding[1] + layer.kernel_size[1] + layer.output_padding[1])
61
+ delta_ops = layer.in_channels * layer.out_channels * layer.kernel_size[0] * \
62
+ layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
63
+ delta_params = get_layer_param(layer)
64
+
65
+ ### ops_learned_conv
66
+ elif type_name in ['LearnedGroupConv']:
67
+ measure_layer(layer.relu, x)
68
+ measure_layer(layer.norm, x)
69
+ conv = layer.conv
70
+ out_h = int((x.size()[2] + 2 * conv.padding[0] - conv.kernel_size[0]) /
71
+ conv.stride[0] + 1)
72
+ out_w = int((x.size()[3] + 2 * conv.padding[1] - conv.kernel_size[1]) /
73
+ conv.stride[1] + 1)
74
+ delta_ops = conv.in_channels * conv.out_channels * conv.kernel_size[0] * conv.kernel_size[1] * out_h * out_w / layer.condense_factor * multi_add
75
+ delta_params = get_layer_param(conv) / layer.condense_factor
76
+
77
+ ### ops_nonlinearity
78
+ elif type_name in ['ReLU', 'ReLU6']:
79
+ delta_ops = x.numel()
80
+ delta_params = get_layer_param(layer)
81
+
82
+ ### ops_pooling
83
+ elif type_name in ['AvgPool2d', 'MaxPool2d']:
84
+ in_w = x.size()[2]
85
+ kernel_ops = layer.kernel_size * layer.kernel_size
86
+ out_w = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
87
+ out_h = int((in_w + 2 * layer.padding - layer.kernel_size) / layer.stride + 1)
88
+ delta_ops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops
89
+ delta_params = get_layer_param(layer)
90
+
91
+ elif type_name in ['LastLevelMaxPool']:
92
+ pass
93
+
94
+ elif type_name in ['AdaptiveAvgPool2d']:
95
+ delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
96
+ delta_params = get_layer_param(layer)
97
+
98
+ elif type_name in ['ZeroPad2d', 'RetinaNetPostProcessor']:
99
+ pass
100
+ #delta_ops = x.size()[0] * x.size()[1] * x.size()[2] * x.size()[3]
101
+ #delta_params = get_layer_param(layer)
102
+
103
+ ### ops_linear
104
+ elif type_name in ['Linear']:
105
+ weight_ops = layer.weight.numel() * multi_add
106
+ bias_ops = layer.bias.numel()
107
+ delta_ops = x.size()[0] * (weight_ops + bias_ops)
108
+ delta_params = get_layer_param(layer)
109
+
110
+ ### ops_nothing
111
+ elif type_name in ['BatchNorm2d', 'Dropout2d', 'DropChannel', 'Dropout', 'FrozenBatchNorm2d', 'GroupNorm']:
112
+ delta_params = get_layer_param(layer)
113
+
114
+ elif type_name in ['SumTwo']:
115
+ delta_ops = x.numel()
116
+
117
+ elif type_name in ['AggregateCell']:
118
+ if not layer.pre_transform:
119
+ delta_ops = 2 * x.numel() # twice for each input
120
+ else:
121
+ measure_layer(layer.branch_1, x)
122
+ measure_layer(layer.branch_2, x)
123
+ delta_params = get_layer_param(layer)
124
+
125
+ elif type_name in ['Identity', 'Zero']:
126
+ pass
127
+
128
+ elif type_name in ['Scale']:
129
+ delta_params = get_layer_param(layer)
130
+ delta_ops = x.numel()
131
+
132
+ elif type_name in ['FCOSPostProcessor', 'RPNPostProcessor', 'KeypointPostProcessor',
133
+ 'ROIAlign', 'PostProcessor', 'KeypointRCNNPredictor',
134
+ 'NaiveSyncBatchNorm', 'Upsample', 'Sequential']:
135
+ pass
136
+
137
+ elif type_name in ['DeformConv']:
138
+ # don't count bilinear
139
+ offset_conv = list(layer.parameters())[0]
140
+ delta_ops = reduce(operator.mul, offset_conv.size(), x.size()[2] * x.size()[3])
141
+ out_h = int((x.size()[2] + 2 * layer.padding[0] / layer.dilation[0]
142
+ - layer.kernel_size[0]) / layer.stride[0] + 1)
143
+ out_w = int((x.size()[3] + 2 * layer.padding[1] / layer.dilation[1]
144
+ - layer.kernel_size[1]) / layer.stride[1] + 1)
145
+ delta_ops += layer.in_channels * layer.out_channels * layer.kernel_size[0] * layer.kernel_size[1] * out_h * out_w / layer.groups * multi_add
146
+ delta_params = get_layer_param(layer)
147
+
148
+ ### unknown layer type
149
+ else:
150
+ raise TypeError('unknown layer type: %s' % type_name)
151
+
152
+ count_ops += delta_ops
153
+ count_params += delta_params
154
+ return
155
+
156
+
157
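+ # Temporarily wrap every leaf module's forward with measure_layer, run one forward pass,
+ # restore the original forwards, and return (output, count_ops, count_params).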
+ def measure_model(model, x):
158
+ global count_ops, count_params
159
+ count_ops = 0
160
+ count_params = 0
161
+
162
+ def should_measure(x):
163
+ return is_leaf(x) or is_pruned(x)
164
+
165
+ def modify_forward(model):
166
+ for child in model.children():
167
+ if should_measure(child):
168
+ def new_forward(m):
169
+ def lambda_forward(*args):
170
+ measure_layer(m, *args)
171
+ return m.old_forward(*args)
172
+ return lambda_forward
173
+ child.old_forward = child.forward
174
+ child.forward = new_forward(child)
175
+ else:
176
+ modify_forward(child)
177
+
178
+ def restore_forward(model):
179
+ for child in model.children():
180
+ # leaf node
181
+ if is_leaf(child) and hasattr(child, 'old_forward'):
182
+ child.forward = child.old_forward
183
+ child.old_forward = None
184
+ else:
185
+ restore_forward(child)
186
+
187
+ modify_forward(model)
188
+ out = model.forward(x)
189
+ restore_forward(model)
190
+
191
+ return out, count_ops, count_params
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/__init__.py ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/evaluator/entity_evaluation.py ADDED
@@ -0,0 +1,523 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ import contextlib
3
+ import copy
4
+ import io
5
+ import itertools
6
+ import json
7
+ import logging
8
+ import numpy as np
9
+ import os
10
+ import pickle
11
+ from collections import OrderedDict
12
+ import pycocotools.mask as mask_util
13
+ import torch
14
+ from fvcore.common.file_io import PathManager
15
+ from pycocotools.coco import COCO
16
+ from tabulate import tabulate
17
+
18
+ import detectron2.utils.comm as comm
19
+ from detectron2.data import MetadataCatalog
20
+ from detectron2.data.datasets.coco import convert_to_coco_json
21
+ from detectron2.evaluation.evaluator import DatasetEvaluator
22
+ from detectron2.evaluation.fast_eval_api import COCOeval_opt as COCOeval
23
+ from detectron2.structures import Boxes, BoxMode, pairwise_iou
24
+ from detectron2.utils.logger import create_small_table
25
+ import pdb
26
+
27
+ class COCOEvaluator_ClassAgnostic(DatasetEvaluator):
28
+ """
29
+ Evaluate AR for object proposals, AP for instance detection/segmentation, AP
30
+ for keypoint detection outputs using COCO's metrics.
31
+ See http://cocodataset.org/#detection-eval and
32
+ http://cocodataset.org/#keypoints-eval to understand its metrics.
33
+
34
+ In addition to COCO, this evaluator is able to support any bounding box detection,
35
+ instance segmentation, or keypoint detection dataset.
36
+ """
37
+
38
+ def __init__(self, dataset_name, cfg, distributed, output_dir=None):
39
+ """
40
+ Args:
41
+ dataset_name (str): name of the dataset to be evaluated.
42
+ It must have either the following corresponding metadata:
43
+
44
+ "json_file": the path to the COCO format annotation
45
+
46
+ Or it must be in detectron2's standard dataset format
47
+ so it can be converted to COCO format automatically.
48
+ cfg (CfgNode): config instance
49
+ distributed (bool): if True, will collect results from all ranks and run evaluation
50
+ in the main process.
51
+ Otherwise, will evaluate the results in the current process.
52
+ output_dir (str): optional, an output directory to dump all
53
+ results predicted on the dataset. The dump contains two files:
54
+
55
+ 1. "instance_predictions.pth" a file in torch serialization
56
+ format that contains all the raw original predictions.
57
+ 2. "coco_instances_results.json" a json file in COCO's result
58
+ format.
59
+ """
60
+ self._tasks = self._tasks_from_config(cfg)
61
+ self._distributed = distributed
62
+ self._output_dir = output_dir
63
+
64
+ self._cpu_device = torch.device("cpu")
65
+ self._logger = logging.getLogger(__name__)
66
+
67
+ self._metadata = MetadataCatalog.get(dataset_name)
68
+ if not hasattr(self._metadata, "json_file"):
69
+ self._logger.info(
70
+ f"'{dataset_name}' is not registered by `register_coco_instances`."
71
+ " Therefore trying to convert it to COCO format ..."
72
+ )
73
+
74
+ cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json")
75
+ self._metadata.json_file = cache_path
76
+ convert_to_coco_json(dataset_name, cache_path)
77
+
78
+ # pdb.set_trace()
79
+ # if self._metadata.select:
80
+ # self._metadata.json_file = os.path.join("individual", self._metadata.json_file.split(".")[0]+"_{}.json".format(self._metadata.select))
81
+ json_file = PathManager.get_local_path(self._metadata.json_file)
82
+ with contextlib.redirect_stdout(io.StringIO()):
83
+ self._coco_api = COCO(json_file, cfg.TEST.CLASS_AGNOSTIC)
84
+
85
+ self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS
86
+ # Test set json files do not contain annotations (evaluation must be
87
+ # performed using the COCO evaluation server).
88
+ self._do_evaluation = "annotations" in self._coco_api.dataset
89
+
90
+ def reset(self):
91
+ self._predictions = []
92
+
93
+ def _tasks_from_config(self, cfg):
94
+ """
95
+ Returns:
96
+ tuple[str]: tasks that can be evaluated under the given configuration.
97
+ """
98
+ tasks = ("bbox",)
99
+ if cfg.MODEL.MASK_ON:
100
+ tasks = tasks + ("segm",)
101
+ if cfg.MODEL.KEYPOINT_ON:
102
+ tasks = tasks + ("keypoints",)
103
+ return tasks
104
+
105
+ def process(self, inputs, outputs):
106
+ """
107
+ Args:
108
+ inputs: the inputs to a COCO model (e.g., GeneralizedRCNN).
109
+ It is a list of dict. Each dict corresponds to an image and
110
+ contains keys like "height", "width", "file_name", "image_id".
111
+ outputs: the outputs of a COCO model. It is a list of dicts with key
112
+ "instances" that contains :class:`Instances`.
113
+ """
114
+ for input, output in zip(inputs, outputs):
115
+ prediction = {"image_id": input["image_id"]}
116
+
117
+ # TODO this is ugly
118
+ if "instances" in output:
119
+ instances = output["instances"].to(self._cpu_device)
120
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
121
+ if "proposals" in output:
122
+ prediction["proposals"] = output["proposals"].to(self._cpu_device)
123
+ self._predictions.append(prediction)
124
+
125
+ def evaluate(self):
126
+ if self._distributed:
127
+ comm.synchronize()
128
+ predictions = comm.gather(self._predictions, dst=0)
129
+ predictions = list(itertools.chain(*predictions))
130
+
131
+ if not comm.is_main_process():
132
+ return {}
133
+ else:
134
+ predictions = self._predictions
135
+
136
+ if len(predictions) == 0:
137
+ self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
138
+ return {}
139
+
140
+ if self._output_dir:
141
+ PathManager.mkdirs(self._output_dir)
142
+ file_path = os.path.join(self._output_dir, "instances_predictions.pth")
143
+ with PathManager.open(file_path, "wb") as f:
144
+ torch.save(predictions, f)
145
+
146
+ self._results = OrderedDict()
147
+ if "proposals" in predictions[0]:
148
+ self._eval_box_proposals(predictions)
149
+ if "instances" in predictions[0]:
150
+ self._eval_predictions(set(self._tasks), predictions)
151
+ # Copy so the caller can do whatever with results
152
+ return copy.deepcopy(self._results)
153
+
154
+ def _eval_predictions(self, tasks, predictions):
155
+ """
156
+ Evaluate predictions on the given tasks.
157
+ Fill self._results with the metrics of the tasks.
158
+ """
159
+ self._logger.info("Preparing results for COCO format ...")
160
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
161
+
162
+ # unmap the category ids for COCO
163
+ # if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"):
164
+ # reverse_id_mapping = {
165
+ # v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items()
166
+ # }
167
+ for result in coco_results:
168
+ result["category_id"] = 1
169
+
170
+ if self._output_dir:
171
+ file_path = os.path.join(self._output_dir, "coco_instances_results.json")
172
+ self._logger.info("Saving results to {}".format(file_path))
173
+ with PathManager.open(file_path, "w") as f:
174
+ f.write(json.dumps(coco_results))
175
+ f.flush()
176
+
177
+ if not self._do_evaluation:
178
+ self._logger.info("Annotations are not available for evaluation.")
179
+ return
180
+
181
+ self._logger.info("Evaluating predictions ...")
182
+ if "segmentation" in coco_results[0]:
183
+ tasks = ["bbox", "segm"]
184
+ else:
185
+ tasks = ["bbox"]
186
+ for task in sorted(tasks):
187
+ coco_eval = (
188
+ _evaluate_predictions_on_coco(
189
+ self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas
190
+ )
191
+ if len(coco_results) > 0
192
+ else None # cocoapi does not handle empty results very well
193
+ )
194
+
195
+ res = self._derive_coco_results(
196
+ coco_eval, task
197
+ )
198
+ self._results[task] = res
199
+
200
+ def _eval_box_proposals(self, predictions):
201
+ """
202
+ Evaluate the box proposals in predictions.
203
+ Fill self._results with the metrics for "box_proposals" task.
204
+ """
205
+ if self._output_dir:
206
+ # Saving generated box proposals to file.
207
+ # Predicted box_proposals are in XYXY_ABS mode.
208
+ bbox_mode = BoxMode.XYXY_ABS.value
209
+ ids, boxes, objectness_logits = [], [], []
210
+ for prediction in predictions:
211
+ ids.append(prediction["image_id"])
212
+ boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
213
+ objectness_logits.append(prediction["proposals"].objectness_logits.numpy())
214
+
215
+ proposal_data = {
216
+ "boxes": boxes,
217
+ "objectness_logits": objectness_logits,
218
+ "ids": ids,
219
+ "bbox_mode": bbox_mode,
220
+ }
221
+ with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
222
+ pickle.dump(proposal_data, f)
223
+
224
+ if not self._do_evaluation:
225
+ self._logger.info("Annotations are not available for evaluation.")
226
+ return
227
+
228
+ self._logger.info("Evaluating bbox proposals ...")
229
+ res = {}
230
+ areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
231
+ for limit in [100, 1000]:
232
+ for area, suffix in areas.items():
233
+ stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit)
234
+ key = "AR{}@{:d}".format(suffix, limit)
235
+ res[key] = float(stats["ar"].item() * 100)
236
+ self._logger.info("Proposal metrics: \n" + create_small_table(res))
237
+ self._results["box_proposals"] = res
238
+
239
+ def _derive_coco_results(self, coco_eval, iou_type, class_names=None):
240
+ """
241
+ Derive the desired score numbers from summarized COCOeval.
242
+
243
+ Args:
244
+ coco_eval (None or COCOEval): None represents no predictions from model.
245
+ iou_type (str):
246
+ class_names (None or list[str]): if provided, will use it to predict
247
+ per-category AP.
248
+
249
+ Returns:
250
+ a dict of {metric name: score}
251
+ """
252
+
253
+ metrics = {
254
+ "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
255
+ "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
256
+ "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
257
+ }[iou_type]
258
+
259
+ if coco_eval is None:
260
+ self._logger.warn("No predictions from the model!")
261
+ return {metric: float("nan") for metric in metrics}
262
+
263
+ # the standard metrics
264
+ results = {
265
+ metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
266
+ for idx, metric in enumerate(metrics)
267
+ }
268
+ self._logger.info(
269
+ "Evaluation results for {}: \n".format(iou_type) + create_small_table(results)
270
+ )
271
+ if not np.isfinite(sum(results.values())):
272
+ self._logger.info("Some metrics cannot be computed and are shown as NaN.")
273
+
274
+ if class_names is None or len(class_names) <= 1:
275
+ return results
276
+ # Compute per-category AP
277
+ # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa
278
+ precisions = coco_eval.eval["precision"]
279
+ # precision has dims (iou, recall, cls, area range, max dets)
280
+ assert len(class_names) == precisions.shape[2]
281
+
282
+ results_per_category = []
283
+ for idx, name in enumerate(class_names):
284
+ # area range index 0: all area ranges
285
+ # max dets index -1: typically 100 per image
286
+ precision = precisions[:, :, idx, 0, -1]
287
+ precision = precision[precision > -1]
288
+ ap = np.mean(precision) if precision.size else float("nan")
289
+ results_per_category.append(("{}".format(name), float(ap * 100)))
290
+
291
+ # tabulate it
292
+ N_COLS = min(6, len(results_per_category) * 2)
293
+ results_flatten = list(itertools.chain(*results_per_category))
294
+ results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)])
295
+ table = tabulate(
296
+ results_2d,
297
+ tablefmt="pipe",
298
+ floatfmt=".3f",
299
+ headers=["category", "AP"] * (N_COLS // 2),
300
+ numalign="left",
301
+ )
302
+ self._logger.info("Per-category {} AP: \n".format(iou_type) + table)
303
+
304
+ results.update({"AP-" + name: ap for name, ap in results_per_category})
305
+ return results
306
+
307
+
308
+ def instances_to_coco_json(instances, img_id):
309
+ """
310
+ Dump an "Instances" object to a COCO-format json that's used for evaluation.
311
+
312
+ Args:
313
+ instances (Instances):
314
+ img_id (int): the image id
315
+
316
+ Returns:
317
+ list[dict]: list of json annotations in COCO format.
318
+ """
319
+ num_instance = len(instances)
320
+ if num_instance == 0:
321
+ return []
322
+
323
+ boxes = instances.pred_boxes.tensor.numpy()
324
+ boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
325
+ boxes = boxes.tolist()
326
+ scores = instances.scores.tolist()
327
+ classes = instances.pred_classes.tolist()
328
+
329
+ has_mask = instances.has("pred_masks")
330
+ if has_mask:
331
+ # use RLE to encode the masks, because they are too large and takes memory
332
+ # since this evaluator stores outputs of the entire dataset
333
+ rles = [
334
+ mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
335
+ for mask in instances.pred_masks
336
+ ]
337
+ for rle in rles:
338
+ # "counts" is an array encoded by mask_util as a byte-stream. Python3's
339
+ # json writer which always produces strings cannot serialize a bytestream
340
+ # unless you decode it. Thankfully, utf-8 works out (which is also what
341
+ # the pycocotools/_mask.pyx does).
342
+ rle["counts"] = rle["counts"].decode("utf-8")
343
+
344
+ has_keypoints = instances.has("pred_keypoints")
345
+ if has_keypoints:
346
+ keypoints = instances.pred_keypoints
347
+
348
+ results = []
349
+ for k in range(num_instance):
350
+ result = {
351
+ "image_id": img_id,
352
+ "category_id": classes[k],
353
+ "bbox": boxes[k],
354
+ "score": scores[k],
355
+ }
356
+ if has_mask:
357
+ result["segmentation"] = rles[k]
358
+ if has_keypoints:
359
+ # In COCO annotations,
360
+ # keypoints coordinates are pixel indices.
361
+ # However our predictions are floating point coordinates.
362
+ # Therefore we subtract 0.5 to be consistent with the annotation format.
363
+ # This is the inverse of data loading logic in `datasets/coco.py`.
364
+ keypoints[k][:, :2] -= 0.5
365
+ result["keypoints"] = keypoints[k].flatten().tolist()
366
+ results.append(result)
367
+ return results
368
+
369
+
370
+ # inspired from Detectron:
371
+ # https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa
372
+ def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None):
373
+ """
374
+ Evaluate detection proposal recall metrics. This function is a much
375
+ faster alternative to the official COCO API recall evaluation code. However,
376
+ it produces slightly different results.
377
+ """
378
+ # Record max overlap value for each gt box
379
+ # Return vector of overlap values
380
+ areas = {
381
+ "all": 0,
382
+ "small": 1,
383
+ "medium": 2,
384
+ "large": 3,
385
+ "96-128": 4,
386
+ "128-256": 5,
387
+ "256-512": 6,
388
+ "512-inf": 7,
389
+ }
390
+ area_ranges = [
391
+ [0 ** 2, 1e5 ** 2], # all
392
+ [0 ** 2, 32 ** 2], # small
393
+ [32 ** 2, 96 ** 2], # medium
394
+ [96 ** 2, 1e5 ** 2], # large
395
+ [96 ** 2, 128 ** 2], # 96-128
396
+ [128 ** 2, 256 ** 2], # 128-256
397
+ [256 ** 2, 512 ** 2], # 256-512
398
+ [512 ** 2, 1e5 ** 2],
399
+ ] # 512-inf
400
+ assert area in areas, "Unknown area range: {}".format(area)
401
+ area_range = area_ranges[areas[area]]
402
+ gt_overlaps = []
403
+ num_pos = 0
404
+
405
+ for prediction_dict in dataset_predictions:
406
+ predictions = prediction_dict["proposals"]
407
+
408
+ # sort predictions in descending order
409
+ # TODO maybe remove this and make it explicit in the documentation
410
+ inds = predictions.objectness_logits.sort(descending=True)[1]
411
+ predictions = predictions[inds]
412
+
413
+ ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"])
414
+ anno = coco_api.loadAnns(ann_ids)
415
+ gt_boxes = [
416
+ BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
417
+ for obj in anno
418
+ if obj["iscrowd"] == 0
419
+ ]
420
+ gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes
421
+ gt_boxes = Boxes(gt_boxes)
422
+ gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0])
423
+
424
+ if len(gt_boxes) == 0 or len(predictions) == 0:
425
+ continue
426
+
427
+ valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1])
428
+ gt_boxes = gt_boxes[valid_gt_inds]
429
+
430
+ num_pos += len(gt_boxes)
431
+
432
+ if len(gt_boxes) == 0:
433
+ continue
434
+
435
+ if limit is not None and len(predictions) > limit:
436
+ predictions = predictions[:limit]
437
+
438
+ overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes)
439
+
440
+ _gt_overlaps = torch.zeros(len(gt_boxes))
441
+ for j in range(min(len(predictions), len(gt_boxes))):
442
+ # find which proposal box maximally covers each gt box
443
+ # and get the iou amount of coverage for each gt box
444
+ max_overlaps, argmax_overlaps = overlaps.max(dim=0)
445
+
446
+ # find which gt box is 'best' covered (i.e. 'best' = most iou)
447
+ gt_ovr, gt_ind = max_overlaps.max(dim=0)
448
+ assert gt_ovr >= 0
449
+ # find the proposal box that covers the best covered gt box
450
+ box_ind = argmax_overlaps[gt_ind]
451
+ # record the iou coverage of this gt box
452
+ _gt_overlaps[j] = overlaps[box_ind, gt_ind]
453
+ assert _gt_overlaps[j] == gt_ovr
454
+ # mark the proposal box and the gt box as used
455
+ overlaps[box_ind, :] = -1
456
+ overlaps[:, gt_ind] = -1
457
+
458
+ # append recorded iou coverage level
459
+ gt_overlaps.append(_gt_overlaps)
460
+ gt_overlaps = (
461
+ torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32)
462
+ )
463
+ gt_overlaps, _ = torch.sort(gt_overlaps)
464
+
465
+ if thresholds is None:
466
+ step = 0.05
467
+ thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32)
468
+ recalls = torch.zeros_like(thresholds)
469
+ # compute recall for each iou threshold
470
+ for i, t in enumerate(thresholds):
471
+ recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos)
472
+ # ar = 2 * np.trapz(recalls, thresholds)
473
+ ar = recalls.mean()
474
+ return {
475
+ "ar": ar,
476
+ "recalls": recalls,
477
+ "thresholds": thresholds,
478
+ "gt_overlaps": gt_overlaps,
479
+ "num_pos": num_pos,
480
+ }
481
+
482
+
483
+ def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None):
484
+ """
485
+ Evaluate the coco results using COCOEval API.
486
+ """
487
+ assert len(coco_results) > 0
488
+
489
+ if iou_type == "segm":
490
+ coco_results = copy.deepcopy(coco_results)
491
+ # When evaluating mask AP, if the results contain bbox, cocoapi will
492
+ # use the box area as the area of the instance, instead of the mask area.
493
+ # This leads to a different definition of small/medium/large.
494
+ # We remove the bbox field to let mask AP use mask area.
495
+ for c in coco_results:
496
+ c.pop("bbox", None)
497
+
498
+ coco_dt = coco_gt.loadRes(coco_results)
499
+ coco_eval = COCOeval(coco_gt, coco_dt, iou_type)
500
+
501
+ if iou_type == "keypoints":
502
+ # Use the COCO default keypoint OKS sigmas unless overrides are specified
503
+ if kpt_oks_sigmas:
504
+ assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "pycocotools is too old!"
505
+ coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas)
506
+ # COCOAPI requires every detection and every gt to have keypoints, so
507
+ # we just take the first entry from both
508
+ num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3
509
+ num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3
510
+ num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas)
511
+ assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, (
512
+ f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. "
513
+ f"Ground truth contains {num_keypoints_gt} keypoints. "
514
+ f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. "
515
+ "They have to agree with each other. For meaning of OKS, please refer to "
516
+ "http://cocodataset.org/#keypoints-eval."
517
+ )
518
+
519
+ coco_eval.evaluate()
520
+ coco_eval.accumulate()
521
+ coco_eval.summarize()
522
+
523
+ return coco_eval
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .mask_branch import build_mask_branch
2
+ from .dynamic_mask_head import build_dynamic_mask_head
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/dynamic_mask_head.py ADDED
@@ -0,0 +1,303 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+ from torch import nn
4
+
5
+ from ..det_head.utils.comm import compute_locations, aligned_bilinear
6
+ from fvcore.nn import sigmoid_focal_loss_jit
7
+ from .utils import sigmoid_focal_loss_boundary, sigmoid_focal_loss_boundary_jit
8
+ import pdb
9
+
10
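+ # Per-instance Dice loss on flattened masks: 1 - 2*(x*t).sum() / (x^2.sum() + t^2.sum() + eps).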
+ def dice_coefficient(x, target):
11
+ eps = 1e-5
12
+ n_inst = x.size(0)
13
+ x = x.reshape(n_inst, -1)
14
+ target = target.reshape(n_inst, -1)
15
+ intersection = (x * target).sum(dim=1)
16
+ union = (x ** 2.0).sum(dim=1) + (target ** 2.0).sum(dim=1) + eps
17
+ loss = 1. - (2 * intersection / union)
18
+ return loss
19
+
20
+ def parse_dynamic_params(params, channels, weight_nums, bias_nums):
21
+ assert params.dim() == 2
22
+ assert len(weight_nums) == len(bias_nums)
23
+ assert params.size(1) == sum(weight_nums) + sum(bias_nums)
24
+
25
+ num_insts = params.size(0)
26
+ num_layers = len(weight_nums)
27
+
28
+ params_splits = list(torch.split_with_sizes(params, weight_nums + bias_nums, dim=1))
29
+
30
+ weight_splits = params_splits[:num_layers]
31
+ bias_splits = params_splits[num_layers:]
32
+
33
+ for l in range(num_layers):
34
+ if l < num_layers - 1:
35
+ # out_channels x in_channels x 1 x 1
36
+ weight_splits[l] = weight_splits[l].reshape(num_insts * channels, -1, 1, 1)
37
+ bias_splits[l] = bias_splits[l].reshape(num_insts * channels)
38
+ else:
39
+ # out_channels x in_channels x 1 x 1
40
+ weight_splits[l] = weight_splits[l].reshape(num_insts * 1, -1, 1, 1)
41
+ bias_splits[l] = bias_splits[l].reshape(num_insts)
42
+
43
+ return weight_splits, bias_splits
44
+
45
+ def build_dynamic_mask_head(cfg):
46
+ return DynamicMaskHead(cfg)
47
+
48
+ class DynamicMaskHead(nn.Module):
49
+ def __init__(self, cfg):
50
+ super(DynamicMaskHead, self).__init__()
51
+ self.num_layers = cfg.MODEL.CONDINST.MASK_HEAD.NUM_LAYERS
52
+ self.channels = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS
53
+ self.in_channels = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
54
+ self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
55
+ self.disable_rel_coords = cfg.MODEL.CONDINST.MASK_HEAD.DISABLE_REL_COORDS
56
+ self.cluster_weight = cfg.MODEL.CONDINST.MASK_HEAD.CLUSTER_WEIGHT
57
+
58
+ soi = [64,128,256,512,1024]
59
+ # self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
60
+ self.register_buffer("sizes_of_interest", torch.tensor(soi))
61
+
62
+ weight_nums, bias_nums = [], []
63
+ for l in range(self.num_layers):
64
+ if l == 0:
65
+ if not self.disable_rel_coords:
66
+ weight_nums.append((self.in_channels + 2) * self.channels)
67
+ else:
68
+ weight_nums.append(self.in_channels * self.channels)
69
+ bias_nums.append(self.channels)
70
+ elif l == self.num_layers - 1:
71
+ weight_nums.append(self.channels * 1)
72
+ bias_nums.append(1)
73
+ else:
74
+ weight_nums.append(self.channels * self.channels)
75
+ bias_nums.append(self.channels)
76
+
77
+ self.weight_nums = weight_nums
78
+ self.bias_nums = bias_nums
79
+ self.num_gen_params = sum(weight_nums) + sum(bias_nums)
80
+
81
+ stable_conv_1 = nn.Sequential(nn.Conv2d(10,8,kernel_size=3, stride=1, padding=1),nn.ReLU())
82
+ torch.nn.init.normal_(stable_conv_1[0].weight, std=0.01)
83
+ torch.nn.init.constant_(stable_conv_1[0].bias, 0)
84
+
85
+ stable_conv_2 = nn.Sequential(nn.Conv2d(8,8,kernel_size=3, stride=1, padding=1),nn.ReLU())
86
+ torch.nn.init.normal_(stable_conv_2[0].weight, std=0.01)
87
+ torch.nn.init.constant_(stable_conv_2[0].bias, 0)
88
+
89
+ stable_conv_3 = nn.Conv2d(8, 1, kernel_size=3, stride=1, padding=1)
90
+ torch.nn.init.normal_(stable_conv_3.weight, std=0.01)
91
+ torch.nn.init.constant_(stable_conv_3.bias, 0)
92
+ self.stable = nn.ModuleList([stable_conv_1, stable_conv_2, stable_conv_3])
93
+
94
+ self.general_choose = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC
95
+ self.general_choose_weight = cfg.MODEL.CONDINST.MASK_HEAD.DYNAMIC_WEIGHT
96
+ self.key_weight = dict()
97
+ for key, value in zip(self.general_choose, self.general_choose_weight):
98
+ self.key_weight[key]=value
99
+
100
+
101
+ def mask_heads_forward(self, features, weights, biases, num_insts):
102
+ '''
103
+ :param features
104
+ :param weights: [w0, w1, ...]
105
+ :param bias: [b0, b1, ...]
106
+ :return:
107
+ '''
108
+ assert features.dim() == 4
109
+ n_layers = len(weights)
110
+ x = features
111
+ mid_features = []
112
+ for i, (w, b) in enumerate(zip(weights, biases)):
113
+ x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts)
114
+ if i < n_layers - 1:
115
+ x = F.relu(x)
116
+ mid_features.append(x)
117
+ return x, mid_features
118
+
119
+ def mask_heads_forward_split(self, features, weight, bias, num_insts, has_relu=True):
120
+ '''
121
+ :param features
122
+ :param weights: [w0, w1, ...]
123
+ :param bias: [b0, b1, ...]
124
+ :return:
125
+ '''
126
+ assert features.dim() == 4
127
+ # n_layers = len(weights)
128
+ x = features
129
+ x = F.conv2d(x, weight, bias=bias, stride=1, padding=0, groups=num_insts)
130
+ if has_relu:
131
+ x = F.relu(x)
132
+ return x
133
+
134
+ def mask_heads_forward_with_coords_test(self, mask_feats, mask_feat_stride, instances):
135
+ locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
136
+ n_inst = len(instances)
137
+
138
+ im_inds = instances.im_inds
139
+ mask_head_params = instances.mask_head_params
140
+
141
+ N, _, H, W = mask_feats.size()
142
+
143
+ if not self.disable_rel_coords:
144
+ instance_locations = instances.locations
145
+ relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
146
+ relative_coords = relative_coords.permute(0, 2, 1).float()
147
+ soi = self.sizes_of_interest.float()[instances.fpn_levels]
148
+ relative_coords = relative_coords / soi.reshape(-1, 1, 1)
149
+ relative_coords = relative_coords.to(dtype=mask_feats.dtype)
150
+
151
+ mask_head_inputs = torch.cat([
152
+ relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)
153
+ ], dim=1)
154
+ else:
155
+ mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)
156
+
157
+ mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)
158
+
159
+ weights, biases = parse_dynamic_params(
160
+ mask_head_params, self.channels,
161
+ self.weight_nums, self.bias_nums
162
+ )
163
+
164
+ mask_logits, mid_features = self.mask_heads_forward(mask_head_inputs, weights, biases, n_inst)
165
+
166
+ mask_logits = mask_logits.reshape(-1, 1, H, W)
167
+
168
+ assert mask_feat_stride >= self.mask_out_stride
169
+ assert mask_feat_stride % self.mask_out_stride == 0
170
+ mask_logits = aligned_bilinear(mask_logits, int(mask_feat_stride / self.mask_out_stride))
171
+
172
+ return mask_logits.sigmoid()
173
+
174
+ def mask_heads_forward_with_coords(self, mask_feats, mask_feat_stride, instances, gt_bitmasks, ignore_maps):
175
+ locations = compute_locations(mask_feats.size(2), mask_feats.size(3), stride=mask_feat_stride, device=mask_feats.device)
176
+ n_inst = len(instances)
177
+
178
+ im_inds = instances.im_inds
179
+ mask_head_params = instances.mask_head_params
180
+
181
+ # clusters
182
+ gt_inds = instances.gt_inds
183
+ instance_locations = instances.locations
184
+ fpn_levels = instances.fpn_levels
185
+
186
+ clusters_ids = []
187
+ clusters_imgids = []
188
+ clusters_gt_masks = []
189
+ gt_unique_inds = torch.unique(gt_inds)
190
+ for gt_ind in gt_unique_inds:
191
+ gt_ind = int(gt_ind)
192
+ clusters_gt_masks.append(gt_bitmasks[gt_ind])
193
+ im_ind = int(torch.unique(im_inds[(gt_inds == gt_ind)]))
194
+ clusters_ids.append(gt_ind)
195
+ clusters_imgids.append(im_ind)
196
+
197
+ clusters_ids = torch.tensor(clusters_ids).cuda()
198
+ clusters_imgids = torch.tensor(clusters_imgids)
199
+ clusters_gt_masks = torch.stack(clusters_gt_masks, dim=0)
200
+ n_clusters = len(clusters_ids)
201
+
202
+ N, _, H, W = mask_feats.size()
203
+
204
+ if not self.disable_rel_coords:
205
+ instance_locations = instances.locations
206
+ relative_coords = instance_locations.reshape(-1, 1, 2) - locations.reshape(1, -1, 2)
207
+ relative_coords = relative_coords.permute(0, 2, 1).float()
208
+ soi = self.sizes_of_interest.float()[instances.fpn_levels]
209
+ relative_coords = relative_coords / soi.reshape(-1, 1, 1)
210
+ relative_coords = relative_coords.to(dtype=mask_feats.dtype)
211
+ mask_head_inputs = torch.cat([relative_coords, mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)], dim=1)
212
+ else:
213
+ mask_head_inputs = mask_feats[im_inds].reshape(n_inst, self.in_channels, H * W)
214
+
215
+ # mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W)
216
+ mask_head_inputs = mask_head_inputs.reshape(n_inst, self.in_channels+2, H, W)
217
+ weights, biases = parse_dynamic_params(mask_head_params, self.channels, self.weight_nums, self.bias_nums)
218
+
219
+ feature0 = self.stable[0](mask_head_inputs)
220
+ feature1 = self.mask_heads_forward_split(mask_head_inputs.reshape(1, -1, H, W), weights[0], biases[0], n_inst).reshape(n_inst, -1, H, W)
221
+
222
+ feature00 = self.stable[1](feature0)
223
+ feature01 = self.mask_heads_forward_split(feature0.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)
224
+ feature10 = self.stable[1](feature1)
225
+ feature11 = self.mask_heads_forward_split(feature1.reshape(1, -1, H, W), weights[1], biases[1], n_inst).reshape(n_inst, -1, H, W)
226
+
227
+ feature001 = self.mask_heads_forward_split(feature00.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
228
+ feature010 = self.stable[2](feature01)
229
+ feature011 = self.mask_heads_forward_split(feature01.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
230
+
231
+ feature100 = self.stable[2](feature10)
232
+ feature101 = self.mask_heads_forward_split(feature10.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
233
+ feature110 = self.stable[2](feature11)
234
+ feature111 = self.mask_heads_forward_split(feature11.reshape(1, -1, H, W), weights[2], biases[2], n_inst, has_relu=False).reshape(n_inst, -1, H, W)
235
+
236
+ mask_logits_clusters = []
237
+ for gt_ind in clusters_ids:
238
+ gt_ind = int(gt_ind)
239
+ mask_logits_clusters.append(torch.mean(feature111[gt_inds==gt_ind], dim=0))
240
+ mask_logits_clusters = torch.stack(mask_logits_clusters, dim=0)
241
+ mask_logits_clusters = mask_logits_clusters.reshape(-1, 1, H, W)
242
+ mask_logits_clusters = aligned_bilinear(mask_logits_clusters, int(mask_feat_stride / self.mask_out_stride))
243
+ # clusters
244
+ unique_img_inds = torch.unique(clusters_imgids)
245
+ mask_logits_clusters_imgs = []
246
+ mask_gt_clusters_imgs = []
247
+ for img_ind in unique_img_inds:
248
+ img_ind = int(img_ind)
249
+ mask_logits_clusters_per_img = mask_logits_clusters[clusters_imgids==img_ind]
250
+ mask_logits_clusters_per_img = F.softmax(mask_logits_clusters_per_img.squeeze(1),dim=0).unsqueeze(1)
251
+
252
+ ignore_map = ignore_maps[img_ind].detach()
253
+ finds_y, finds_x = torch.nonzero(ignore_map, as_tuple=True)
254
+
255
+ mask_logits_clusters_per_img = mask_logits_clusters_per_img.clone()
256
+ mask_logits_clusters_per_img[...,finds_y,finds_x] = 0
257
+
258
+ mask_logits_clusters_imgs.append(mask_logits_clusters_per_img)
259
+ mask_gt_clusters_imgs.append(clusters_gt_masks[clusters_imgids==img_ind])
260
+ mask_logits_clusters_imgs = torch.cat(mask_logits_clusters_imgs, dim=0)
261
+ mask_gt_clusters_imgs = torch.cat(mask_gt_clusters_imgs, dim=0)
262
+
263
+ select_features = {}
264
+ for cid in self.general_choose:
265
+ select_feature = locals()["feature{}".format(cid)]
266
+ select_feature = aligned_bilinear(select_feature, int(mask_feat_stride / self.mask_out_stride))
267
+ select_features[cid] = select_feature.sigmoid()
268
+
269
+ return select_features, mask_logits_clusters_imgs, mask_gt_clusters_imgs.unsqueeze(1)
270
+
271
+ def __call__(self, mask_feats, mask_feat_stride, pred_instances, gt_instances=None):
272
+ if self.training:
273
+ gt_inds = pred_instances.gt_inds
274
+ gt_bitmasks_s = torch.cat([per_im.gt_bitmasks for per_im in gt_instances])
275
+ gt_bitmasks = gt_bitmasks_s[gt_inds].unsqueeze(dim=1).to(dtype=mask_feats.dtype)
276
+
277
+ bitmasks_full = []
278
+ for gt_instance in gt_instances:
279
+ bitmasks_full.append(gt_instance.gt_bitmasks.sum(dim=0))
280
+ bitmasks_full = torch.stack(bitmasks_full)
281
+ ignore_map = 1-bitmasks_full
282
+
283
+ losses = {}
284
+ if len(pred_instances) == 0:
285
+ loss_mask = mask_feats.sum() * 0 + pred_instances.mask_head_params.sum() * 0
286
+ for key, value in self.key_weight.items():
287
+ losses["loss_mask_bank_{}".format(key)] = loss_mask
288
+ losses["loss_mask_cluster"] = loss_mask
289
+ else:
290
+ select_scores, mask_logits_clusters, mask_gts_clusters = self.mask_heads_forward_with_coords(mask_feats, mask_feat_stride, pred_instances, gt_bitmasks_s, ignore_map)
291
+ for key, value in select_scores.items():
292
+ losses["loss_mask_bank_{}".format(key)] = dice_coefficient(value, gt_bitmasks).mean() * self.key_weight[key]
293
+
294
+ mask_clusters_losses = dice_coefficient(mask_logits_clusters, mask_gts_clusters)
295
+ mask_clusters_losses = mask_clusters_losses.mean()
296
+ losses["loss_mask_cluster"] = mask_clusters_losses * self.cluster_weight
297
+ return losses
298
+ else:
299
+ if len(pred_instances) > 0:
300
+ mask_scores = self.mask_heads_forward_with_coords_test(mask_feats, mask_feat_stride, pred_instances)
301
+ pred_instances.pred_global_masks = mask_scores.float()
302
+
303
+ return pred_instances
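Vendoring note (not from the upstream file): the dynamic mask head above applies a tiny, instance-specific conv tower by packing every instance into the channel dimension and running a single grouped convolution, in the spirit of CondInst's parse_dynamic_params / mask_heads_forward helpers. A minimal sketch of that trick with made-up shapes:

import torch
import torch.nn.functional as F

def dynamic_conv1x1(features, weights, biases):
    # features: (n_inst, c_in, H, W); weights: (n_inst, c_out, c_in); biases: (n_inst, c_out)
    n_inst, c_in, H, W = features.shape
    c_out = weights.shape[1]
    x = features.reshape(1, n_inst * c_in, H, W)        # pack instances into the channel axis
    w = weights.reshape(n_inst * c_out, c_in, 1, 1)     # one 1x1 filter bank per instance
    b = biases.reshape(n_inst * c_out)
    y = F.conv2d(x, w, bias=b, groups=n_inst)           # one grouped conv == n_inst small convs
    return y.reshape(n_inst, c_out, H, W)

feats = torch.randn(3, 10, 28, 28)                      # e.g. 8 mask channels + 2 rel. coords
w0, b0 = torch.randn(3, 8, 10), torch.randn(3, 8)
print(dynamic_conv1x1(feats, w0, b0).shape)             # torch.Size([3, 8, 28, 28])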
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/mask_branch.py ADDED
@@ -0,0 +1,71 @@
1
+ from typing import Dict
2
+ import math
3
+
4
+ import torch
5
+ from torch import nn
6
+ import pdb
7
+ from fvcore.nn import sigmoid_focal_loss_jit
8
+ from detectron2.layers import ShapeSpec
9
+
10
+ from ..det_head.layers import conv_with_kaiming_uniform
11
+ from ..det_head.utils.comm import aligned_bilinear
12
+
13
+ INF = 100000000
14
+
15
+ def build_mask_branch(cfg, input_shape):
16
+ return MaskBranch(cfg, input_shape)
17
+
18
+ class MaskBranch(nn.Module):
19
+ def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
20
+ super().__init__()
21
+ self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
22
+ self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
23
+ self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
24
+ norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
25
+ num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
26
+ channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
27
+ self.out_stride = input_shape[self.in_features[0]].stride
28
+
29
+ feature_channels = {k: v.channels for k, v in input_shape.items()}
30
+
31
+ conv_block = conv_with_kaiming_uniform(norm, activation=True)
32
+
33
+ self.refine = nn.ModuleList()
34
+ for in_feature in self.in_features:
35
+ self.refine.append(conv_block(
36
+ feature_channels[in_feature],
37
+ channels, 3, 1
38
+ ))
39
+
40
+ tower = []
41
+ for i in range(num_convs):
42
+ tower.append(conv_block(
43
+ channels, channels, 3, 1
44
+ ))
45
+ tower.append(nn.Conv2d(
46
+ channels, max(self.num_outputs, 1), 1
47
+ ))
48
+ self.add_module('tower', nn.Sequential(*tower))
49
+
50
+ def forward(self, features, gt_instances=None):
51
+ for i, f in enumerate(self.in_features):
52
+ if i == 0:
53
+ x = self.refine[i](features[f])
54
+ else:
55
+ x_p = self.refine[i](features[f])
56
+
57
+ target_h, target_w = x.size()[2:]
58
+ h, w = x_p.size()[2:]
59
+ assert target_h % h == 0
60
+ assert target_w % w == 0
61
+ factor_h, factor_w = target_h // h, target_w // w
62
+ assert factor_h == factor_w
63
+ x_p = aligned_bilinear(x_p, factor_h)
64
+ x = x + x_p
65
+
66
+ mask_feats = self.tower(x)
67
+
68
+ if self.num_outputs == 0:
69
+ mask_feats = mask_feats[:, :self.num_outputs]
70
+
71
+ return mask_feats
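Vendoring note: MaskBranch.forward fuses the refined FPN levels by upsampling each coarser level to the resolution of the first in_feature and summing. A rough, self-contained sketch of that pattern, using plain F.interpolate in place of the repo's aligned_bilinear and purely illustrative shapes:

import torch
import torch.nn.functional as F

p3 = torch.randn(1, 128, 100, 136)   # stride-8 level after refine[0]
p4 = torch.randn(1, 128, 50, 68)     # stride-16 level after refine[1]
p5 = torch.randn(1, 128, 25, 34)     # stride-32 level after refine[2]

x = p3
for x_p in (p4, p5):
    factor = x.shape[-2] // x_p.shape[-2]                     # integer upsampling factor
    x = x + F.interpolate(x_p, scale_factor=factor,
                          mode="bilinear", align_corners=True)
print(x.shape)                                                # torch.Size([1, 128, 100, 136])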
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/mask_head/utils.py ADDED
@@ -0,0 +1,53 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+ import pdb
4
+
5
+ def sigmoid_focal_loss_boundary(
6
+ inputs: torch.Tensor,
7
+ targets: torch.Tensor,
8
+ boundary: torch.Tensor,
9
+ alpha: float = -1,
10
+ gamma: float = 2,
11
+ reduction: str = "none",
12
+ ) -> torch.Tensor:
13
+ """
14
+ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
15
+ Args:
16
+ inputs: A float tensor of arbitrary shape.
17
+ The predictions for each example.
18
+ targets: A float tensor with the same shape as inputs. Stores the binary
19
+ classification label for each element in inputs
20
+ (0 for the negative class and 1 for the positive class).
21
+ alpha: (optional) Weighting factor in range (0,1) to balance
22
+ positive vs negative examples. Default = -1 (no weighting).
23
+ gamma: Exponent of the modulating factor (1 - p_t) to
24
+ balance easy vs hard examples.
25
+ reduction: 'none' | 'mean' | 'sum'
26
+ 'none': No reduction will be applied to the output.
27
+ 'mean': The output will be averaged.
28
+ 'sum': The output will be summed.
29
+ Returns:
30
+ Loss tensor with the reduction option applied.
31
+ """
32
+ p = torch.sigmoid(inputs)
33
+ ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
34
+ p_t = p * targets + (1 - p) * (1 - targets)
35
+ loss = ce_loss * ((1 - p_t) ** gamma)
36
+
37
+ if alpha >= 0:
38
+ alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
39
+ loss = alpha_t * loss
40
+
41
+ loss = loss * boundary
42
+ # pdb.set_trace()
43
+ if reduction == "mean":
44
+ loss = loss.mean()
45
+ elif reduction == "sum":
46
+ loss = loss.sum()
47
+
48
+ return loss
49
+
50
+
51
+ sigmoid_focal_loss_boundary_jit = torch.jit.script(
52
+ sigmoid_focal_loss_boundary
53
+ ) # type: torch.jit.ScriptModule
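Vendoring note: a hedged usage sketch for sigmoid_focal_loss_boundary defined above. The import path assumes EntitySeg's root is on PYTHONPATH, and the tensors plus the normalization by boundary pixels are illustrative assumptions rather than code taken from this repo:

import torch
# assumes the EntitySeg project root is importable
from entityseg.mask_head.utils import sigmoid_focal_loss_boundary

logits   = torch.randn(2, 1, 64, 64)                       # raw predictions
targets  = torch.randint(0, 2, (2, 1, 64, 64)).float()     # binary labels
boundary = torch.zeros(2, 1, 64, 64)
boundary[..., 30:34, :] = 1.0                               # weight only a boundary band

loss = sigmoid_focal_loss_boundary(logits, targets, boundary,
                                   alpha=0.25, gamma=2.0, reduction="sum")
loss = loss / boundary.sum().clamp(min=1.0)                 # normalize by boundary pixel count
print(float(loss))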
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/__init__.py ADDED
File without changes
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/deformable_conv_with_off.py ADDED
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding:utf-8 -*-
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from detectron2.layers.deform_conv import DeformConv, ModulatedDeformConv
7
+
8
+
9
+ class DeformConvWithOff(nn.Module):
10
+
11
+ def __init__(self, in_channels, out_channels,
12
+ kernel_size=3, stride=1, padding=1,
13
+ dilation=1, deformable_groups=1):
14
+ super(DeformConvWithOff, self).__init__()
15
+ self.offset_conv = nn.Conv2d(
16
+ in_channels,
17
+ deformable_groups * 2 * kernel_size * kernel_size,
18
+ kernel_size=kernel_size,
19
+ stride=stride,
20
+ padding=padding,
21
+ )
22
+ self.dcn = DeformConv(
23
+ in_channels, out_channels, kernel_size=kernel_size,
24
+ stride=stride, padding=padding, dilation=dilation,
25
+ deformable_groups=deformable_groups,
26
+ )
27
+
28
+ def forward(self, input):
29
+ offset = self.offset_conv(input)
30
+ output = self.dcn(input, offset)
31
+ return output
32
+
33
+ class ModulatedDeformConvWithOff(nn.Module):
34
+ def __init__(self, in_channels, out_channels,
35
+ kernel_size=3, stride=1, padding=1,
36
+ dilation=1, deformable_groups=1,
37
+ bias=True, norm=None, activation=None,):
38
+ super(ModulatedDeformConvWithOff, self).__init__()
39
+ self.offset_mask_conv = nn.Conv2d(
40
+ in_channels,
41
+ deformable_groups * 3 * kernel_size * kernel_size,
42
+ kernel_size=kernel_size,
43
+ stride=stride,
44
+ padding=padding,
45
+ )
46
+ self.dcnv2 = ModulatedDeformConv(
47
+ in_channels, out_channels, kernel_size=kernel_size,
48
+ stride=stride, padding=padding, dilation=dilation,
49
+ deformable_groups=deformable_groups,
50
+ bias=bias, norm=norm, activation=activation,
51
+ )
52
+
53
+ def forward(self, input):
54
+ x = self.offset_mask_conv(input)
55
+ o1, o2, mask = torch.chunk(x, 3, dim=1)
56
+ offset = torch.cat((o1, o2), dim=1)
57
+ mask = torch.sigmoid(mask)
58
+ output = self.dcnv2(input, offset, mask)
59
+ return output
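Vendoring note: a small sanity-check sketch (assumed shapes) of the channel layout ModulatedDeformConvWithOff relies on: offset_mask_conv emits 3*k*k channels per deformable group, split into 2*k*k (x, y) offsets and k*k modulation masks:

import torch

k, groups = 3, 1
x = torch.randn(2, groups * 3 * k * k, 10, 10)        # output of offset_mask_conv
o1, o2, mask = torch.chunk(x, 3, dim=1)
offset = torch.cat((o1, o2), dim=1)                   # (2, 2*k*k, 10, 10) -> DCNv2 offsets
mask = torch.sigmoid(mask)                            # (2, k*k, 10, 10)   -> modulation in (0, 1)
print(offset.shape, mask.shape)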
MaskClustering/third_party/Entity/Entity/EntitySeg/entityseg/panopticfcn_tools/panopticfcn_head.py ADDED
@@ -0,0 +1,190 @@
1
+ #!/usr/bin/python3
2
+ # -*- coding:utf-8 -*-
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from detectron2.layers import Conv2d, get_norm
8
+ from .deformable_conv_with_off import ModulatedDeformConvWithOff
9
+ from ..det_head.layers import conv_with_kaiming_uniform
10
+ import math
11
+ import pdb
12
+ from fvcore.nn import sigmoid_focal_loss_jit
13
+
14
+ class SingleHead(nn.Module):
15
+ """
16
+ Build single head with convolutions and coord conv.
17
+ """
18
+ def __init__(self, in_channel, conv_dims, num_convs, deform=False, coord=False, norm='', name=''):
19
+ super().__init__()
20
+ self.coord = coord
21
+ self.conv_norm_relus = []
22
+ if deform:
23
+ conv_module = ModulatedDeformConvWithOff
24
+ else:
25
+ conv_module = Conv2d
26
+ for k in range(num_convs):
27
+ conv = conv_module(
28
+ in_channel if k==0 else conv_dims,
29
+ conv_dims,
30
+ kernel_size=3,
31
+ stride=1,
32
+ padding=1,
33
+ bias=not norm,
34
+ norm=get_norm(norm, conv_dims),
35
+ activation=F.relu,
36
+ )
37
+ self.add_module("{}_head_{}".format(name, k + 1), conv)
38
+ self.conv_norm_relus.append(conv)
39
+
40
+ def forward(self, x):
41
+ if self.coord:
42
+ x = self.coord_conv(x)
43
+ for layer in self.conv_norm_relus:
44
+ x = layer(x)
45
+ return x
46
+
47
+ def coord_conv(self, feat):
48
+ with torch.no_grad():
49
+ x_pos = torch.linspace(-1, 1, feat.shape[-2], device=feat.device)
50
+ y_pos = torch.linspace(-1, 1, feat.shape[-1], device=feat.device)
51
+ grid_x, grid_y = torch.meshgrid(x_pos, y_pos)
52
+ grid_x = grid_x.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
53
+ grid_y = grid_y.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
54
+ feat = torch.cat([feat, grid_x, grid_y], dim=1)
55
+ return feat
56
+
57
+ class KernelHead(nn.Module):
58
+ """
59
+ The head used in PanopticFCN to generate kernel weights for both Things and Stuff.
60
+ """
61
+ def __init__(self, cfg, num_gen_params):
62
+ super().__init__()
63
+ in_channel = cfg.MODEL.FPN.OUT_CHANNELS
64
+ conv_dims = cfg.MODEL.KERNEL_HEAD.CONVS_DIM
65
+ num_convs = cfg.MODEL.KERNEL_HEAD.NUM_CONVS
66
+ deform = cfg.MODEL.KERNEL_HEAD.DEFORM
67
+ coord = cfg.MODEL.KERNEL_HEAD.COORD
68
+ norm = cfg.MODEL.KERNEL_HEAD.NORM
69
+
70
+ self.num_gen_params = num_gen_params
71
+
72
+ self.kernel_head = SingleHead(in_channel+2 if coord else in_channel,
73
+ conv_dims,
74
+ num_convs,
75
+ deform=deform,
76
+ coord=coord,
77
+ norm=norm,
78
+ name='kernel_head')
79
+ self.out_conv = Conv2d(conv_dims, self.num_gen_params, kernel_size=3, padding=1)
80
+ nn.init.normal_(self.out_conv.weight, mean=0, std=0.01)
81
+ if self.out_conv.bias is not None:
82
+ nn.init.constant_(self.out_conv.bias, 0)
83
+
84
+ def forward(self, feat):
85
+ x = self.kernel_head(feat)
86
+ x = self.out_conv(x)
87
+ return x
88
+
89
+
90
+ class FeatureEncoder(nn.Module):
91
+ """
92
+ The head used in PanopticFCN for high-resolution feature generation.
93
+ """
94
+ def __init__(self, cfg):
95
+ super().__init__()
96
+ in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
97
+ conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
98
+ num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
99
+ deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
100
+ coord = cfg.MODEL.FEATURE_ENCODER.COORD
101
+ norm = cfg.MODEL.FEATURE_ENCODER.NORM
102
+
103
+ self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
104
+ conv_dims,
105
+ num_convs,
106
+ deform=deform,
107
+ coord=coord,
108
+ norm=norm,
109
+ name='encode_head')
110
+
111
+ def forward(self, feat):
112
+ feat = self.encode_head(feat)
113
+ return feat
114
+
115
+ class FeatureEncoderEdge(nn.Module):
116
+ """
117
+ The head used in PanopticFCN for high-resolution feature generation.
118
+ """
119
+ def __init__(self, cfg):
120
+ super().__init__()
121
+ in_channel = cfg.MODEL.SEMANTIC_FPN.CONVS_DIM
122
+ conv_dims = cfg.MODEL.FEATURE_ENCODER.CONVS_DIM
123
+ num_convs = cfg.MODEL.FEATURE_ENCODER.NUM_CONVS
124
+ deform = cfg.MODEL.FEATURE_ENCODER.DEFORM
125
+ coord = cfg.MODEL.FEATURE_ENCODER.COORD
126
+ norm = cfg.MODEL.FEATURE_ENCODER.NORM
127
+
128
+ self.encode_head = SingleHead(in_channel+2 if coord else in_channel,
129
+ conv_dims,
130
+ num_convs,
131
+ deform=deform,
132
+ coord=coord,
133
+ norm=norm,
134
+ name='encode_head')
135
+
136
+ self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
137
+ self.out_stride = 8
138
+
139
+ norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
140
+ conv_block = conv_with_kaiming_uniform(norm, activation=True)
141
+
142
+ self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
143
+ if self.sem_loss_on:
144
+ self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
145
+ self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA
146
+
147
+ # in_channels = feature_channels[self.in_features[0]]
148
+ self.seg_head = nn.Sequential(
149
+ conv_block(conv_dims, conv_dims, kernel_size=3, stride=1),
150
+ conv_block(conv_dims, conv_dims, kernel_size=3, stride=1)
151
+ )
152
+
153
+ self.logits = nn.Conv2d(conv_dims, 1, kernel_size=1, stride=1)
154
+
155
+ prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
156
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
157
+ torch.nn.init.constant_(self.logits.bias, bias_value)
158
+
159
+ def forward(self, feat, gt_instances=None):
160
+ feat = self.encode_head(feat)
161
+
162
+ losses = {}
163
+ # auxiliary thing semantic loss
164
+ if self.training and self.sem_loss_on:
165
+ logits_pred = self.logits(self.seg_head(feat))
166
+
167
+ boundary_targets = []
168
+ for per_im_gt in gt_instances:
169
+ boundary_targets.append(per_im_gt.gt_boundary_full.sum(dim=0))
170
+
171
+ # # semantic_targets = torch.stack(semantic_targets, dim=0)
172
+ boundary_targets = torch.stack(boundary_targets, dim=0)
173
+
174
+ # resize target to reduce memory
175
+ boundary_targets = boundary_targets[:, None, self.out_stride // 2::self.out_stride,self.out_stride // 2::self.out_stride]
176
+ num_pos = (boundary_targets > 0).sum().float().clamp(min=1.0)
177
+
178
+ loss_edge = sigmoid_focal_loss_jit(logits_pred, boundary_targets, alpha=self.focal_loss_alpha, gamma=self.focal_loss_gamma, reduction="sum") / num_pos
179
+ losses['loss_edge_p3'] = loss_edge
180
+
181
+ return feat, losses
182
+
183
+ def build_feature_encoder(cfg, input_shape=None):
184
+ return FeatureEncoder(cfg)
185
+
186
+ def build_feature_encoder_edge(cfg, input_shape=None):
187
+ return FeatureEncoderEdge(cfg)
188
+
189
+ def build_kernel_head(cfg, num_gen_params):
190
+ return KernelHead(cfg, num_gen_params)
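Vendoring note: SingleHead.coord_conv above appends a normalized coordinate grid to the feature map before the conv tower (so in_channel becomes in_channel + 2 when COORD is enabled). A standalone sketch of that step with illustrative shapes:

import torch

def coord_conv(feat):
    # feat: (N, C, H, W) -> (N, C + 2, H, W) with normalized coordinate grids appended
    with torch.no_grad():
        x_pos = torch.linspace(-1, 1, feat.shape[-2], device=feat.device)
        y_pos = torch.linspace(-1, 1, feat.shape[-1], device=feat.device)
        grid_x, grid_y = torch.meshgrid(x_pos, y_pos)
        grid_x = grid_x.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
        grid_y = grid_y.unsqueeze(0).unsqueeze(1).expand(feat.shape[0], -1, -1, -1)
    return torch.cat([feat, grid_x, grid_y], dim=1)

print(coord_conv(torch.randn(2, 256, 32, 48)).shape)   # torch.Size([2, 258, 32, 48])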
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/entity_to_json.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ import copy
3
+ import mmcv
4
+ import numpy as np
5
+ import pdb
6
+ import pycocotools.mask as mask_utils
7
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
8
+
9
+ prefix = "train2017"
10
+ base_path = "/data/ceph/gavinqi/data/coco"
11
+
12
+ entity_base_path = os.path.join(base_path, "entity_{}".format(prefix))
13
+ annotation_path = os.path.join(base_path, "annotations/instances_{}.json".format(prefix))
14
+ save_thing_path = os.path.join(base_path, "annotations/entity_thing_{}.json".format(prefix))
15
+ save_stuff_path = os.path.join(base_path, "annotations/entity_stuff_{}.json".format(prefix))
16
+ save_entity_path = os.path.join(base_path, "annotations/entity_{}.json".format(prefix))
17
+
18
+ ## build a mapping from COCO category id to a contiguous id
19
+ categories_list = COCO_CATEGORIES
20
+ catid_map = {category['id']: [cid, category["isthing"], category["name"], category["supercategory"]] for cid, category in enumerate(categories_list)}
21
+ idcat_map = {}
22
+ for key, value in catid_map.items():
23
+ idcat_map[value[0]] = [key,value[1]]
24
+
25
+ instance_annotations = mmcv.load(annotation_path)
26
+ instance_annotations_thing = copy.deepcopy(instance_annotations)
27
+ instance_annotations_stuff = copy.deepcopy(instance_annotations)
28
+
29
+ # update category
30
+ print("Updating categories...")
31
+ instance_annotations_thing["categories"] = []
32
+ instance_annotations_stuff["categories"] = []
33
+ for origin_catid, new_catid_info in catid_map.items():
34
+ new_catid = new_catid_info[0]
35
+ is_thing = new_catid_info[1]
36
+ name = new_catid_info[2]
37
+ nsuper = new_catid_info[3]
38
+ if is_thing:
39
+ instance_annotations_thing["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
40
+ else:
41
+ instance_annotations_stuff["categories"].append({"supercategory": nsuper, "id": new_catid, "name": name})
42
+ print("Update category finished")
43
+
44
+ # update annotations
45
+ instance_annotations_thing["annotations"] = []
46
+ instance_annotations_stuff["annotations"] = []
47
+ npz_names = os.listdir(entity_base_path)
48
+ thing_id = 0
49
+ stuff_id = 0
50
+
51
+ for index, npz_name in enumerate(npz_names):
52
+ entity_info = np.load(os.path.join(entity_base_path, npz_name))
53
+ image_id = int(npz_name.split(".")[0])
54
+ bounding_boxes = entity_info["bounding_box"]
55
+ entity_id_map = entity_info["map"]
56
+ entity_id_map = entity_id_map[0]
57
+ if len(bounding_boxes)==0:
58
+ continue
59
+ # 0-x1, 1-y1, 2-x2, 3-y2, 4-category, 5-thing_or_stuff, 6-entity_id
60
+ thing_mask = bounding_boxes[:,5] > 0
61
+ stuff_mask = bounding_boxes[:,5] == 0
62
+
63
+ # begin thing
64
+ thing_boxes = bounding_boxes[thing_mask]
65
+ for thing_box in thing_boxes:
66
+ x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = thing_box
67
+ area = (y2-y1) * (x2-x1)
68
+ if "val" in prefix:
69
+ mask = (entity_id_map==entity_id)
70
+ mask = np.array(mask, order="F", dtype="uint8")
71
+ rle = mask_utils.encode(mask)
72
+ rle["counts"] = rle["counts"].decode("utf-8")
73
+
74
+ anno = {"iscrowd": 0,
75
+ "area": area,
76
+ "image_id": image_id,
77
+ "bbox": [x1, y1, x2-x1, y2-y1],
78
+ "category_id": category_id,
79
+ "id": thing_id}
80
+ if "val" in prefix:
81
+ anno["segmentation"]=rle
82
+
83
+ instance_annotations_thing["annotations"].append(anno)
84
+ thing_id = thing_id + 1
85
+
86
+ # begin stuff
87
+ stuff_boxes = bounding_boxes[stuff_mask]
88
+ for stuff_box in stuff_boxes:
89
+ x1, y1, x2, y2, category_id, thing_or_stuff, entity_id = stuff_box
90
+ area = (y2-y1) * (x2-x1)
91
+ if "val" in prefix:
92
+ mask = (entity_id_map==entity_id)
93
+ mask = np.array(mask, order="F", dtype="uint8")
94
+ rle = mask_utils.encode(mask)
95
+ rle["counts"] = rle["counts"].decode("utf-8")
96
+
97
+ anno = {"iscrowd": 0,
98
+ "area": area,
99
+ "image_id": image_id,
100
+ "bbox": [x1, y1, x2-x1, y2-y1],
101
+ "category_id": category_id,
102
+ "id": stuff_id}
103
+ if "val" in prefix:
104
+ anno["segmentation"]=rle
105
+
106
+ instance_annotations_stuff["annotations"].append(anno)
107
+ stuff_id = stuff_id + 1
108
+
109
+ print("{},{}".format(index, npz_name))
110
+
111
+ mmcv.dump(instance_annotations_thing, save_thing_path)
112
+ mmcv.dump(instance_annotations_stuff, save_stuff_path)
113
+
114
+ thing_info = instance_annotations_thing
115
+ stuff_info = instance_annotations_stuff
116
+
117
+ thst = thing_info
118
+ thst["categories"].extend(stuff_info["categories"])
119
+ nums = len(thst["annotations"]) + 1
120
+ for index, anno in enumerate(stuff_info["annotations"]):
121
+ anno["id"] = index + nums
122
+ thst["annotations"].append(anno)
123
+ mmcv.dump(thst, save_entity_path)
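Vendoring note: for the validation split the script above stores each entity mask as compressed RLE via pycocotools, i.e. the "counts"/"size" dicts written into the json. A minimal round-trip sketch with a synthetic mask:

import numpy as np
import pycocotools.mask as mask_utils

mask = np.zeros((480, 640), dtype=np.uint8)
mask[100:200, 150:300] = 1

rle = mask_utils.encode(np.asfortranarray(mask))    # COCO RLE expects Fortran-ordered uint8
area = int(mask_utils.area(rle))                    # 100 * 150 = 15000
rle["counts"] = rle["counts"].decode("utf-8")       # make the RLE JSON-serializable
print(rle["size"], area)                            # [480, 640] 15000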
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.py ADDED
@@ -0,0 +1,119 @@
1
+ from __future__ import absolute_import
2
+ from __future__ import division
3
+ from __future__ import print_function
4
+ from __future__ import unicode_literals
5
+
6
+ import os, sys
7
+ import numpy as np
8
+ import pdb
9
+ import mmcv
10
+ import copy
11
+ import cv2
12
+ from collections import OrderedDict
13
+ from pycocotools.coco import COCO
14
+ import pycocotools.mask as mask_utils
15
+
16
+ import PIL.Image as Image
17
+ import matplotlib.pyplot as plt
18
+ from skimage.segmentation import find_boundaries
19
+ from panopticapi.utils import IdGenerator, rgb2id
20
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
21
+
22
+ thread_num = int(sys.argv[1])
23
+ thread_idx = int(sys.argv[2])
24
+ type_ = sys.argv[3]
25
+
26
+ OFFSET = 256 * 256 * 256
27
+
28
+ GT_base_path = "/data/ceph/gavinqi/data/coco"
29
+ GT_panoptic_png_path = os.path.join(GT_base_path, "panoptic_{}".format(type_))
30
+ GT_panoptic_json_path = os.path.join(GT_base_path, "annotations/panoptic_{}.json".format(type_))
31
+ GT_instance_json_path = os.path.join(GT_base_path, "annotations/instances_{}.json".format(type_))
32
+ save_base_path = os.path.join(GT_base_path, "entity_{}".format(type_))
33
+
34
+ if not os.path.exists(save_base_path):
35
+ os.makedirs(save_base_path)
36
+
37
+ coco_g = mmcv.load(GT_panoptic_json_path)
38
+ categories_list = COCO_CATEGORIES
39
+ catid_map = {category['id']: [cid, category["isthing"]] for cid, category in enumerate(categories_list)}
40
+ idcat_map = {}
41
+ for key, value in catid_map.items():
42
+ idcat_map[value[0]] = [key,value[1]]
43
+
44
+ name2panopticindex = OrderedDict()
45
+ id2name = OrderedDict()
46
+
47
+ for i_index, image_info in enumerate(coco_g["images"]):
48
+ file_name = image_info["file_name"].split(".")[0]
49
+ name2panopticindex[file_name] = {"i_index": i_index}
50
+ id2name[image_info["id"]] = file_name
51
+
52
+ for a_index, ann in enumerate(coco_g["annotations"]):
53
+ file_name = id2name[ann["image_id"]]
54
+ name2panopticindex[file_name]["a_index"] = a_index
55
+ print("build name to panoptic index finished")
56
+
57
+ # imgs and instance_anns
58
+ instances_api = COCO(GT_instance_json_path)
59
+ img_ids = instances_api.getImgIds()
60
+ imgs = instances_api.loadImgs(img_ids)
61
+ instance_anns = [instances_api.imgToAnns[img_id] for img_id in img_ids]
62
+ assert len(name2panopticindex.keys()) == len(imgs)
63
+ imgs_instancesanns = list(zip(imgs, instance_anns))
64
+ print("build imgs and instance_anns finished")
65
+
66
+ for img_index, (img_dict, ann_dict_list) in enumerate(imgs_instancesanns):
67
+ if img_index % thread_num != thread_idx:
68
+ continue
69
+
70
+ file_name = img_dict["file_name"].split(".")[0]
71
+ image_h, image_w = img_dict["height"], img_dict["width"]
72
+
73
+ ## panoptic mask from panoptic annotation
74
+ panoptic_i_index, panoptic_a_index = name2panopticindex[file_name]["i_index"], name2panopticindex[file_name]["a_index"]
75
+ panoptic_img_infos = coco_g["images"][panoptic_i_index]
76
+ panoptic_ann_infos = coco_g["annotations"][panoptic_a_index]
77
+ assert panoptic_img_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_img_infos"
78
+ assert panoptic_ann_infos["file_name"].split(".")[0] == file_name, "Something wrong with panoptic_ann_infos"
79
+
80
+ panoptic = np.array(Image.open(os.path.join(GT_panoptic_png_path, file_name+".png")), dtype=np.uint8)
81
+ panoptic_id = rgb2id(panoptic)
82
+ panoptic_entity_id = np.zeros(panoptic_id.shape, dtype=np.uint8)
83
+ panoptic_class_id = np.zeros(panoptic_id.shape, dtype=np.uint8) + 255
84
+ unique_panoptic_id = np.unique(panoptic_id)
85
+
86
+ for ii, segment_info in enumerate(panoptic_ann_infos["segments_info"]):
87
+ if segment_info["iscrowd"] == 1:
88
+ continue
89
+ old_entity_id = segment_info["id"]
90
+ new_entity_id = ii + 1
91
+ category = segment_info["category_id"]
92
+ panoptic_entity_id[panoptic_id==old_entity_id] = new_entity_id
93
+ panoptic_class_id[panoptic_id==old_entity_id] = catid_map[category][0]
94
+
95
+ unique_ids = np.unique(panoptic_entity_id)
96
+ count = 1
97
+
98
+ bounding_box = []
99
+ for entity_id in unique_ids:
100
+ if entity_id == 0:
101
+ continue
102
+ mask = (panoptic_entity_id==entity_id).astype(np.uint8)
103
+ category = int(np.unique(panoptic_class_id[panoptic_entity_id==entity_id]))
104
+
105
+ finds_y, finds_x = np.where(mask==1)
106
+ y1 = int(np.min(finds_y))
107
+ y2 = int(np.max(finds_y))
108
+ x1 = int(np.min(finds_x))
109
+ x2 = int(np.max(finds_x))
110
+ thing_or_stuff = int(idcat_map[category][1])
111
+ bounding_box.append([x1,y1,x2,y2,category,thing_or_stuff,entity_id])
112
+
113
+ bounding_box = np.array(bounding_box)
114
+
115
+ panoptic_info = np.stack((panoptic_entity_id, panoptic_class_id), axis=0)
116
+ np.savez(os.path.join(save_base_path, file_name),map=panoptic_info, bounding_box=bounding_box)
117
+
118
+ print("{}, {}, {}".format(thread_idx, img_index, file_name))
119
+
MaskClustering/third_party/Entity/Entity/EntitySeg/make_data/make_entity_mask.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/bash
2
+ thread_num=8
3
+ for((i=0;i<${thread_num};i++));do
4
+ {
5
+ python3 make_entity_mask.py ${thread_num} ${i} train2017
6
+ }&
7
+ done
8
+ wait
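Vendoring note: the script launches thread_num background copies of make_entity_mask.py, and each worker keeps only the images whose index satisfies img_index % thread_num == thread_idx. An equivalent sharding sketch in Python (multiprocessing used purely for illustration):

from multiprocessing import Pool

def shard(args):
    thread_num, thread_idx, items = args
    # keep every item whose index falls in this shard (same rule as make_entity_mask.py)
    return [x for i, x in enumerate(items) if i % thread_num == thread_idx]

if __name__ == "__main__":
    items = list(range(20))
    with Pool(4) as pool:
        shards = pool.map(shard, [(4, t, items) for t in range(4)])
    print(shards)   # four disjoint shards that together cover every item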
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/Makefile ADDED
@@ -0,0 +1,9 @@
1
+ all:
2
+ # install pycocotools locally
3
+ python setup.py build_ext --inplace
4
+ rm -rf build
5
+
6
+ install:
7
+ # install pycocotools to the Python site-packages
8
+ python setup.py build_ext install
9
+ rm -rf build
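Vendoring note: after running `make` here, it is worth confirming that Python resolves this modified copy rather than a pip-installed pycocotools; a quick check (paths are illustrative):

import pycocotools
import pycocotools.mask as mask_utils   # fails if the _mask extension has not been built

print(pycocotools.__file__)             # should point into modified_cocoapi/PythonAPI/pycocotools
print(callable(mask_utils.iou))         # True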
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __author__ = 'tylin'
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.c ADDED
The diff for this file is too large to render. See raw diff
 
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/_mask.pyx ADDED
@@ -0,0 +1,308 @@
1
+ # distutils: language = c
2
+ # distutils: sources = ../common/maskApi.c
3
+
4
+ #**************************************************************************
5
+ # Microsoft COCO Toolbox. version 2.0
6
+ # Data, paper, and tutorials available at: http://mscoco.org/
7
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
8
+ # Licensed under the Simplified BSD License [see coco/license.txt]
9
+ #**************************************************************************
10
+
11
+ __author__ = 'tsungyi'
12
+
13
+ import sys
14
+ PYTHON_VERSION = sys.version_info[0]
15
+
16
+ # import both Python-level and C-level symbols of Numpy
17
+ # the API uses Numpy to interface C and Python
18
+ import numpy as np
19
+ cimport numpy as np
20
+ from libc.stdlib cimport malloc, free
21
+
22
+ # initialize Numpy. must do.
23
+ np.import_array()
24
+
25
+ # import numpy C function
26
+ # we use PyArray_ENABLEFLAGS to make the Numpy ndarray responsible for memory management
27
+ cdef extern from "numpy/arrayobject.h":
28
+ void PyArray_ENABLEFLAGS(np.ndarray arr, int flags)
29
+
30
+ # Declare the prototype of the C functions in MaskApi.h
31
+ cdef extern from "maskApi.h":
32
+ ctypedef unsigned int uint
33
+ ctypedef unsigned long siz
34
+ ctypedef unsigned char byte
35
+ ctypedef double* BB
36
+ ctypedef struct RLE:
37
+ siz h,
38
+ siz w,
39
+ siz m,
40
+ uint* cnts,
41
+ void rlesInit( RLE **R, siz n )
42
+ void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n )
43
+ void rleDecode( const RLE *R, byte *mask, siz n )
44
+ void rleMerge( const RLE *R, RLE *M, siz n, int intersect )
45
+ void rleArea( const RLE *R, siz n, uint *a )
46
+ void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o )
47
+ void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o )
48
+ void rleToBbox( const RLE *R, BB bb, siz n )
49
+ void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n )
50
+ void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w )
51
+ char* rleToString( const RLE *R )
52
+ void rleFrString( RLE *R, char *s, siz h, siz w )
53
+
54
+ # python class to wrap RLE array in C
55
+ # the class handles the memory allocation and deallocation
56
+ cdef class RLEs:
57
+ cdef RLE *_R
58
+ cdef siz _n
59
+
60
+ def __cinit__(self, siz n =0):
61
+ rlesInit(&self._R, n)
62
+ self._n = n
63
+
64
+ # free the RLE array here
65
+ def __dealloc__(self):
66
+ if self._R is not NULL:
67
+ for i in range(self._n):
68
+ free(self._R[i].cnts)
69
+ free(self._R)
70
+ def __getattr__(self, key):
71
+ if key == 'n':
72
+ return self._n
73
+ raise AttributeError(key)
74
+
75
+ # python class to wrap Mask array in C
76
+ # the class handles the memory allocation and deallocation
77
+ cdef class Masks:
78
+ cdef byte *_mask
79
+ cdef siz _h
80
+ cdef siz _w
81
+ cdef siz _n
82
+
83
+ def __cinit__(self, h, w, n):
84
+ self._mask = <byte*> malloc(h*w*n* sizeof(byte))
85
+ self._h = h
86
+ self._w = w
87
+ self._n = n
88
+ # def __dealloc__(self):
89
+ # the memory management of _mask has been passed to np.ndarray
90
+ # it doesn't need to be freed here
91
+
92
+ # called when passing into np.array() and return an np.ndarray in column-major order
93
+ def __array__(self):
94
+ cdef np.npy_intp shape[1]
95
+ shape[0] = <np.npy_intp> self._h*self._w*self._n
96
+ # Create a 1D array, and reshape it to fortran/Matlab column-major array
97
+ ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F')
98
+ # The _mask allocated by Masks is now handled by ndarray
99
+ PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA)
100
+ return ndarray
101
+
102
+ # internal conversion from Python RLEs object to compressed RLE format
103
+ def _toString(RLEs Rs):
104
+ cdef siz n = Rs.n
105
+ cdef bytes py_string
106
+ cdef char* c_string
107
+ objs = []
108
+ for i in range(n):
109
+ c_string = rleToString( <RLE*> &Rs._R[i] )
110
+ py_string = c_string
111
+ objs.append({
112
+ 'size': [Rs._R[i].h, Rs._R[i].w],
113
+ 'counts': py_string
114
+ })
115
+ free(c_string)
116
+ return objs
117
+
118
+ # internal conversion from compressed RLE format to Python RLEs object
119
+ def _frString(rleObjs):
120
+ cdef siz n = len(rleObjs)
121
+ Rs = RLEs(n)
122
+ cdef bytes py_string
123
+ cdef char* c_string
124
+ for i, obj in enumerate(rleObjs):
125
+ if PYTHON_VERSION == 2:
126
+ py_string = str(obj['counts']).encode('utf8')
127
+ elif PYTHON_VERSION == 3:
128
+ py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts']
129
+ else:
130
+ raise Exception('Python version must be 2 or 3')
131
+ c_string = py_string
132
+ rleFrString( <RLE*> &Rs._R[i], <char*> c_string, obj['size'][0], obj['size'][1] )
133
+ return Rs
134
+
135
+ # encode mask to RLEs objects
136
+ # list of RLE string can be generated by RLEs member function
137
+ def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask):
138
+ h, w, n = mask.shape[0], mask.shape[1], mask.shape[2]
139
+ cdef RLEs Rs = RLEs(n)
140
+ rleEncode(Rs._R,<byte*>mask.data,h,w,n)
141
+ objs = _toString(Rs)
142
+ return objs
143
+
144
+ # decode mask from compressed list of RLE string or RLEs object
145
+ def decode(rleObjs):
146
+ cdef RLEs Rs = _frString(rleObjs)
147
+ h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n
148
+ masks = Masks(h, w, n)
149
+ rleDecode(<RLE*>Rs._R, masks._mask, n);
150
+ return np.array(masks)
151
+
152
+ def merge(rleObjs, intersect=0):
153
+ cdef RLEs Rs = _frString(rleObjs)
154
+ cdef RLEs R = RLEs(1)
155
+ rleMerge(<RLE*>Rs._R, <RLE*> R._R, <siz> Rs._n, intersect)
156
+ obj = _toString(R)[0]
157
+ return obj
158
+
159
+ def area(rleObjs):
160
+ cdef RLEs Rs = _frString(rleObjs)
161
+ cdef uint* _a = <uint*> malloc(Rs._n* sizeof(uint))
162
+ rleArea(Rs._R, Rs._n, _a)
163
+ cdef np.npy_intp shape[1]
164
+ shape[0] = <np.npy_intp> Rs._n
165
+ a = np.array((Rs._n, ), dtype=np.uint8)
166
+ a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a)
167
+ PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA)
168
+ return a
169
+
170
+ # iou computation. support function overload (RLEs-RLEs and bbox-bbox).
171
+ def iou( dt, gt, pyiscrowd ):
172
+ def _preproc(objs):
173
+ if len(objs) == 0:
174
+ return objs
175
+ if type(objs) == np.ndarray:
176
+ if len(objs.shape) == 1:
177
+ objs = objs.reshape((objs[0], 1))
178
+ # check if it's Nx4 bbox
179
+ if not len(objs.shape) == 2 or not objs.shape[1] == 4:
180
+ raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension')
181
+ objs = objs.astype(np.double)
182
+ elif type(objs) == list:
183
+ # check if list is in box format and convert it to np.ndarray
184
+ isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs]))
185
+ isrle = np.all(np.array([type(obj) == dict for obj in objs]))
186
+ if isbox:
187
+ objs = np.array(objs, dtype=np.double)
188
+ if len(objs.shape) == 1:
189
+ objs = objs.reshape((1,objs.shape[0]))
190
+ elif isrle:
191
+ objs = _frString(objs)
192
+ else:
193
+ raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])')
194
+ else:
195
+ raise Exception('unrecognized type. The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.')
196
+ return objs
197
+ def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
198
+ rleIou( <RLE*> dt._R, <RLE*> gt._R, m, n, <byte*> iscrowd.data, <double*> _iou.data )
199
+ def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou):
200
+ bbIou( <BB> dt.data, <BB> gt.data, m, n, <byte*> iscrowd.data, <double*>_iou.data )
201
+ def _len(obj):
202
+ cdef siz N = 0
203
+ if type(obj) == RLEs:
204
+ N = obj.n
205
+ elif len(obj)==0:
206
+ pass
207
+ elif type(obj) == np.ndarray:
208
+ N = obj.shape[0]
209
+ return N
210
+ # convert iscrowd to numpy array
211
+ cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8)
212
+ # simple type checking
213
+ cdef siz m, n
214
+ dt = _preproc(dt)
215
+ gt = _preproc(gt)
216
+ m = _len(dt)
217
+ n = _len(gt)
218
+ if m == 0 or n == 0:
219
+ return []
220
+ if not type(dt) == type(gt):
221
+ raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray')
222
+
223
+ # define local variables
224
+ cdef double* _iou = <double*> 0
225
+ cdef np.npy_intp shape[1]
226
+ # check type and assign iou function
227
+ if type(dt) == RLEs:
228
+ _iouFun = _rleIou
229
+ elif type(dt) == np.ndarray:
230
+ _iouFun = _bbIou
231
+ else:
232
+ raise Exception('input data type not allowed.')
233
+ _iou = <double*> malloc(m*n* sizeof(double))
234
+ iou = np.zeros((m*n, ), dtype=np.double)
235
+ shape[0] = <np.npy_intp> m*n
236
+ iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou)
237
+ PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA)
238
+ _iouFun(dt, gt, iscrowd, m, n, iou)
239
+ return iou.reshape((m,n), order='F')
240
+
241
+ def toBbox( rleObjs ):
242
+ cdef RLEs Rs = _frString(rleObjs)
243
+ cdef siz n = Rs.n
244
+ cdef BB _bb = <BB> malloc(4*n* sizeof(double))
245
+ rleToBbox( <const RLE*> Rs._R, _bb, n )
246
+ cdef np.npy_intp shape[1]
247
+ shape[0] = <np.npy_intp> 4*n
248
+ bb = np.array((1,4*n), dtype=np.double)
249
+ bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4))
250
+ PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA)
251
+ return bb
252
+
253
+ def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ):
254
+ cdef siz n = bb.shape[0]
255
+ Rs = RLEs(n)
256
+ rleFrBbox( <RLE*> Rs._R, <const BB> bb.data, h, w, n )
257
+ objs = _toString(Rs)
258
+ return objs
259
+
260
+ def frPoly( poly, siz h, siz w ):
261
+ cdef np.ndarray[np.double_t, ndim=1] np_poly
262
+ n = len(poly)
263
+ Rs = RLEs(n)
264
+ for i, p in enumerate(poly):
265
+ np_poly = np.array(p, dtype=np.double, order='F')
266
+ rleFrPoly( <RLE*>&Rs._R[i], <const double*> np_poly.data, int(len(p)/2), h, w )
267
+ objs = _toString(Rs)
268
+ return objs
269
+
270
+ def frUncompressedRLE(ucRles, siz h, siz w):
271
+ cdef np.ndarray[np.uint32_t, ndim=1] cnts
272
+ cdef RLE R
273
+ cdef uint *data
274
+ n = len(ucRles)
275
+ objs = []
276
+ for i in range(n):
277
+ Rs = RLEs(1)
278
+ cnts = np.array(ucRles[i]['counts'], dtype=np.uint32)
279
+ # time for malloc can be saved here but it's fine
280
+ data = <uint*> malloc(len(cnts)* sizeof(uint))
281
+ for j in range(len(cnts)):
282
+ data[j] = <uint> cnts[j]
283
+ R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), <uint*> data)
284
+ Rs._R[0] = R
285
+ objs.append(_toString(Rs)[0])
286
+ return objs
287
+
288
+ def frPyObjects(pyobj, h, w):
289
+ # encode rle from a list of python objects
290
+ if type(pyobj) == np.ndarray:
291
+ objs = frBbox(pyobj, h, w)
292
+ elif type(pyobj) == list and len(pyobj[0]) == 4:
293
+ objs = frBbox(pyobj, h, w)
294
+ elif type(pyobj) == list and len(pyobj[0]) > 4:
295
+ objs = frPoly(pyobj, h, w)
296
+ elif type(pyobj) == list and type(pyobj[0]) == dict \
297
+ and 'counts' in pyobj[0] and 'size' in pyobj[0]:
298
+ objs = frUncompressedRLE(pyobj, h, w)
299
+ # encode rle from single python object
300
+ elif type(pyobj) == list and len(pyobj) == 4:
301
+ objs = frBbox([pyobj], h, w)[0]
302
+ elif type(pyobj) == list and len(pyobj) > 4:
303
+ objs = frPoly([pyobj], h, w)[0]
304
+ elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj:
305
+ objs = frUncompressedRLE([pyobj], h, w)[0]
306
+ else:
307
+ raise Exception('input type is not supported.')
308
+ return objs
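Vendoring note: once built, the Cython functions above are reached through pycocotools.mask. A short encode/decode/IoU sketch with synthetic masks (the expected IoU is worked out by hand: intersection 100, union 700):

import numpy as np
import pycocotools.mask as mask_utils

a = np.zeros((64, 64), dtype=np.uint8); a[10:30, 10:30] = 1
b = np.zeros((64, 64), dtype=np.uint8); b[20:40, 20:40] = 1

rles = mask_utils.encode(np.asfortranarray(np.stack([a, b], axis=-1)))  # two RLE dicts
decoded = mask_utils.decode(rles)                                       # (64, 64, 2) uint8
iou = mask_utils.iou([rles[0]], [rles[1]], [0])                         # one iscrowd flag per gt
print(decoded.shape, float(iou[0, 0]))                                  # (64, 64, 2) 0.142857...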
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/coco.py ADDED
@@ -0,0 +1,453 @@
1
+ __author__ = 'tylin'
2
+ __version__ = '2.0'
3
+ # Interface for accessing the Microsoft COCO dataset.
4
+
5
+ # Microsoft COCO is a large image dataset designed for object detection,
6
+ # segmentation, and caption generation. pycocotools is a Python API that
7
+ # assists in loading, parsing and visualizing the annotations in COCO.
8
+ # Please visit http://mscoco.org/ for more information on COCO, including
9
+ # for the data, paper, and tutorials. The exact format of the annotations
10
+ # is also described on the COCO website. For example usage of the pycocotools
11
+ # please see pycocotools_demo.ipynb. In addition to this API, please download both
12
+ # the COCO images and annotations in order to run the demo.
13
+
14
+ # An alternative to using the API is to load the annotations directly
15
+ # into Python dictionary
16
+ # Using the API provides additional utility functions. Note that this API
17
+ # supports both *instance* and *caption* annotations. In the case of
18
+ # captions not all functions are defined (e.g. categories are undefined).
19
+
20
+ # The following API functions are defined:
21
+ # COCO - COCO api class that loads COCO annotation file and prepare data structures.
22
+ # decodeMask - Decode binary mask M encoded via run-length encoding.
23
+ # encodeMask - Encode binary mask M using run-length encoding.
24
+ # getAnnIds - Get ann ids that satisfy given filter conditions.
25
+ # getCatIds - Get cat ids that satisfy given filter conditions.
26
+ # getImgIds - Get img ids that satisfy given filter conditions.
27
+ # loadAnns - Load anns with the specified ids.
28
+ # loadCats - Load cats with the specified ids.
29
+ # loadImgs - Load imgs with the specified ids.
30
+ # annToMask - Convert segmentation in an annotation to binary mask.
31
+ # showAnns - Display the specified annotations.
32
+ # loadRes - Load algorithm results and create API for accessing them.
33
+ # download - Download COCO images from mscoco.org server.
34
+ # Throughout the API "ann"=annotation, "cat"=category, and "img"=image.
35
+ # Help on each functions can be accessed by: "help COCO>function".
36
+
37
+ # See also COCO>decodeMask,
38
+ # COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds,
39
+ # COCO>getImgIds, COCO>loadAnns, COCO>loadCats,
40
+ # COCO>loadImgs, COCO>annToMask, COCO>showAnns
41
+
42
+ # Microsoft COCO Toolbox. version 2.0
43
+ # Data, paper, and tutorials available at: http://mscoco.org/
44
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2014.
45
+ # Licensed under the Simplified BSD License [see bsd.txt]
46
+
47
+ import json
48
+ import time
49
+ import matplotlib.pyplot as plt
50
+ from matplotlib.collections import PatchCollection
51
+ from matplotlib.patches import Polygon
52
+ import numpy as np
53
+ import copy
54
+ import itertools
55
+ from . import mask as maskUtils
56
+ import os
57
+ from collections import defaultdict
58
+ import sys
59
+ PYTHON_VERSION = sys.version_info[0]
60
+ if PYTHON_VERSION == 2:
61
+ from urllib import urlretrieve
62
+ elif PYTHON_VERSION == 3:
63
+ from urllib.request import urlretrieve
64
+
65
+
66
+ def _isArrayLike(obj):
67
+ return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
68
+
69
+
70
+ class COCO:
71
+ def __init__(self, annotation_file=None, class_agnostic=False):
72
+ """
73
+ Constructor of Microsoft COCO helper class for reading and visualizing annotations.
74
+ :param annotation_file (str): location of annotation file
75
+ :param image_folder (str): location to the folder that hosts images.
76
+ :return:
77
+ """
78
+ # load dataset
79
+ self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict()
80
+ self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list)
81
+ if not annotation_file == None:
82
+ print('loading annotations into memory...')
83
+ tic = time.time()
84
+ dataset = json.load(open(annotation_file, 'r'))
85
+ assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
86
+ print('Done (t={:0.2f}s)'.format(time.time()- tic))
87
+ self.dataset = dataset
88
+ if class_agnostic:
89
+ self.dataset = self.to_agnostic(dataset)
90
+ else:
91
+ self.dataset = dataset
92
+ self.createIndex()
93
+
94
+ def to_agnostic(self,dataset):
95
+ # dataset["categories"] = ["supercategory": "thing", "id":1, "name": "thing"]
96
+ dataset["categories"] = [{"supercategory": "thing", "id":1, "name": "thing"}]
97
+ nums = len(dataset["annotations"])
98
+ for ii in range(nums):
99
+ dataset["annotations"][ii]["category_id"] = 1
100
+ return dataset
101
+
102
+ def createIndex(self):
103
+ # create index
104
+ print('creating index...')
105
+ anns, cats, imgs = {}, {}, {}
106
+ imgToAnns,catToImgs = defaultdict(list),defaultdict(list)
107
+ if 'annotations' in self.dataset:
108
+ for ann in self.dataset['annotations']:
109
+ imgToAnns[ann['image_id']].append(ann)
110
+ anns[ann['id']] = ann
111
+
112
+ if 'images' in self.dataset:
113
+ for img in self.dataset['images']:
114
+ imgs[img['id']] = img
115
+
116
+ if 'categories' in self.dataset:
117
+ for cat in self.dataset['categories']:
118
+ cats[cat['id']] = cat
119
+
120
+ if 'annotations' in self.dataset and 'categories' in self.dataset:
121
+ for ann in self.dataset['annotations']:
122
+ catToImgs[ann['category_id']].append(ann['image_id'])
123
+
124
+ print('index created!')
125
+
126
+ # create class members
127
+ self.anns = anns
128
+ self.imgToAnns = imgToAnns
129
+ self.catToImgs = catToImgs
130
+ self.imgs = imgs
131
+ self.cats = cats
132
+
133
+ def info(self):
134
+ """
135
+ Print information about the annotation file.
136
+ :return:
137
+ """
138
+ for key, value in self.dataset['info'].items():
139
+ print('{}: {}'.format(key, value))
140
+
141
+ def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
142
+ """
143
+ Get ann ids that satisfy given filter conditions. default skips that filter
144
+ :param imgIds (int array) : get anns for given imgs
145
+ catIds (int array) : get anns for given cats
146
+ areaRng (float array) : get anns for given area range (e.g. [0 inf])
147
+ iscrowd (boolean) : get anns for given crowd label (False or True)
148
+ :return: ids (int array) : integer array of ann ids
149
+ """
150
+ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
151
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
152
+
153
+ if len(imgIds) == len(catIds) == len(areaRng) == 0:
154
+ anns = self.dataset['annotations']
155
+ else:
156
+ if not len(imgIds) == 0:
157
+ lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns]
158
+ anns = list(itertools.chain.from_iterable(lists))
159
+ else:
160
+ anns = self.dataset['annotations']
161
+ anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds]
162
+ anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]]
163
+ if not iscrowd == None:
164
+ ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd]
165
+ else:
166
+ ids = [ann['id'] for ann in anns]
167
+ return ids
168
+
169
+ def getCatIds(self, catNms=[], supNms=[], catIds=[]):
170
+ """
171
+ filtering parameters. default skips that filter.
172
+ :param catNms (str array) : get cats for given cat names
173
+ :param supNms (str array) : get cats for given supercategory names
174
+ :param catIds (int array) : get cats for given cat ids
175
+ :return: ids (int array) : integer array of cat ids
176
+ """
177
+ catNms = catNms if _isArrayLike(catNms) else [catNms]
178
+ supNms = supNms if _isArrayLike(supNms) else [supNms]
179
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
180
+
181
+ if len(catNms) == len(supNms) == len(catIds) == 0:
182
+ cats = self.dataset['categories']
183
+ else:
184
+ cats = self.dataset['categories']
185
+ cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms]
186
+ cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms]
187
+ cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds]
188
+ ids = [cat['id'] for cat in cats]
189
+ return ids
190
+
191
+ def getImgIds(self, imgIds=[], catIds=[]):
192
+ '''
193
+ Get img ids that satisfy given filter conditions.
194
+ :param imgIds (int array) : get imgs for given ids
195
+ :param catIds (int array) : get imgs with all given cats
196
+ :return: ids (int array) : integer array of img ids
197
+ '''
198
+ imgIds = imgIds if _isArrayLike(imgIds) else [imgIds]
199
+ catIds = catIds if _isArrayLike(catIds) else [catIds]
200
+
201
+ if len(imgIds) == len(catIds) == 0:
202
+ ids = self.imgs.keys()
203
+ else:
204
+ ids = set(imgIds)
205
+ for i, catId in enumerate(catIds):
206
+ if i == 0 and len(ids) == 0:
207
+ ids = set(self.catToImgs[catId])
208
+ else:
209
+ ids &= set(self.catToImgs[catId])
210
+ return list(ids)
211
+
212
+ def loadAnns(self, ids=[]):
213
+ """
214
+ Load anns with the specified ids.
215
+ :param ids (int array) : integer ids specifying anns
216
+ :return: anns (object array) : loaded ann objects
217
+ """
218
+ if _isArrayLike(ids):
219
+ return [self.anns[id] for id in ids]
220
+ elif type(ids) == int:
221
+ return [self.anns[ids]]
222
+
223
+ def loadCats(self, ids=[]):
224
+ """
225
+ Load cats with the specified ids.
226
+ :param ids (int array) : integer ids specifying cats
227
+ :return: cats (object array) : loaded cat objects
228
+ """
229
+ if _isArrayLike(ids):
230
+ return [self.cats[id] for id in ids]
231
+ elif type(ids) == int:
232
+ return [self.cats[ids]]
233
+
234
+ def loadImgs(self, ids=[]):
235
+ """
236
+ Load imgs with the specified ids.
237
+ :param ids (int array) : integer ids specifying img
238
+ :return: imgs (object array) : loaded img objects
239
+ """
240
+ if _isArrayLike(ids):
241
+ return [self.imgs[id] for id in ids]
242
+ elif type(ids) == int:
243
+ return [self.imgs[ids]]
244
+
245
+ def showAnns(self, anns, draw_bbox=False):
246
+ """
247
+ Display the specified annotations.
248
+ :param anns (array of object): annotations to display
249
+ :return: None
250
+ """
251
+ if len(anns) == 0:
252
+ return 0
253
+ if 'segmentation' in anns[0] or 'keypoints' in anns[0]:
254
+ datasetType = 'instances'
255
+ elif 'caption' in anns[0]:
256
+ datasetType = 'captions'
257
+ else:
258
+ raise Exception('datasetType not supported')
259
+ if datasetType == 'instances':
260
+ ax = plt.gca()
261
+ ax.set_autoscale_on(False)
262
+ polygons = []
263
+ color = []
264
+ for ann in anns:
265
+ c = (np.random.random((1, 3))*0.6+0.4).tolist()[0]
266
+ if 'segmentation' in ann:
267
+ if type(ann['segmentation']) == list:
268
+ # polygon
269
+ for seg in ann['segmentation']:
270
+ poly = np.array(seg).reshape((int(len(seg)/2), 2))
271
+ polygons.append(Polygon(poly))
272
+ color.append(c)
273
+ else:
274
+ # mask
275
+ t = self.imgs[ann['image_id']]
276
+ if type(ann['segmentation']['counts']) == list:
277
+ rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width'])
278
+ else:
279
+ rle = [ann['segmentation']]
280
+ m = maskUtils.decode(rle)
281
+ img = np.ones( (m.shape[0], m.shape[1], 3) )
282
+ if ann['iscrowd'] == 1:
283
+ color_mask = np.array([2.0,166.0,101.0])/255
284
+ if ann['iscrowd'] == 0:
285
+ color_mask = np.random.random((1, 3)).tolist()[0]
286
+ for i in range(3):
287
+ img[:,:,i] = color_mask[i]
288
+ ax.imshow(np.dstack( (img, m*0.5) ))
289
+ if 'keypoints' in ann and type(ann['keypoints']) == list:
290
+ # turn skeleton into zero-based index
291
+ sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1
292
+ kp = np.array(ann['keypoints'])
293
+ x = kp[0::3]
294
+ y = kp[1::3]
295
+ v = kp[2::3]
296
+ for sk in sks:
297
+ if np.all(v[sk]>0):
298
+ plt.plot(x[sk],y[sk], linewidth=3, color=c)
299
+ plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2)
300
+ plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2)
301
+
302
+ if draw_bbox:
303
+ [bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
304
+ poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
305
+ np_poly = np.array(poly).reshape((4,2))
306
+ polygons.append(Polygon(np_poly))
307
+ color.append(c)
308
+
309
+ p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4)
310
+ ax.add_collection(p)
311
+ p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2)
312
+ ax.add_collection(p)
313
+ elif datasetType == 'captions':
314
+ for ann in anns:
315
+ print(ann['caption'])
316
+
317
+ def loadRes(self, resFile):
318
+ """
319
+ Load result file and return a result api object.
320
+ :param resFile (str) : file name of result file
321
+ :return: res (obj) : result api object
322
+ """
323
+ res = COCO()
324
+ res.dataset['images'] = [img for img in self.dataset['images']]
325
+
326
+ print('Loading and preparing results...')
327
+ tic = time.time()
328
+ if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):
329
+ anns = json.load(open(resFile))
330
+ elif type(resFile) == np.ndarray:
331
+ anns = self.loadNumpyAnnotations(resFile)
332
+ else:
333
+ anns = resFile
334
+ assert type(anns) == list, 'results is not an array of objects'
335
+ annsImgIds = [ann['image_id'] for ann in anns]
336
+ assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \
337
+ 'Results do not correspond to current coco set'
338
+ if 'caption' in anns[0]:
339
+ imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns])
340
+ res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds]
341
+ for id, ann in enumerate(anns):
342
+ ann['id'] = id+1
343
+ elif 'bbox' in anns[0] and not anns[0]['bbox'] == []:
344
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
345
+ for id, ann in enumerate(anns):
346
+ bb = ann['bbox']
347
+ x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]]
348
+ if not 'segmentation' in ann:
349
+ ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
350
+ ann['area'] = bb[2]*bb[3]
351
+ ann['id'] = id+1
352
+ ann['iscrowd'] = 0
353
+ elif 'segmentation' in anns[0]:
354
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
355
+ for id, ann in enumerate(anns):
356
+ # now only support compressed RLE format as segmentation results
357
+ ann['area'] = maskUtils.area(ann['segmentation'])
358
+ if not 'bbox' in ann:
359
+ ann['bbox'] = maskUtils.toBbox(ann['segmentation'])
360
+ ann['id'] = id+1
361
+ ann['iscrowd'] = 0
362
+ elif 'keypoints' in anns[0]:
363
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
364
+ for id, ann in enumerate(anns):
365
+ s = ann['keypoints']
366
+ x = s[0::3]
367
+ y = s[1::3]
368
+ x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y)
369
+ ann['area'] = (x1-x0)*(y1-y0)
370
+ ann['id'] = id + 1
371
+ ann['bbox'] = [x0,y0,x1-x0,y1-y0]
372
+ print('DONE (t={:0.2f}s)'.format(time.time()- tic))
373
+
374
+ res.dataset['annotations'] = anns
375
+ res.createIndex()
376
+ return res
377
+
378
+ def download(self, tarDir = None, imgIds = [] ):
379
+ '''
380
+ Download COCO images from mscoco.org server.
381
+ :param tarDir (str): COCO results directory name
382
+ imgIds (list): images to be downloaded
383
+ :return:
384
+ '''
385
+ if tarDir is None:
386
+ print('Please specify target directory')
387
+ return -1
388
+ if len(imgIds) == 0:
389
+ imgs = self.imgs.values()
390
+ else:
391
+ imgs = self.loadImgs(imgIds)
392
+ N = len(imgs)
393
+ if not os.path.exists(tarDir):
394
+ os.makedirs(tarDir)
395
+ for i, img in enumerate(imgs):
396
+ tic = time.time()
397
+ fname = os.path.join(tarDir, img['file_name'])
398
+ if not os.path.exists(fname):
399
+ urlretrieve(img['coco_url'], fname)
400
+ print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
401
+
402
+ def loadNumpyAnnotations(self, data):
403
+ """
404
+ Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class}
405
+ :param data (numpy.ndarray)
406
+ :return: annotations (python nested list)
407
+ """
408
+ print('Converting ndarray to lists...')
409
+ assert(type(data) == np.ndarray)
410
+ print(data.shape)
411
+ assert(data.shape[1] == 7)
412
+ N = data.shape[0]
413
+ ann = []
414
+ for i in range(N):
415
+ if i % 1000000 == 0:
416
+ print('{}/{}'.format(i,N))
417
+ ann += [{
418
+ 'image_id' : int(data[i, 0]),
419
+ 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
420
+ 'score' : data[i, 5],
421
+ 'category_id': int(data[i, 6]),
422
+ }]
423
+ return ann
424
+
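To make the Nx7 layout concrete, a small sketch with hypothetical values (one detection per row: image_id, x, y, w, h, score, category_id), again reusing coco_gt from above:

    import numpy as np

    dets = np.array([[397133, 10.0, 20.0, 50.0, 80.0, 0.91,  1],
                     [397133, 30.0, 40.0, 25.0, 25.0, 0.45, 18]])
    anns = coco_gt.loadNumpyAnnotations(dets)
    # anns[0] -> {'image_id': 397133, 'bbox': [10.0, 20.0, 50.0, 80.0],
    #             'score': 0.91, 'category_id': 1}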
425
+ def annToRLE(self, ann):
426
+ """
427
+ Convert annotation, which can be polygons or uncompressed RLE, to compressed RLE.
+ :return: RLE (run-length encoding of the mask)
429
+ """
430
+ t = self.imgs[ann['image_id']]
431
+ h, w = t['height'], t['width']
432
+ segm = ann['segmentation']
433
+ if type(segm) == list:
434
+ # polygon -- a single object might consist of multiple parts
435
+ # we merge all parts into one mask rle code
436
+ rles = maskUtils.frPyObjects(segm, h, w)
437
+ rle = maskUtils.merge(rles)
438
+ elif type(segm['counts']) == list:
439
+ # uncompressed RLE
440
+ rle = maskUtils.frPyObjects(segm, h, w)
441
+ else:
442
+ # rle
443
+ rle = ann['segmentation']
444
+ return rle
445
+
446
+ def annToMask(self, ann):
447
+ """
448
+ Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
449
+ :return: binary mask (numpy 2D array)
450
+ """
451
+ rle = self.annToRLE(ann)
452
+ m = maskUtils.decode(rle)
453
+ return m
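A sketch of decoding one ground-truth annotation into a binary mask with annToMask (the image id is hypothetical; reuses coco_gt from the sketches above):

    ann_ids = coco_gt.getAnnIds(imgIds=[397133])
    ann = coco_gt.loadAnns(ann_ids)[0]
    mask = coco_gt.annToMask(ann)          # uint8 array of shape (height, width)
    print(mask.shape, int(mask.sum()), 'foreground pixels')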
MaskClustering/third_party/Entity/Entity/EntitySeg/modified_cocoapi/PythonAPI/pycocotools/cocoeval.py ADDED
@@ -0,0 +1,534 @@
1
+ __author__ = 'tsungyi'
2
+
3
+ import numpy as np
4
+ import datetime
5
+ import time
6
+ from collections import defaultdict
7
+ from . import mask as maskUtils
8
+ import copy
9
+
10
+ class COCOeval:
11
+ # Interface for evaluating detection on the Microsoft COCO dataset.
12
+ #
13
+ # The usage for CocoEval is as follows:
14
+ # cocoGt=..., cocoDt=... # load dataset and results
15
+ # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object
16
+ # E.params.recThrs = ...; # set parameters as desired
17
+ # E.evaluate(); # run per image evaluation
18
+ # E.accumulate(); # accumulate per image results
19
+ # E.summarize(); # display summary metrics of results
20
+ # For example usage see evalDemo.m and http://mscoco.org/.
21
+ #
22
+ # The evaluation parameters are as follows (defaults in brackets):
23
+ # imgIds - [all] N img ids to use for evaluation
24
+ # catIds - [all] K cat ids to use for evaluation
25
+ # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation
26
+ # recThrs - [0:.01:1] R=101 recall thresholds for evaluation
27
+ # areaRng - [...] A=4 object area ranges for evaluation
28
+ # maxDets - [1 10 100] M=3 thresholds on max detections per image
29
+ # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints'
30
+ # iouType replaced the now DEPRECATED useSegm parameter.
31
+ # useCats - [1] if true use category labels for evaluation
32
+ # Note: if useCats=0 category labels are ignored as in proposal scoring.
33
+ # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified.
34
+ #
35
+ # evaluate(): evaluates detections on every image and every category and
36
+ # concats the results into the "evalImgs" with fields:
37
+ # dtIds - [1xD] id for each of the D detections (dt)
38
+ # gtIds - [1xG] id for each of the G ground truths (gt)
39
+ # dtMatches - [TxD] matching gt id at each IoU or 0
40
+ # gtMatches - [TxG] matching dt id at each IoU or 0
41
+ # dtScores - [1xD] confidence of each dt
42
+ # gtIgnore - [1xG] ignore flag for each gt
43
+ # dtIgnore - [TxD] ignore flag for each dt at each IoU
44
+ #
45
+ # accumulate(): accumulates the per-image, per-category evaluation
46
+ # results in "evalImgs" into the dictionary "eval" with fields:
47
+ # params - parameters used for evaluation
48
+ # date - date evaluation was performed
49
+ # counts - [T,R,K,A,M] parameter dimensions (see above)
50
+ # precision - [TxRxKxAxM] precision for every evaluation setting
51
+ # recall - [TxKxAxM] max recall for every evaluation setting
52
+ # Note: precision and recall==-1 for settings with no gt objects.
53
+ #
54
+ # See also coco, mask, pycocoDemo, pycocoEvalDemo
55
+ #
56
+ # Microsoft COCO Toolbox. version 2.0
57
+ # Data, paper, and tutorials available at: http://mscoco.org/
58
+ # Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
59
+ # Licensed under the Simplified BSD License [see coco/license.txt]
60
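A runnable version of the workflow sketched in the comments above (editorial note; coco_gt and coco_dt are the objects built with COCO(...) and loadRes(...) as in the coco.py sketches earlier):

    from pycocotools.cocoeval import COCOeval

    E = COCOeval(coco_gt, coco_dt, iouType='bbox')
    E.params.imgIds = sorted(coco_gt.getImgIds())   # optional: restrict/order images
    E.evaluate()      # per-image, per-category matching at each IoU threshold
    E.accumulate()    # fill the [T,R,K,A,M] precision/recall arrays
    E.summarize()     # print the standard AP/AR table and fill E.stats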
+ def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'):
61
+ '''
62
+ Initialize CocoEval using coco APIs for gt and dt
63
+ :param cocoGt: coco object with ground truth annotations
64
+ :param cocoDt: coco object with detection results
65
+ :return: None
66
+ '''
67
+ if not iouType:
68
+ print('iouType not specified. use default iouType segm')
69
+ self.cocoGt = cocoGt # ground truth COCO API
70
+ self.cocoDt = cocoDt # detections COCO API
71
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements
72
+ self.eval = {} # accumulated evaluation results
73
+ self._gts = defaultdict(list) # gt for evaluation
74
+ self._dts = defaultdict(list) # dt for evaluation
75
+ self.params = Params(iouType=iouType) # parameters
76
+ self._paramsEval = {} # parameters for evaluation
77
+ self.stats = [] # result summarization
78
+ self.ious = {} # ious between all gts and dts
79
+ if not cocoGt is None:
80
+ self.params.imgIds = sorted(cocoGt.getImgIds())
81
+ self.params.catIds = sorted(cocoGt.getCatIds())
82
+
83
+
84
+ def _prepare(self):
85
+ '''
86
+ Prepare ._gts and ._dts for evaluation based on params
87
+ :return: None
88
+ '''
89
+ def _toMask(anns, coco):
90
+ # modify ann['segmentation'] by reference
91
+ for ann in anns:
92
+ rle = coco.annToRLE(ann)
93
+ ann['segmentation'] = rle
94
+ p = self.params
95
+ if p.useCats:
96
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
97
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds))
98
+ else:
99
+ gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds))
100
+ dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds))
101
+
102
+ # convert ground truth to mask if iouType == 'segm'
103
+ if p.iouType == 'segm':
104
+ _toMask(gts, self.cocoGt)
105
+ _toMask(dts, self.cocoDt)
106
+ # set ignore flag
107
+ for gt in gts:
108
+ gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0
109
+ gt['ignore'] = 'iscrowd' in gt and gt['iscrowd']
110
+ if p.iouType == 'keypoints':
111
+ gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore']
112
+ self._gts = defaultdict(list) # gt for evaluation
113
+ self._dts = defaultdict(list) # dt for evaluation
114
+ for gt in gts:
115
+ self._gts[gt['image_id'], gt['category_id']].append(gt)
116
+ for dt in dts:
117
+ self._dts[dt['image_id'], dt['category_id']].append(dt)
118
+ self.evalImgs = defaultdict(list) # per-image per-category evaluation results
119
+ self.eval = {} # accumulated evaluation results
120
+
121
+ def evaluate(self):
122
+ '''
123
+ Run per image evaluation on given images and store results (a list of dict) in self.evalImgs
124
+ :return: None
125
+ '''
126
+ tic = time.time()
127
+ print('Running per image evaluation...')
128
+ p = self.params
129
+ # add backward compatibility if useSegm is specified in params
130
+ if not p.useSegm is None:
131
+ p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
132
+ print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
133
+ print('Evaluate annotation type *{}*'.format(p.iouType))
134
+ p.imgIds = list(np.unique(p.imgIds))
135
+ if p.useCats:
136
+ p.catIds = list(np.unique(p.catIds))
137
+ p.maxDets = sorted(p.maxDets)
138
+ self.params=p
139
+
140
+ self._prepare()
141
+ # loop through images, area range, max detection number
142
+ catIds = p.catIds if p.useCats else [-1]
143
+
144
+ if p.iouType == 'segm' or p.iouType == 'bbox':
145
+ computeIoU = self.computeIoU
146
+ elif p.iouType == 'keypoints':
147
+ computeIoU = self.computeOks
148
+ self.ious = {(imgId, catId): computeIoU(imgId, catId) \
149
+ for imgId in p.imgIds
150
+ for catId in catIds}
151
+
152
+ evaluateImg = self.evaluateImg
153
+ maxDet = p.maxDets[-1]
154
+ self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet)
155
+ for catId in catIds
156
+ for areaRng in p.areaRng
157
+ for imgId in p.imgIds
158
+ ]
159
+ self._paramsEval = copy.deepcopy(self.params)
160
+ toc = time.time()
161
+ print('DONE (t={:0.2f}s).'.format(toc-tic))
162
+
163
+ def computeIoU(self, imgId, catId):
164
+ p = self.params
165
+ if p.useCats:
166
+ gt = self._gts[imgId,catId]
167
+ dt = self._dts[imgId,catId]
168
+ else:
169
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
170
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
171
+ if len(gt) == 0 and len(dt) ==0:
172
+ return []
173
+ inds = np.argsort([-d['score'] for d in dt], kind='mergesort')
174
+ dt = [dt[i] for i in inds]
175
+ if len(dt) > p.maxDets[-1]:
176
+ dt=dt[0:p.maxDets[-1]]
177
+
178
+ if p.iouType == 'segm':
179
+ g = [g['segmentation'] for g in gt]
180
+ d = [d['segmentation'] for d in dt]
181
+ elif p.iouType == 'bbox':
182
+ g = [g['bbox'] for g in gt]
183
+ d = [d['bbox'] for d in dt]
184
+ else:
185
+ raise Exception('unknown iouType for iou computation')
186
+
187
+ # compute iou between each dt and gt region
188
+ iscrowd = [int(o['iscrowd']) for o in gt]
189
+ ious = maskUtils.iou(d,g,iscrowd)
190
+ return ious
191
+
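Once E.evaluate() has populated the internal _gts/_dts indices, computeIoU can also be inspected directly; a sketch with a hypothetical image/category id, reusing E from the sketch above:

    import numpy as np

    ious = E.computeIoU(397133, 1)   # D x G matrix: detections (score-sorted) vs ground truths
    print(np.asarray(ious).shape)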
192
+ def computeOks(self, imgId, catId):
193
+ p = self.params
194
+ # dimension here should be Nxm
195
+ gts = self._gts[imgId, catId]
196
+ dts = self._dts[imgId, catId]
197
+ inds = np.argsort([-d['score'] for d in dts], kind='mergesort')
198
+ dts = [dts[i] for i in inds]
199
+ if len(dts) > p.maxDets[-1]:
200
+ dts = dts[0:p.maxDets[-1]]
201
+ # if len(gts) == 0 and len(dts) == 0:
202
+ if len(gts) == 0 or len(dts) == 0:
203
+ return []
204
+ ious = np.zeros((len(dts), len(gts)))
205
+ sigmas = p.kpt_oks_sigmas
206
+ vars = (sigmas * 2)**2
207
+ k = len(sigmas)
208
+ # compute oks between each detection and ground truth object
209
+ for j, gt in enumerate(gts):
210
+ # create bounds for ignore regions(double the gt bbox)
211
+ g = np.array(gt['keypoints'])
212
+ xg = g[0::3]; yg = g[1::3]; vg = g[2::3]
213
+ k1 = np.count_nonzero(vg > 0)
214
+ bb = gt['bbox']
215
+ x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2
216
+ y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2
217
+ for i, dt in enumerate(dts):
218
+ d = np.array(dt['keypoints'])
219
+ xd = d[0::3]; yd = d[1::3]
220
+ if k1>0:
221
+ # measure the per-keypoint distance if keypoints visible
222
+ dx = xd - xg
223
+ dy = yd - yg
224
+ else:
225
+ # measure minimum distance to keypoints in (x0,y0) & (x1,y1)
226
+ z = np.zeros((k))
227
+ dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0)
228
+ dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0)
229
+ e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2
230
+ if k1 > 0:
231
+ e=e[vg > 0]
232
+ ious[i, j] = np.sum(np.exp(-e)) / e.shape[0]
233
+ return ious
234
+
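The loop above implements the standard OKS: for each visible keypoint i, the similarity term is exp(-d_i^2 / (2 * area * (2*sigma_i)^2)), averaged over keypoints; vars = (sigmas*2)**2 and the division by gt['area'] and 2 implement exactly that. A tiny numeric sketch with hypothetical values:

    import numpy as np

    sigma, area, d = 0.079, 90 * 120, 12.0   # one keypoint, hypothetical gt area and pixel distance
    oks_term = np.exp(-(d ** 2) / (2.0 * area * (2 * sigma) ** 2))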
235
+ def evaluateImg(self, imgId, catId, aRng, maxDet):
236
+ '''
237
+ perform evaluation for single category and image
238
+ :return: dict (single image results)
239
+ '''
240
+ p = self.params
241
+ if p.useCats:
242
+ gt = self._gts[imgId,catId]
243
+ dt = self._dts[imgId,catId]
244
+ else:
245
+ gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]]
246
+ dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]]
247
+ if len(gt) == 0 and len(dt) ==0:
248
+ return None
249
+
250
+ for g in gt:
251
+ if g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]):
252
+ g['_ignore'] = 1
253
+ else:
254
+ g['_ignore'] = 0
255
+
256
+ # sort dt highest score first, sort gt ignore last
257
+ gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort')
258
+ gt = [gt[i] for i in gtind]
259
+ dtind = np.argsort([-d['score'] for d in dt], kind='mergesort')
260
+ dt = [dt[i] for i in dtind[0:maxDet]]
261
+ iscrowd = [int(o['iscrowd']) for o in gt]
262
+ # load computed ious
263
+ ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId]
264
+
265
+ T = len(p.iouThrs)
266
+ G = len(gt)
267
+ D = len(dt)
268
+ gtm = np.zeros((T,G))
269
+ dtm = np.zeros((T,D))
270
+ gtIg = np.array([g['_ignore'] for g in gt])
271
+ dtIg = np.zeros((T,D))
272
+ if not len(ious)==0:
273
+ for tind, t in enumerate(p.iouThrs):
274
+ for dind, d in enumerate(dt):
275
+ # information about best match so far (m=-1 -> unmatched)
276
+ iou = min([t,1-1e-10])
277
+ m = -1
278
+ for gind, g in enumerate(gt):
279
+ # if this gt already matched, and not a crowd, continue
280
+ if gtm[tind,gind]>0 and not iscrowd[gind]:
281
+ continue
282
+ # if dt matched to reg gt, and on ignore gt, stop
283
+ if m>-1 and gtIg[m]==0 and gtIg[gind]==1:
284
+ break
285
+ # continue to next gt unless better match made
286
+ if ious[dind,gind] < iou:
287
+ continue
288
+ # if match successful and best so far, store appropriately
289
+ iou=ious[dind,gind]
290
+ m=gind
291
+ # if match made store id of match for both dt and gt
292
+ if m ==-1:
293
+ continue
294
+ dtIg[tind,dind] = gtIg[m]
295
+ dtm[tind,dind] = gt[m]['id']
296
+ gtm[tind,m] = d['id']
297
+ # set unmatched detections outside of area range to ignore
298
+ a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt)))
299
+ dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0)))
300
+ # store results for given image and category
301
+ return {
302
+ 'image_id': imgId,
303
+ 'category_id': catId,
304
+ 'aRng': aRng,
305
+ 'maxDet': maxDet,
306
+ 'dtIds': [d['id'] for d in dt],
307
+ 'gtIds': [g['id'] for g in gt],
308
+ 'dtMatches': dtm,
309
+ 'gtMatches': gtm,
310
+ 'dtScores': [d['score'] for d in dt],
311
+ 'gtIgnore': gtIg,
312
+ 'dtIgnore': dtIg,
313
+ }
314
+
315
+ def accumulate(self, p = None):
316
+ '''
317
+ Accumulate per image evaluation results and store the result in self.eval
318
+ :param p: input params for evaluation
319
+ :return: None
320
+ '''
321
+ print('Accumulating evaluation results...')
322
+ tic = time.time()
323
+ if not self.evalImgs:
324
+ print('Please run evaluate() first')
325
+ # allows input customized parameters
326
+ if p is None:
327
+ p = self.params
328
+ p.catIds = p.catIds if p.useCats == 1 else [-1]
329
+ T = len(p.iouThrs)
330
+ R = len(p.recThrs)
331
+ K = len(p.catIds) if p.useCats else 1
332
+ A = len(p.areaRng)
333
+ M = len(p.maxDets)
334
+ precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories
335
+ recall = -np.ones((T,K,A,M))
336
+ scores = -np.ones((T,R,K,A,M))
337
+
338
+ # create dictionary for future indexing
339
+ _pe = self._paramsEval
340
+ catIds = _pe.catIds if _pe.useCats else [-1]
341
+ setK = set(catIds)
342
+ setA = set(map(tuple, _pe.areaRng))
343
+ setM = set(_pe.maxDets)
344
+ setI = set(_pe.imgIds)
345
+ # get inds to evaluate
346
+ k_list = [n for n, k in enumerate(p.catIds) if k in setK]
347
+ m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
348
+ a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
349
+ i_list = [n for n, i in enumerate(p.imgIds) if i in setI]
350
+ I0 = len(_pe.imgIds)
351
+ A0 = len(_pe.areaRng)
352
+ # retrieve E at each category, area range, and max number of detections
353
+ for k, k0 in enumerate(k_list):
354
+ Nk = k0*A0*I0
355
+ for a, a0 in enumerate(a_list):
356
+ Na = a0*I0
357
+ for m, maxDet in enumerate(m_list):
358
+ E = [self.evalImgs[Nk + Na + i] for i in i_list]
359
+ E = [e for e in E if not e is None]
360
+ if len(E) == 0:
361
+ continue
362
+ dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E])
363
+
364
+ # different sorting method generates slightly different results.
365
+ # mergesort is used to be consistent with the Matlab implementation.
366
+ inds = np.argsort(-dtScores, kind='mergesort')
367
+ dtScoresSorted = dtScores[inds]
368
+
369
+ dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds]
370
+ dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds]
371
+ gtIg = np.concatenate([e['gtIgnore'] for e in E])
372
+ npig = np.count_nonzero(gtIg==0 )
373
+ if npig == 0:
374
+ continue
375
+ tps = np.logical_and( dtm, np.logical_not(dtIg) )
376
+ fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
377
+
378
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=float)
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=float)
380
+ for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
381
+ tp = np.array(tp)
382
+ fp = np.array(fp)
383
+ nd = len(tp)
384
+ rc = tp / npig
385
+ pr = tp / (fp+tp+np.spacing(1))
386
+ q = np.zeros((R,))
387
+ ss = np.zeros((R,))
388
+
389
+ if nd:
390
+ recall[t,k,a,m] = rc[-1]
391
+ else:
392
+ recall[t,k,a,m] = 0
393
+
394
+ # numpy is slow without cython optimization for accessing elements
395
+ # using python lists gives a significant speed improvement
396
+ pr = pr.tolist(); q = q.tolist()
397
+
398
+ for i in range(nd-1, 0, -1):
399
+ if pr[i] > pr[i-1]:
400
+ pr[i-1] = pr[i]
401
+
402
+ inds = np.searchsorted(rc, p.recThrs, side='left')
403
+ try:
404
+ for ri, pi in enumerate(inds):
405
+ q[ri] = pr[pi]
406
+ ss[ri] = dtScoresSorted[pi]
407
+ except:
408
+ pass
409
+ precision[t,:,k,a,m] = np.array(q)
410
+ scores[t,:,k,a,m] = np.array(ss)
411
+ self.eval = {
412
+ 'params': p,
413
+ 'counts': [T, R, K, A, M],
414
+ 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
415
+ 'precision': precision,
416
+ 'recall': recall,
417
+ 'scores': scores,
418
+ }
419
+ toc = time.time()
420
+ print('DONE (t={:0.2f}s).'.format( toc-tic))
421
+
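After accumulate(), self.eval['precision'] has shape [T, R, K, A, M]; a sketch of pulling out AP at IoU 0.50 for the first category under the default grids (area 'all', maxDets=100), reusing E from above:

    import numpy as np

    prec = E.eval['precision']                           # [T, R, K, A, M]
    t = int(np.argmin(np.abs(E.params.iouThrs - 0.5)))   # index of IoU 0.50
    p = prec[t, :, 0, 0, -1]                             # all recall points
    ap50_cat0 = p[p > -1].mean() if (p > -1).any() else float('nan')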
422
+ def summarize(self):
423
+ '''
424
+ Compute and display summary metrics for evaluation results.
425
+ Note this function can *only* be applied to the default parameter setting
426
+ '''
427
+ def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
428
+ p = self.params
429
+ iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}'
430
+ titleStr = 'Average Precision' if ap == 1 else 'Average Recall'
431
+ typeStr = '(AP)' if ap==1 else '(AR)'
432
+ iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \
433
+ if iouThr is None else '{:0.2f}'.format(iouThr)
434
+
435
+ aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng]
436
+ mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets]
437
+ if ap == 1:
438
+ # dimension of precision: [TxRxKxAxM]
439
+ s = self.eval['precision']
440
+ # IoU
441
+ if iouThr is not None:
442
+ t = np.where(iouThr == p.iouThrs)[0]
443
+ s = s[t]
444
+ s = s[:,:,:,aind,mind]
445
+ else:
446
+ # dimension of recall: [TxKxAxM]
447
+ s = self.eval['recall']
448
+ if iouThr is not None:
449
+ t = np.where(iouThr == p.iouThrs)[0]
450
+ s = s[t]
451
+ s = s[:,:,aind,mind]
452
+ if len(s[s>-1])==0:
453
+ mean_s = -1
454
+ else:
455
+ mean_s = np.mean(s[s>-1])
456
+ print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
457
+ return mean_s
458
+ def _summarizeDets():
459
+ stats = np.zeros((12,))
460
+ stats[0] = _summarize(1)
461
+ stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2])
462
+ stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2])
463
+ stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2])
464
+ stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2])
465
+ stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2])
466
+ stats[6] = _summarize(0, maxDets=self.params.maxDets[0])
467
+ stats[7] = _summarize(0, maxDets=self.params.maxDets[1])
468
+ stats[8] = _summarize(0, maxDets=self.params.maxDets[2])
469
+ stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2])
470
+ stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2])
471
+ stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2])
472
+ return stats
473
+ def _summarizeKps():
474
+ stats = np.zeros((10,))
475
+ stats[0] = _summarize(1, maxDets=20)
476
+ stats[1] = _summarize(1, maxDets=20, iouThr=.5)
477
+ stats[2] = _summarize(1, maxDets=20, iouThr=.75)
478
+ stats[3] = _summarize(1, maxDets=20, areaRng='medium')
479
+ stats[4] = _summarize(1, maxDets=20, areaRng='large')
480
+ stats[5] = _summarize(0, maxDets=20)
481
+ stats[6] = _summarize(0, maxDets=20, iouThr=.5)
482
+ stats[7] = _summarize(0, maxDets=20, iouThr=.75)
483
+ stats[8] = _summarize(0, maxDets=20, areaRng='medium')
484
+ stats[9] = _summarize(0, maxDets=20, areaRng='large')
485
+ return stats
486
+ if not self.eval:
487
+ raise Exception('Please run accumulate() first')
488
+ iouType = self.params.iouType
489
+ if iouType == 'segm' or iouType == 'bbox':
490
+ summarize = _summarizeDets
491
+ elif iouType == 'keypoints':
492
+ summarize = _summarizeKps
493
+ self.stats = summarize()
494
+
495
+ def __str__(self):
496
+ self.summarize()
497
+
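For reference (editorial note), the 12 values _summarizeDets writes into self.stats are, in order: AP@[.5:.95], AP50, AP75, AP small/medium/large, then AR at maxDets 1/10/100 and AR small/medium/large. A small sketch that labels them (assumes iouType 'bbox' or 'segm' and E from the earlier sketch):

    names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl',
             'AR@1', 'AR@10', 'AR@100', 'ARs', 'ARm', 'ARl']
    print(dict(zip(names, E.stats)))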
498
+ class Params:
499
+ '''
500
+ Params for coco evaluation api
501
+ '''
502
+ def setDetParams(self):
503
+ self.imgIds = []
504
+ self.catIds = []
505
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
506
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
507
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
508
+ self.maxDets = [1, 10, 100]
509
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
510
+ self.areaRngLbl = ['all', 'small', 'medium', 'large']
511
+ self.useCats = 1
512
+
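The comment above about np.arange refers to floating-point drift; a quick sketch of why linspace is used for the IoU and recall grids:

    import numpy as np

    # with arange the last element is only approximately 0.95 (float accumulation),
    # and the array length can even vary; linspace hits the endpoints exactly
    print(np.arange(0.5, 1.0, 0.05)[-1])
    print(np.linspace(0.5, 0.95, 10, endpoint=True)[-1])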
513
+ def setKpParams(self):
514
+ self.imgIds = []
515
+ self.catIds = []
516
+ # np.arange causes trouble. the data point on arange is slightly larger than the true value
517
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
518
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
519
+ self.maxDets = [20]
520
+ self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
521
+ self.areaRngLbl = ['all', 'medium', 'large']
522
+ self.useCats = 1
523
+ self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0
524
+
525
+ def __init__(self, iouType='segm'):
526
+ if iouType == 'segm' or iouType == 'bbox':
527
+ self.setDetParams()
528
+ elif iouType == 'keypoints':
529
+ self.setKpParams()
530
+ else:
531
+ raise Exception('iouType not supported')
532
+ self.iouType = iouType
533
+ # useSegm is deprecated
534
+ self.useSegm = None