Benrise committed on
Commit
9b63413
1 Parent(s): 39ecbc6

Add VITON implementation with UI

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +2 -0
  3. README.md +28 -8
  4. app.py +150 -0
  5. configs/VITONHD.yaml +32 -0
  6. lib/caption.py +19 -0
  7. lib/mask.py +64 -0
  8. lib/pose.py +36 -0
  9. preprocess/__init__.py +0 -0
  10. preprocess/humanparsing/__init__.py +0 -0
  11. preprocess/humanparsing/datasets/__init__.py +0 -0
  12. preprocess/humanparsing/datasets/datasets.py +201 -0
  13. preprocess/humanparsing/datasets/simple_extractor_dataset.py +89 -0
  14. preprocess/humanparsing/datasets/target_generation.py +40 -0
  15. preprocess/humanparsing/modules/__init__.py +5 -0
  16. preprocess/humanparsing/modules/bn.py +132 -0
  17. preprocess/humanparsing/modules/deeplab.py +84 -0
  18. preprocess/humanparsing/modules/dense.py +42 -0
  19. preprocess/humanparsing/modules/functions.py +245 -0
  20. preprocess/humanparsing/modules/misc.py +21 -0
  21. preprocess/humanparsing/modules/residual.py +182 -0
  22. preprocess/humanparsing/modules/src/checks.h +15 -0
  23. preprocess/humanparsing/modules/src/inplace_abn.cpp +95 -0
  24. preprocess/humanparsing/modules/src/inplace_abn.h +88 -0
  25. preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp +119 -0
  26. preprocess/humanparsing/modules/src/inplace_abn_cuda.cu +333 -0
  27. preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu +275 -0
  28. preprocess/humanparsing/modules/src/utils/checks.h +15 -0
  29. preprocess/humanparsing/modules/src/utils/common.h +49 -0
  30. preprocess/humanparsing/modules/src/utils/cuda.cuh +71 -0
  31. preprocess/humanparsing/networks/AugmentCE2P.py +388 -0
  32. preprocess/humanparsing/networks/__init__.py +12 -0
  33. preprocess/humanparsing/networks/backbone/mobilenetv2.py +156 -0
  34. preprocess/humanparsing/networks/backbone/resnet.py +205 -0
  35. preprocess/humanparsing/networks/backbone/resnext.py +149 -0
  36. preprocess/humanparsing/networks/context_encoding/aspp.py +64 -0
  37. preprocess/humanparsing/networks/context_encoding/ocnet.py +226 -0
  38. preprocess/humanparsing/networks/context_encoding/psp.py +48 -0
  39. preprocess/humanparsing/parsing_api.py +191 -0
  40. preprocess/humanparsing/run_parsing.py +44 -0
  41. preprocess/humanparsing/utils/__init__.py +0 -0
  42. preprocess/humanparsing/utils/consistency_loss.py +33 -0
  43. preprocess/humanparsing/utils/criterion.py +142 -0
  44. preprocess/humanparsing/utils/encoding.py +187 -0
  45. preprocess/humanparsing/utils/kl_loss.py +43 -0
  46. preprocess/humanparsing/utils/lovasz_softmax.py +279 -0
  47. preprocess/humanparsing/utils/miou.py +155 -0
  48. preprocess/humanparsing/utils/schp.py +80 -0
  49. preprocess/humanparsing/utils/soft_dice_loss.py +111 -0
  50. preprocess/humanparsing/utils/transforms.py +167 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpg filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ __pycache__
2
+ checkpoints
README.md CHANGED
@@ -1,14 +1,34 @@
1
  ---
2
- title: VITON HD
3
- emoji: 🌍
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
- pinned: false
10
- license: cc-by-nc-sa-4.0
11
- short_description: Virtual try-on
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Virtual Try-On
3
+ emoji: 👗
4
+ colorFrom: pink
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
+ pinned: true
10
  ---
11
 
12
+ # Virtual Try-On Demo
13
+ This repository is a working demo implementation of [PromptDresser](https://arxiv.org/abs/2412.16978).
14
+
15
+ > **PromptDresser: Improving the Quality and Controllability of Virtual Try-On via Generative Textual Prompt and Prompt-aware Mask**<br>
16
+ > [Jeongho Kim](https://scholar.google.co.kr/citations?user=4SCCBFwAAAAJ&hl=ko), [Hoiyeong Jin](https://scholar.google.com/citations?user=Jp-zhtUAAAAJ&hl=en), [Sunghyun Park](https://psh01087.github.io/), [Jaegul Choo](https://sites.google.com/site/jaegulchoo/)
17
+
18
+ [[arXiv Paper](https://arxiv.org/abs/2412.16978)]&nbsp;
19
+
20
+ ## Citation
21
+ ```
22
+ @misc{kim2024promptdresserimprovingqualitycontrollability,
23
+ title={PromptDresser: Improving the Quality and Controllability of Virtual Try-On via Generative Textual Prompt and Prompt-aware Mask},
24
+ author={Jeongho Kim and Hoiyeong Jin and Sunghyun Park and Jaegul Choo},
25
+ year={2024},
26
+ eprint={2412.16978},
27
+ archivePrefix={arXiv},
28
+ primaryClass={cs.CV},
29
+ url={https://arxiv.org/abs/2412.16978},
30
+ }
31
+ ```
32
+
33
+ ## License
34
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
app.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ import tempfile
5
+ from huggingface_hub import hf_hub_download
6
+ from diffusers import AutoencoderKL, DDPMScheduler
7
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
8
+
9
+ from promptdresser.models.unet import UNet2DConditionModel
10
+ from promptdresser.models.cloth_encoder import ClothEncoder
11
+ from promptdresser.pipelines.sdxl import PromptDresser
12
+ from lib.caption import generate_caption
13
+ from lib.mask import generate_clothing_mask
14
+ from lib.pose import generate_openpose
15
+
16
+
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+ weight_dtype = torch.float16 if device == "cuda" else torch.float32
19
+
20
+ def load_models():
21
+ print("⚙️ Loading models...")
22
+
23
+ noise_scheduler = DDPMScheduler.from_pretrained(
24
+ "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
25
+ subfolder="scheduler"
26
+ )
27
+ tokenizer = CLIPTokenizer.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="tokenizer")
28
+ text_encoder = CLIPTextModel.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="text_encoder")
29
+ tokenizer_2 = CLIPTokenizer.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="tokenizer_2")
30
+ text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="text_encoder_2")
31
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
32
+ unet = UNet2DConditionModel.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="unet")
33
+ cloth_encoder = ClothEncoder.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet")
34
+
35
+ unet_checkpoint_path = hf_hub_download(
36
+ repo_id="Benrise/VITON-HD",
37
+ filename="VITONHD/model/pytorch_model.bin",
38
+ cache_dir="checkpoints"
39
+ )
40
+ unet.load_state_dict(torch.load(unet_checkpoint_path))
41
+
42
+ models = {
43
+ "unet": unet.to(device, dtype=weight_dtype),
44
+ "vae": vae.to(device, dtype=weight_dtype),
45
+ "text_encoder": text_encoder.to(device, dtype=weight_dtype),
46
+ "text_encoder_2": text_encoder_2.to(device, dtype=weight_dtype),
47
+ "cloth_encoder": cloth_encoder.to(device, dtype=weight_dtype),
48
+ "noise_scheduler": noise_scheduler,
49
+ "tokenizer": tokenizer,
50
+ "tokenizer_2": tokenizer_2
51
+ }
52
+
53
+ pipeline = PromptDresser(
54
+ vae=models["vae"],
55
+ text_encoder=models["text_encoder"],
56
+ text_encoder_2=models["text_encoder_2"],
57
+ tokenizer=models["tokenizer"],
58
+ tokenizer_2=models["tokenizer_2"],
59
+ unet=models["unet"],
60
+ scheduler=models["noise_scheduler"],
61
+ ).to(device, dtype=weight_dtype)
62
+
63
+ return {**models, "pipeline": pipeline}
64
+
65
+ models = load_models()
66
+ pipeline = models["pipeline"]
67
+
68
+ def generate_vton(person_image, cloth_image, outfit_prompt="", clothing_prompt=""):
69
+ with tempfile.TemporaryDirectory() as tmp_dir:
70
+ person_path = os.path.join(tmp_dir, "person.png")
71
+ cloth_path = os.path.join(tmp_dir, "cloth.png")
72
+
73
+ person_image.save(person_path)
74
+ cloth_image.save(cloth_path)
75
+
76
+ mask_path = os.path.join(tmp_dir, "mask.png")
77
+ pose_path = os.path.join(tmp_dir, "pose.png")
78
+
79
+ mask_image = generate_clothing_mask(person_path, label=4, output_path=mask_path, show_result=False)
80
+ pose_image = generate_openpose(person_path, output_image_path=pose_path, show_result=False)
81
+
82
+ auto_outfit_prompt = generate_caption(person_path, device)
83
+ auto_clothing_prompt = generate_caption(cloth_path, device)
84
+
85
+ final_outfit_prompt = outfit_prompt or auto_outfit_prompt
86
+ final_clothing_prompt = clothing_prompt or auto_clothing_prompt
87
+
88
+ with torch.autocast(device):
89
+ result = pipeline(
90
+ image=person_image,
91
+ mask_image=mask_image,
92
+ pose_image=pose_image,
93
+ cloth_encoder=models["cloth_encoder"],
94
+ cloth_encoder_image=cloth_image,
95
+ prompt=final_outfit_prompt,
96
+ prompt_clothing=final_clothing_prompt,
97
+ height=1024,
98
+ width=768,
99
+ guidance_scale=2.0,
100
+ guidance_scale_img=4.5,
101
+ guidance_scale_text=7.5,
102
+ num_inference_steps=30,
103
+ strength=1,
104
+ interm_cloth_start_ratio=0.5,
105
+ generator=None,
106
+ ).images[0]
107
+
108
+ return result
109
+
110
+ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container") as demo:
111
+ gr.Markdown("# 🧥 Virtual Try-On")
112
+ gr.Markdown("Upload a photo of the person and of the garment for a virtual try-on")
113
+
114
+ with gr.Row():
115
+ with gr.Column():
116
+ person_input = gr.Image(label="Person photo", type="pil", sources=["upload"])
117
+ cloth_input = gr.Image(label="Garment photo", type="pil", sources=["upload"])
118
+ outfit_prompt = gr.Textbox(label="Outfit description (optional)", placeholder="e.g. man in casual outfit")
119
+ clothing_prompt = gr.Textbox(label="Clothing description (optional)", placeholder="e.g. red t-shirt with print")
120
+ generate_btn = gr.Button("Generate try-on", variant="primary")
121
+
122
+ gr.Examples(
123
+ examples=[
124
+ ["./test/person2.png", "./test/00008_00.jpg", "man in skirt", "black longsleeve"]
125
+ ],
126
+ inputs=[person_input, cloth_input, outfit_prompt, clothing_prompt],
127
+ label="Examples for quick testing"
128
+ )
129
+
130
+ with gr.Column():
131
+ output_image = gr.Image(label="Try-on result", interactive=False)
132
+
133
+ generate_btn.click(
134
+ fn=generate_vton,
135
+ inputs=[person_input, cloth_input, outfit_prompt, clothing_prompt],
136
+ outputs=output_image
137
+ )
138
+
139
+ gr.Markdown("### Instructions:")
140
+ gr.Markdown("1. Upload a clear, full-body photo of the person\n"
141
+ "2. Upload a photo of the garment on a white background\n"
142
+ "3. Optionally refine the outfit or clothing description\n"
143
+ "4. Click the 'Generate try-on' button")
144
+
145
+ if __name__ == "__main__":
146
+ demo.queue(max_size=3).launch(
147
+ server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
148
+ share=os.getenv("GRADIO_SHARE") == "True",
149
+ debug=True
150
+ )
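For reference, `generate_vton` can also be called outside the Gradio UI. A minimal sketch, reusing the example images that app.py ships for the demo (any RGB person/garment photos would do):

```python
from PIL import Image

# Example inputs taken from the gr.Examples entry above.
person = Image.open("./test/person2.png").convert("RGB")
cloth = Image.open("./test/00008_00.jpg").convert("RGB")

# Prompts are optional: when omitted, captions are auto-generated with the GIT captioner.
result = generate_vton(person, cloth,
                       outfit_prompt="man in casual outfit",
                       clothing_prompt="black longsleeve")
result.save("tryon_result.png")
```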
configs/VITONHD.yaml ADDED
@@ -0,0 +1,32 @@
1
+ no_pose: True
2
+ use_jointcond: True
3
+ no_ipadapter: True
4
+
5
+ use_interm_cloth_mask: True
6
+ interm_cloth_start_ratio: 0.5
7
+
8
+ dataset:
9
+ dataset_name: "VITONHDDataset"
10
+ data_root_dir: "./DATA/zalando-hd-resized"
11
+ img_spatial_transform_lst:
12
+ - "randomresizedcrop"
13
+ - "randomaffine"
14
+ cloth_spatial_transform_lst:
15
+ - "randomresizedcrop"
16
+ - "randomaffine"
17
+ img_cloth_spatial_transform_lst:
18
+ - "hflip"
19
+ color_transform_lst:
20
+ - "colorjitter"
21
+ i_drop_rate: 0.05
22
+ pose_type: "densepose"
23
+ train_folder_name: train_fine
24
+ test_folder_name: test_fine
25
+ prompt_version: v12
26
+ text_file_postfix: "gpt4o.json"
27
+ train_folder_name_for_interm_cloth_mask: train_coarse
28
+ test_folder_name_for_interm_cloth_mask: test_coarse
29
+ use_rand_dilate: True
30
+
31
+ rand_dilate_miniter: 0
32
+ rand_dilate_maxiter: 200
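The code that consumes this config is not part of this commit, so the loader below is only an illustrative sketch (the PromptDresser training code may use its own config machinery); it just shows that the keys mirror the values hard-coded in app.py:

```python
import yaml

# Illustration only: plain PyYAML load of the committed config.
with open("configs/VITONHD.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["interm_cloth_start_ratio"])        # 0.5, the value also passed to the pipeline in app.py
print(cfg["dataset"]["pose_type"])            # "densepose"
print(cfg["dataset"]["img_spatial_transform_lst"])  # ["randomresizedcrop", "randomaffine"]
```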
lib/caption.py ADDED
@@ -0,0 +1,19 @@
1
+ from PIL import Image
2
+ from transformers import AutoProcessor, AutoModelForCausalLM
3
+
4
+
5
+ def generate_caption(image_path, device="cuda"):
6
+ print("Generating caption...")
7
+ processor = AutoProcessor.from_pretrained("microsoft/git-base", use_fast=False)
8
+ model = AutoModelForCausalLM.from_pretrained("microsoft/git-base").to(device)
9
+ image = Image.open(image_path).convert("RGB")
10
+
11
+ inputs = processor(images=image, return_tensors="pt").to(device)
12
+ generated_ids = model.generate(
13
+ pixel_values=inputs.pixel_values,
14
+ max_length=50,
15
+ pad_token_id=processor.tokenizer.pad_token_id
16
+ )
17
+ caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
18
+ print("Generated caption:", caption)
19
+ return caption
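A usage sketch for `generate_caption` ("person.png" is a placeholder path). Note that the GIT processor and model are re-instantiated on every call; caching them at module level would avoid repeated loading inside the Gradio handler.

```python
# Hypothetical usage; "person.png" is a placeholder path.
caption = generate_caption("person.png", device="cuda")
print(caption)  # a short natural-language description of the photo
```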
lib/mask.py ADDED
@@ -0,0 +1,64 @@
1
+ from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
2
+ from PIL import Image
3
+ import numpy as np
4
+ import requests
5
+ import torch.nn.functional as F
6
+ import torch
7
+ import os
8
+
9
+ def generate_clothing_mask(
10
+ image_path: str,
11
+ label: int,
12
+ output_path: str = "./output_mask.png",
13
+ model_name: str = "mattmdjaga/segformer_b2_clothes",
14
+ show_result: bool = False
15
+ ) -> Image.Image:
16
+ """
17
+ Generates a binary mask for the given clothing class and saves it.
18
+
19
+ Args:
20
+ image_path: Path to an image file or a URL
21
+ label: Class index to segment (0-17)
22
+ output_path: Path where the mask is saved
23
+ model_name: Hugging Face model name
24
+ show_result: Show the result with matplotlib
25
+
26
+ Returns:
27
+ PIL.Image: Binary mask (white = selected class, black = everything else)
28
+ """
29
+
30
+ processor = SegformerImageProcessor.from_pretrained(model_name)
31
+ model = AutoModelForSemanticSegmentation.from_pretrained(model_name)
32
+
33
+ if image_path.startswith(('http://', 'https://')):
34
+ image = Image.open(requests.get(image_path, stream=True).raw)
35
+ else:
36
+ image = Image.open(image_path)
37
+
38
+ if image.mode != 'RGB':
39
+ image = image.convert('RGB')
40
+
41
+ image_np = np.array(image)
42
+ if len(image_np.shape) != 3 or image_np.shape[2] != 3:
43
+ raise ValueError("Изображение должно быть в формате RGB (H, W, 3)")
44
+
45
+ inputs = processor(images=image, return_tensors="pt")
46
+ with torch.no_grad():
47
+ outputs = model(**inputs)
48
+
49
+ logits = outputs.logits
50
+ upsampled_logits = F.interpolate(
51
+ logits,
52
+ size=image.size[::-1],
53
+ mode="bilinear",
54
+ align_corners=False,
55
+ )
56
+
57
+ pred_seg = upsampled_logits.argmax(dim=1)[0]
58
+ mask = (pred_seg == label).numpy().astype('uint8') * 255
59
+ mask_image = Image.fromarray(mask)
60
+
61
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
62
+ mask_image.save(output_path)
63
+
64
+ return mask_image
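A usage sketch matching the call in app.py. Label 4 is the "Upper-clothes" class in the published label map of mattmdjaga/segformer_b2_clothes (an assumption worth checking against the model card if a different garment class is needed):

```python
# "person.png" is a placeholder path.
mask = generate_clothing_mask("person.png", label=4, output_path="./output/mask.png")
# The mask has the same resolution as the input: white pixels mark the
# upper-clothes region that the inpainting pipeline is allowed to repaint.
print(mask.size)
```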
lib/pose.py ADDED
@@ -0,0 +1,36 @@
1
+ from controlnet_aux import OpenposeDetector
2
+ from PIL import Image
3
+ import torch
4
+
5
+
6
+ def generate_openpose(
7
+ input_image_path: str,
8
+ output_image_path: str = None,
9
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
10
+ show_result: bool = False
11
+ ) -> Image.Image:
12
+ """
13
+ Generates an OpenPose pose map from the input image.
14
+
15
+ Parameters:
16
+ input_image_path (str): Path to the source image
17
+ output_image_path (str, optional): Path for saving the result; if None, nothing is saved.
18
+ device (str): Device used for processing ('cuda' or 'cpu')
19
+ show_result (bool): Whether to display the result immediately
20
+
21
+ Returns:
22
+ Image.Image: Image containing the OpenPose pose map
23
+ """
24
+ openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet").to(device)
25
+
26
+ image = Image.open(input_image_path).convert("RGB")
27
+
28
+ openpose_map = openpose(image)
29
+
30
+ if output_image_path:
31
+ openpose_map.save(output_image_path)
32
+
33
+ if show_result:
34
+ openpose_map.show()
35
+
36
+ return openpose_map  # return the generated pose map, not the original input image
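Usage sketch (placeholder paths); the detector weights are downloaded from lllyasviel/ControlNet on first use:

```python
# "person.png" is a placeholder path.
pose_map = generate_openpose("person.png", output_image_path="./output/pose.png")
# pose_map is a PIL image with the detected body skeleton rendered on a black background.
pose_map.size
```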
preprocess/__init__.py ADDED
File without changes
preprocess/humanparsing/__init__.py ADDED
File without changes
preprocess/humanparsing/datasets/__init__.py ADDED
File without changes
preprocess/humanparsing/datasets/datasets.py ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : datasets.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import numpy as np
16
+ import random
17
+ import torch
18
+ import cv2
19
+ from torch.utils import data
20
+ from utils.transforms import get_affine_transform
21
+
22
+
23
+ class LIPDataSet(data.Dataset):
24
+ def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
25
+ rotation_factor=30, ignore_label=255, transform=None):
26
+ self.root = root
27
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
28
+ self.crop_size = np.asarray(crop_size)
29
+ self.ignore_label = ignore_label
30
+ self.scale_factor = scale_factor
31
+ self.rotation_factor = rotation_factor
32
+ self.flip_prob = 0.5
33
+ self.transform = transform
34
+ self.dataset = dataset
35
+
36
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
37
+ train_list = [i_id.strip() for i_id in open(list_path)]
38
+
39
+ self.train_list = train_list
40
+ self.number_samples = len(self.train_list)
41
+
42
+ def __len__(self):
43
+ return self.number_samples
44
+
45
+ def _box2cs(self, box):
46
+ x, y, w, h = box[:4]
47
+ return self._xywh2cs(x, y, w, h)
48
+
49
+ def _xywh2cs(self, x, y, w, h):
50
+ center = np.zeros((2), dtype=np.float32)
51
+ center[0] = x + w * 0.5
52
+ center[1] = y + h * 0.5
53
+ if w > self.aspect_ratio * h:
54
+ h = w * 1.0 / self.aspect_ratio
55
+ elif w < self.aspect_ratio * h:
56
+ w = h * self.aspect_ratio
57
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
58
+ return center, scale
59
+
60
+ def __getitem__(self, index):
61
+ train_item = self.train_list[index]
62
+
63
+ im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
64
+ parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
65
+
66
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
67
+ h, w, _ = im.shape
68
+ parsing_anno = np.zeros((h, w), dtype=np.long)
69
+
70
+ # Get person center and scale
71
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
72
+ r = 0
73
+
74
+ if self.dataset != 'test':
75
+ # Get pose annotation
76
+ parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
77
+ if self.dataset == 'train' or self.dataset == 'trainval':
78
+ sf = self.scale_factor
79
+ rf = self.rotation_factor
80
+ s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
81
+ r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
82
+
83
+ if random.random() <= self.flip_prob:
84
+ im = im[:, ::-1, :]
85
+ parsing_anno = parsing_anno[:, ::-1]
86
+ person_center[0] = im.shape[1] - person_center[0] - 1
87
+ right_idx = [15, 17, 19]
88
+ left_idx = [14, 16, 18]
89
+ for i in range(0, 3):
90
+ right_pos = np.where(parsing_anno == right_idx[i])
91
+ left_pos = np.where(parsing_anno == left_idx[i])
92
+ parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
93
+ parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
94
+
95
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
96
+ input = cv2.warpAffine(
97
+ im,
98
+ trans,
99
+ (int(self.crop_size[1]), int(self.crop_size[0])),
100
+ flags=cv2.INTER_LINEAR,
101
+ borderMode=cv2.BORDER_CONSTANT,
102
+ borderValue=(0, 0, 0))
103
+
104
+ if self.transform:
105
+ input = self.transform(input)
106
+
107
+ meta = {
108
+ 'name': train_item,
109
+ 'center': person_center,
110
+ 'height': h,
111
+ 'width': w,
112
+ 'scale': s,
113
+ 'rotation': r
114
+ }
115
+
116
+ if self.dataset == 'val' or self.dataset == 'test':
117
+ return input, meta
118
+ else:
119
+ label_parsing = cv2.warpAffine(
120
+ parsing_anno,
121
+ trans,
122
+ (int(self.crop_size[1]), int(self.crop_size[0])),
123
+ flags=cv2.INTER_NEAREST,
124
+ borderMode=cv2.BORDER_CONSTANT,
125
+ borderValue=(255))
126
+
127
+ label_parsing = torch.from_numpy(label_parsing)
128
+
129
+ return input, label_parsing, meta
130
+
131
+
132
+ class LIPDataValSet(data.Dataset):
133
+ def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
134
+ self.root = root
135
+ self.crop_size = crop_size
136
+ self.transform = transform
137
+ self.flip = flip
138
+ self.dataset = dataset
139
+ self.root = root
140
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
141
+ self.crop_size = np.asarray(crop_size)
142
+
143
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
144
+ val_list = [i_id.strip() for i_id in open(list_path)]
145
+
146
+ self.val_list = val_list
147
+ self.number_samples = len(self.val_list)
148
+
149
+ def __len__(self):
150
+ return len(self.val_list)
151
+
152
+ def _box2cs(self, box):
153
+ x, y, w, h = box[:4]
154
+ return self._xywh2cs(x, y, w, h)
155
+
156
+ def _xywh2cs(self, x, y, w, h):
157
+ center = np.zeros((2), dtype=np.float32)
158
+ center[0] = x + w * 0.5
159
+ center[1] = y + h * 0.5
160
+ if w > self.aspect_ratio * h:
161
+ h = w * 1.0 / self.aspect_ratio
162
+ elif w < self.aspect_ratio * h:
163
+ w = h * self.aspect_ratio
164
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
165
+
166
+ return center, scale
167
+
168
+ def __getitem__(self, index):
169
+ val_item = self.val_list[index]
170
+ # Load training image
171
+ im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
172
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
173
+ h, w, _ = im.shape
174
+ # Get person center and scale
175
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
176
+ r = 0
177
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
178
+ input = cv2.warpAffine(
179
+ im,
180
+ trans,
181
+ (int(self.crop_size[1]), int(self.crop_size[0])),
182
+ flags=cv2.INTER_LINEAR,
183
+ borderMode=cv2.BORDER_CONSTANT,
184
+ borderValue=(0, 0, 0))
185
+ input = self.transform(input)
186
+ flip_input = input.flip(dims=[-1])
187
+ if self.flip:
188
+ batch_input_im = torch.stack([input, flip_input])
189
+ else:
190
+ batch_input_im = input
191
+
192
+ meta = {
193
+ 'name': val_item,
194
+ 'center': person_center,
195
+ 'height': h,
196
+ 'width': w,
197
+ 'scale': s,
198
+ 'rotation': r
199
+ }
200
+
201
+ return batch_input_im, meta
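The `_xywh2cs` helper pads the person box to the crop's aspect ratio before the affine warp. A standalone sketch of the same arithmetic (not part of the original file):

```python
import numpy as np

def xywh2cs(x, y, w, h, aspect_ratio=1.0):
    # Mirrors LIPDataSet._xywh2cs: box center plus a scale padded to the
    # target aspect ratio (a 473x473 crop gives aspect_ratio == 1.0).
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    return center, np.array([w, h], dtype=np.float32)

# A 600x400 image (box [0, 0, 599, 399]) becomes center (299.5, 199.5), scale (599, 599).
print(xywh2cs(0, 0, 599, 399))
```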
preprocess/humanparsing/datasets/simple_extractor_dataset.py ADDED
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : dataset.py
8
+ @Time : 8/30/19 9:12 PM
9
+ @Desc : Dataset Definition
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import pdb
16
+
17
+ import cv2
18
+ import numpy as np
19
+ from PIL import Image
20
+ from torch.utils import data
21
+ from utils.transforms import get_affine_transform
22
+
23
+
24
+ class SimpleFolderDataset(data.Dataset):
25
+ def __init__(self, root, input_size=[512, 512], transform=None):
26
+ self.root = root
27
+ self.input_size = input_size
28
+ self.transform = transform
29
+ self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
30
+ self.input_size = np.asarray(input_size)
31
+ self.is_pil_image = False
32
+ if isinstance(root, Image.Image):
33
+ self.file_list = [root]
34
+ self.is_pil_image = True
35
+ elif os.path.isfile(root):
36
+ self.file_list = [os.path.basename(root)]
37
+ self.root = os.path.dirname(root)
38
+ else:
39
+ self.file_list = os.listdir(self.root)
40
+
41
+ def __len__(self):
42
+ return len(self.file_list)
43
+
44
+ def _box2cs(self, box):
45
+ x, y, w, h = box[:4]
46
+ return self._xywh2cs(x, y, w, h)
47
+
48
+ def _xywh2cs(self, x, y, w, h):
49
+ center = np.zeros((2), dtype=np.float32)
50
+ center[0] = x + w * 0.5
51
+ center[1] = y + h * 0.5
52
+ if w > self.aspect_ratio * h:
53
+ h = w * 1.0 / self.aspect_ratio
54
+ elif w < self.aspect_ratio * h:
55
+ w = h * self.aspect_ratio
56
+ scale = np.array([w, h], dtype=np.float32)
57
+ return center, scale
58
+
59
+ def __getitem__(self, index):
60
+ if self.is_pil_image:
61
+ img = np.asarray(self.file_list[index])[:, :, [2, 1, 0]]
62
+ else:
63
+ img_name = self.file_list[index]
64
+ img_path = os.path.join(self.root, img_name)
65
+ img = cv2.imread(img_path, cv2.IMREAD_COLOR)
66
+ h, w, _ = img.shape
67
+
68
+ # Get person center and scale
69
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
70
+ r = 0
71
+ trans = get_affine_transform(person_center, s, r, self.input_size)
72
+ input = cv2.warpAffine(
73
+ img,
74
+ trans,
75
+ (int(self.input_size[1]), int(self.input_size[0])),
76
+ flags=cv2.INTER_LINEAR,
77
+ borderMode=cv2.BORDER_CONSTANT,
78
+ borderValue=(0, 0, 0))
79
+
80
+ input = self.transform(input)
81
+ meta = {
82
+ 'center': person_center,
83
+ 'height': h,
84
+ 'width': w,
85
+ 'scale': s,
86
+ 'rotation': r
87
+ }
88
+
89
+ return input, meta
preprocess/humanparsing/datasets/target_generation.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+
5
+ def generate_edge_tensor(label, edge_width=3):
6
+ label = label.type(torch.cuda.FloatTensor)
7
+ if len(label.shape) == 2:
8
+ label = label.unsqueeze(0)
9
+ n, h, w = label.shape
10
+ edge = torch.zeros(label.shape, dtype=torch.float).cuda()
11
+ # right
12
+ edge_right = edge[:, 1:h, :]
13
+ edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
14
+ & (label[:, :h - 1, :] != 255)] = 1
15
+
16
+ # up
17
+ edge_up = edge[:, :, :w - 1]
18
+ edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
19
+ & (label[:, :, :w - 1] != 255)
20
+ & (label[:, :, 1:w] != 255)] = 1
21
+
22
+ # upright
23
+ edge_upright = edge[:, :h - 1, :w - 1]
24
+ edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
25
+ & (label[:, :h - 1, :w - 1] != 255)
26
+ & (label[:, 1:h, 1:w] != 255)] = 1
27
+
28
+ # bottomright
29
+ edge_bottomright = edge[:, :h - 1, 1:w]
30
+ edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
31
+ & (label[:, :h - 1, 1:w] != 255)
32
+ & (label[:, 1:h, :w - 1] != 255)] = 1
33
+
34
+ kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
35
+ with torch.no_grad():
36
+ edge = edge.unsqueeze(1)
37
+ edge = F.conv2d(edge, kernel, stride=1, padding=1)
38
+ edge[edge!=0] = 1
39
+ edge = edge.squeeze()
40
+ return edge
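`generate_edge_tensor` marks pixels where neighbouring labels differ (ignoring the 255 ignore-label) and then thickens the boundary with a small all-ones convolution. A toy sketch of the expected behaviour (requires a CUDA device, since the function allocates CUDA tensors internally):

```python
import torch

# Two flat regions split down the middle of a 6x6 label map.
label = torch.zeros(6, 6, dtype=torch.long)
label[:, 3:] = 1

edge = generate_edge_tensor(label.cuda(), edge_width=3)
# edge is a 6x6 tensor of 0s and 1s, with 1s along (and around)
# the boundary between the two classes.
print(edge.cpu())
```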
preprocess/humanparsing/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
2
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3
+ from .misc import GlobalAvgPool2d, SingleGPU
4
+ from .residual import IdentityResidualBlock
5
+ from .dense import DenseModule
preprocess/humanparsing/modules/bn.py ADDED
@@ -0,0 +1,132 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
+ class ABN(nn.Module):
14
+ """Activated Batch Normalization
15
+
16
+ This gathers a `BatchNorm2d` and an activation function in a single module
17
+ """
18
+
19
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
20
+ """Creates an Activated Batch Normalization module
21
+
22
+ Parameters
23
+ ----------
24
+ num_features : int
25
+ Number of feature channels in the input and output.
26
+ eps : float
27
+ Small constant to prevent numerical issues.
28
+ momentum : float
29
+ Momentum factor applied to compute running statistics as.
30
+ affine : bool
31
+ If `True` apply learned scale and shift transformation after normalization.
32
+ activation : str
33
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
34
+ slope : float
35
+ Negative slope for the `leaky_relu` activation.
36
+ """
37
+ super(ABN, self).__init__()
38
+ self.num_features = num_features
39
+ self.affine = affine
40
+ self.eps = eps
41
+ self.momentum = momentum
42
+ self.activation = activation
43
+ self.slope = slope
44
+ if self.affine:
45
+ self.weight = nn.Parameter(torch.ones(num_features))
46
+ self.bias = nn.Parameter(torch.zeros(num_features))
47
+ else:
48
+ self.register_parameter('weight', None)
49
+ self.register_parameter('bias', None)
50
+ self.register_buffer('running_mean', torch.zeros(num_features))
51
+ self.register_buffer('running_var', torch.ones(num_features))
52
+ self.reset_parameters()
53
+
54
+ def reset_parameters(self):
55
+ nn.init.constant_(self.running_mean, 0)
56
+ nn.init.constant_(self.running_var, 1)
57
+ if self.affine:
58
+ nn.init.constant_(self.weight, 1)
59
+ nn.init.constant_(self.bias, 0)
60
+
61
+ def forward(self, x):
62
+ x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
63
+ self.training, self.momentum, self.eps)
64
+
65
+ if self.activation == ACT_RELU:
66
+ return functional.relu(x, inplace=True)
67
+ elif self.activation == ACT_LEAKY_RELU:
68
+ return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
69
+ elif self.activation == ACT_ELU:
70
+ return functional.elu(x, inplace=True)
71
+ else:
72
+ return x
73
+
74
+ def __repr__(self):
75
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
76
+ ' affine={affine}, activation={activation}'
77
+ if self.activation == "leaky_relu":
78
+ rep += ', slope={slope})'
79
+ else:
80
+ rep += ')'
81
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
+ class InPlaceABN(ABN):
85
+ """InPlace Activated Batch Normalization"""
86
+
87
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
88
+ """Creates an InPlace Activated Batch Normalization module
89
+
90
+ Parameters
91
+ ----------
92
+ num_features : int
93
+ Number of feature channels in the input and output.
94
+ eps : float
95
+ Small constant to prevent numerical issues.
96
+ momentum : float
97
+ Momentum factor applied to compute running statistics as.
98
+ affine : bool
99
+ If `True` apply learned scale and shift transformation after normalization.
100
+ activation : str
101
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
102
+ slope : float
103
+ Negative slope for the `leaky_relu` activation.
104
+ """
105
+ super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
106
+
107
+ def forward(self, x):
108
+ x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
109
+ self.training, self.momentum, self.eps, self.activation, self.slope)
110
+ return x
111
+
112
+
113
+ class InPlaceABNSync(ABN):
114
+ """InPlace Activated Batch Normalization with cross-GPU synchronization
115
+ This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
116
+ """
117
+
118
+ def forward(self, x):
119
+ x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
120
+ self.training, self.momentum, self.eps, self.activation, self.slope)
121
+ return x
122
+
123
+ def __repr__(self):
124
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
125
+ ' affine={affine}, activation={activation}'
126
+ if self.activation == "leaky_relu":
127
+ rep += ', slope={slope})'
128
+ else:
129
+ rep += ')'
130
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
131
+
132
+
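`ABN` itself is plain PyTorch (BatchNorm2d followed by the chosen activation), so it slots in as a drop-in replacement for a BatchNorm + activation pair; note that importing this module still triggers the JIT build of the inplace_abn extension through `from .functions import *`. A small sketch:

```python
import torch
import torch.nn as nn

# Drop-in replacement for BatchNorm2d + LeakyReLU.
block = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1, bias=False),
    ABN(16, activation="leaky_relu", slope=0.01),
)
out = block(torch.randn(2, 3, 32, 32))
print(out.shape)  # torch.Size([2, 16, 32, 32])
```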
preprocess/humanparsing/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
+ class DeeplabV3(nn.Module):
10
+ def __init__(self,
11
+ in_channels,
12
+ out_channels,
13
+ hidden_channels=256,
14
+ dilations=(12, 24, 36),
15
+ norm_act=ABN,
16
+ pooling_size=None):
17
+ super(DeeplabV3, self).__init__()
18
+ self.pooling_size = pooling_size
19
+
20
+ self.map_convs = nn.ModuleList([
21
+ nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25
+ ])
26
+ self.map_bn = norm_act(hidden_channels * 4)
27
+
28
+ self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29
+ self.global_pooling_bn = norm_act(hidden_channels)
30
+
31
+ self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32
+ self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33
+ self.red_bn = norm_act(out_channels)
34
+
35
+ self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36
+
37
+ def reset_parameters(self, activation, slope):
38
+ gain = nn.init.calculate_gain(activation, slope)
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ nn.init.xavier_normal_(m.weight.data, gain)
42
+ if hasattr(m, "bias") and m.bias is not None:
43
+ nn.init.constant_(m.bias, 0)
44
+ elif isinstance(m, ABN):
45
+ if hasattr(m, "weight") and m.weight is not None:
46
+ nn.init.constant_(m.weight, 1)
47
+ if hasattr(m, "bias") and m.bias is not None:
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, x):
51
+ # Map convolutions
52
+ out = torch.cat([m(x) for m in self.map_convs], dim=1)
53
+ out = self.map_bn(out)
54
+ out = self.red_conv(out)
55
+
56
+ # Global pooling
57
+ pool = self._global_pooling(x)
58
+ pool = self.global_pooling_conv(pool)
59
+ pool = self.global_pooling_bn(pool)
60
+ pool = self.pool_red_conv(pool)
61
+ if self.training or self.pooling_size is None:
62
+ pool = pool.repeat(1, 1, x.size(2), x.size(3))
63
+
64
+ out += pool
65
+ out = self.red_bn(out)
66
+ return out
67
+
68
+ def _global_pooling(self, x):
69
+ if self.training or self.pooling_size is None:
70
+ pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71
+ pool = pool.view(x.size(0), x.size(1), 1, 1)
72
+ else:
73
+ pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74
+ min(try_index(self.pooling_size, 1), x.shape[3]))
75
+ padding = (
76
+ (pooling_size[1] - 1) // 2,
77
+ (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78
+ (pooling_size[0] - 1) // 2,
79
+ (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80
+ )
81
+
82
+ pool = functional.avg_pool2d(x, pooling_size, stride=1)
83
+ pool = functional.pad(pool, pad=padding, mode="replicate")
84
+ return pool
preprocess/humanparsing/modules/dense.py ADDED
@@ -0,0 +1,42 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
+ class DenseModule(nn.Module):
10
+ def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
11
+ super(DenseModule, self).__init__()
12
+ self.in_channels = in_channels
13
+ self.growth = growth
14
+ self.layers = layers
15
+
16
+ self.convs1 = nn.ModuleList()
17
+ self.convs3 = nn.ModuleList()
18
+ for i in range(self.layers):
19
+ self.convs1.append(nn.Sequential(OrderedDict([
20
+ ("bn", norm_act(in_channels)),
21
+ ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
22
+ ])))
23
+ self.convs3.append(nn.Sequential(OrderedDict([
24
+ ("bn", norm_act(self.growth * bottleneck_factor)),
25
+ ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
26
+ dilation=dilation))
27
+ ])))
28
+ in_channels += self.growth
29
+
30
+ @property
31
+ def out_channels(self):
32
+ return self.in_channels + self.growth * self.layers
33
+
34
+ def forward(self, x):
35
+ inputs = [x]
36
+ for i in range(self.layers):
37
+ x = torch.cat(inputs, dim=1)
38
+ x = self.convs1[i](x)
39
+ x = self.convs3[i](x)
40
+ inputs += [x]
41
+
42
+ return torch.cat(inputs, dim=1)
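`DenseModule` concatenates each layer's `growth` new channels onto its input, so the output width is `in_channels + growth * layers`. A sketch using the plain `ABN` norm defined above:

```python
import torch

dm = DenseModule(64, growth=16, layers=3, norm_act=ABN)
y = dm(torch.randn(1, 64, 28, 28))
print(y.shape, dm.out_channels)  # torch.Size([1, 112, 28, 28]) 112
```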
preprocess/humanparsing/modules/functions.py ADDED
@@ -0,0 +1,245 @@
1
+ import pdb
2
+ from os import path
3
+ import torch
4
+ import torch.distributed as dist
5
+ import torch.autograd as autograd
6
+ import torch.cuda.comm as comm
7
+ from torch.autograd.function import once_differentiable
8
+ from torch.utils.cpp_extension import load
9
+
10
+ _src_path = path.join(path.dirname(path.abspath(__file__)), "src")
11
+ _backend = load(name="inplace_abn",
12
+ extra_cflags=["-O3"],
13
+ sources=[path.join(_src_path, f) for f in [
14
+ "inplace_abn.cpp",
15
+ "inplace_abn_cpu.cpp",
16
+ "inplace_abn_cuda.cu",
17
+ "inplace_abn_cuda_half.cu"
18
+ ]],
19
+ extra_cuda_cflags=["--expt-extended-lambda"])
20
+
21
+ # Activation names
22
+ ACT_RELU = "relu"
23
+ ACT_LEAKY_RELU = "leaky_relu"
24
+ ACT_ELU = "elu"
25
+ ACT_NONE = "none"
26
+
27
+
28
+ def _check(fn, *args, **kwargs):
29
+ success = fn(*args, **kwargs)
30
+ if not success:
31
+ raise RuntimeError("CUDA Error encountered in {}".format(fn))
32
+
33
+
34
+ def _broadcast_shape(x):
35
+ out_size = []
36
+ for i, s in enumerate(x.size()):
37
+ if i != 1:
38
+ out_size.append(1)
39
+ else:
40
+ out_size.append(s)
41
+ return out_size
42
+
43
+
44
+ def _reduce(x):
45
+ if len(x.size()) == 2:
46
+ return x.sum(dim=0)
47
+ else:
48
+ n, c = x.size()[0:2]
49
+ return x.contiguous().view((n, c, -1)).sum(2).sum(0)
50
+
51
+
52
+ def _count_samples(x):
53
+ count = 1
54
+ for i, s in enumerate(x.size()):
55
+ if i != 1:
56
+ count *= s
57
+ return count
58
+
59
+
60
+ def _act_forward(ctx, x):
61
+ if ctx.activation == ACT_LEAKY_RELU:
62
+ _backend.leaky_relu_forward(x, ctx.slope)
63
+ elif ctx.activation == ACT_ELU:
64
+ _backend.elu_forward(x)
65
+ elif ctx.activation == ACT_NONE:
66
+ pass
67
+
68
+
69
+ def _act_backward(ctx, x, dx):
70
+ if ctx.activation == ACT_LEAKY_RELU:
71
+ _backend.leaky_relu_backward(x, dx, ctx.slope)
72
+ elif ctx.activation == ACT_ELU:
73
+ _backend.elu_backward(x, dx)
74
+ elif ctx.activation == ACT_NONE:
75
+ pass
76
+
77
+
78
+ class InPlaceABN(autograd.Function):
79
+ @staticmethod
80
+ def forward(ctx, x, weight, bias, running_mean, running_var,
81
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
82
+ # Save context
83
+ ctx.training = training
84
+ ctx.momentum = momentum
85
+ ctx.eps = eps
86
+ ctx.activation = activation
87
+ ctx.slope = slope
88
+ ctx.affine = weight is not None and bias is not None
89
+
90
+ # Prepare inputs
91
+ count = _count_samples(x)
92
+ x = x.contiguous()
93
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
94
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
95
+
96
+ if ctx.training:
97
+ mean, var = _backend.mean_var(x)
98
+
99
+ # Update running stats
100
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
101
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
102
+
103
+ # Mark in-place modified tensors
104
+ ctx.mark_dirty(x, running_mean, running_var)
105
+ else:
106
+ mean, var = running_mean.contiguous(), running_var.contiguous()
107
+ ctx.mark_dirty(x)
108
+
109
+ # BN forward + activation
110
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
111
+ _act_forward(ctx, x)
112
+
113
+ # Output
114
+ ctx.var = var
115
+ ctx.save_for_backward(x, var, weight, bias)
116
+ ctx.mark_non_differentiable(running_mean, running_var)
117
+ return x, running_mean, running_var
118
+
119
+ @staticmethod
120
+ @once_differentiable
121
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
122
+ z, var, weight, bias = ctx.saved_tensors
123
+ dz = dz.contiguous()
124
+
125
+ # Undo activation
126
+ _act_backward(ctx, z, dz)
127
+
128
+ if ctx.training:
129
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
130
+ else:
131
+ # TODO: implement simplified CUDA backward for inference mode
132
+ edz = dz.new_zeros(dz.size(1))
133
+ eydz = dz.new_zeros(dz.size(1))
134
+
135
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
136
+ # dweight = eydz * weight.sign() if ctx.affine else None
137
+ dweight = eydz if ctx.affine else None
138
+ if dweight is not None:
139
+ dweight[weight < 0] *= -1
140
+ dbias = edz if ctx.affine else None
141
+
142
+ return dx, dweight, dbias, None, None, None, None, None, None, None
143
+
144
+
145
+ class InPlaceABNSync(autograd.Function):
146
+ @classmethod
147
+ def forward(cls, ctx, x, weight, bias, running_mean, running_var,
148
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
149
+ # Save context
150
+ ctx.training = training
151
+ ctx.momentum = momentum
152
+ ctx.eps = eps
153
+ ctx.activation = activation
154
+ ctx.slope = slope
155
+ ctx.affine = weight is not None and bias is not None
156
+
157
+ # Prepare inputs
158
+ ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
159
+
160
+ # count = _count_samples(x)
161
+ batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
162
+
163
+ x = x.contiguous()
164
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
165
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
166
+
167
+ if ctx.training:
168
+ mean, var = _backend.mean_var(x)
169
+ if ctx.world_size > 1:
170
+ # get global batch size
171
+ if equal_batches:
172
+ batch_size *= ctx.world_size
173
+ else:
174
+ dist.all_reduce(batch_size, dist.ReduceOp.SUM)
175
+
176
+ ctx.factor = x.shape[0] / float(batch_size.item())
177
+
178
+ mean_all = mean.clone() * ctx.factor
179
+ dist.all_reduce(mean_all, dist.ReduceOp.SUM)
180
+
181
+ var_all = (var + (mean - mean_all) ** 2) * ctx.factor
182
+ dist.all_reduce(var_all, dist.ReduceOp.SUM)
183
+
184
+ mean = mean_all
185
+ var = var_all
186
+
187
+ # Update running stats
188
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
189
+ count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
190
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
191
+
192
+ # Mark in-place modified tensors
193
+ ctx.mark_dirty(x, running_mean, running_var)
194
+ else:
195
+ mean, var = running_mean.contiguous(), running_var.contiguous()
196
+ ctx.mark_dirty(x)
197
+
198
+ # BN forward + activation
199
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
200
+ _act_forward(ctx, x)
201
+
202
+ # Output
203
+ ctx.var = var
204
+ ctx.save_for_backward(x, var, weight, bias)
205
+ ctx.mark_non_differentiable(running_mean, running_var)
206
+ return x, running_mean, running_var
207
+
208
+ @staticmethod
209
+ @once_differentiable
210
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
211
+ z, var, weight, bias = ctx.saved_tensors
212
+ dz = dz.contiguous()
213
+
214
+ # Undo activation
215
+ _act_backward(ctx, z, dz)
216
+
217
+ if ctx.training:
218
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
219
+ edz_local = edz.clone()
220
+ eydz_local = eydz.clone()
221
+
222
+ if ctx.world_size > 1:
223
+ edz *= ctx.factor
224
+ dist.all_reduce(edz, dist.ReduceOp.SUM)
225
+
226
+ eydz *= ctx.factor
227
+ dist.all_reduce(eydz, dist.ReduceOp.SUM)
228
+ else:
229
+ edz_local = edz = dz.new_zeros(dz.size(1))
230
+ eydz_local = eydz = dz.new_zeros(dz.size(1))
231
+
232
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
233
+ # dweight = eydz_local * weight.sign() if ctx.affine else None
234
+ dweight = eydz_local if ctx.affine else None
235
+ if dweight is not None:
236
+ dweight[weight < 0] *= -1
237
+ dbias = edz_local if ctx.affine else None
238
+
239
+ return dx, dweight, dbias, None, None, None, None, None, None, None
240
+
241
+
242
+ inplace_abn = InPlaceABN.apply
243
+ inplace_abn_sync = InPlaceABNSync.apply
244
+
245
+ __all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
preprocess/humanparsing/modules/misc.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ class GlobalAvgPool2d(nn.Module):
6
+ def __init__(self):
7
+ """Global average pooling over the input's spatial dimensions"""
8
+ super(GlobalAvgPool2d, self).__init__()
9
+
10
+ def forward(self, inputs):
11
+ in_size = inputs.size()
12
+ return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
13
+
14
+ class SingleGPU(nn.Module):
15
+ def __init__(self, module):
16
+ super(SingleGPU, self).__init__()
17
+ self.module=module
18
+
19
+ def forward(self, input):
20
+ return self.module(input.cuda(non_blocking=True))
21
+
preprocess/humanparsing/modules/residual.py ADDED
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
6
+ import torch.nn.functional as functional
7
+
8
+
9
+ class ResidualBlock(nn.Module):
10
+ """Configurable residual block
11
+
12
+ Parameters
13
+ ----------
14
+ in_channels : int
15
+ Number of input channels.
16
+ channels : list of int
17
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
18
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
19
+ `3 x 3` then `1 x 1` convolutions.
20
+ stride : int
21
+ Stride of the first `3 x 3` convolution
22
+ dilation : int
23
+ Dilation to apply to the `3 x 3` convolutions.
24
+ groups : int
25
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26
+ bottleneck blocks.
27
+ norm_act : callable
28
+ Function to create normalization / activation Module.
29
+ dropout: callable
30
+ Function to create Dropout Module.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ channels,
36
+ stride=1,
37
+ dilation=1,
38
+ groups=1,
39
+ norm_act=ABN,
40
+ dropout=None):
41
+ super(ResidualBlock, self).__init__()
42
+
43
+ # Check parameters for inconsistencies
44
+ if len(channels) != 2 and len(channels) != 3:
45
+ raise ValueError("channels must contain either two or three values")
46
+ if len(channels) == 2 and groups != 1:
47
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
48
+
49
+ is_bottleneck = len(channels) == 3
50
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
51
+
52
+ if not is_bottleneck:
53
+ bn2 = norm_act(channels[1])
54
+ bn2.activation = ACT_NONE
55
+ layers = [
56
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57
+ dilation=dilation)),
58
+ ("bn1", norm_act(channels[0])),
59
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60
+ dilation=dilation)),
61
+ ("bn2", bn2)
62
+ ]
63
+ if dropout is not None:
64
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65
+ else:
66
+ bn3 = norm_act(channels[2])
67
+ bn3.activation = ACT_NONE
68
+ layers = [
69
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70
+ ("bn1", norm_act(channels[0])),
71
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72
+ groups=groups, dilation=dilation)),
73
+ ("bn2", norm_act(channels[1])),
74
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75
+ ("bn3", bn3)
76
+ ]
77
+ if dropout is not None:
78
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79
+ self.convs = nn.Sequential(OrderedDict(layers))
80
+
81
+ if need_proj_conv:
82
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83
+ self.proj_bn = norm_act(channels[-1])
84
+ self.proj_bn.activation = ACT_NONE
85
+
86
+ def forward(self, x):
87
+ if hasattr(self, "proj_conv"):
88
+ residual = self.proj_conv(x)
89
+ residual = self.proj_bn(residual)
90
+ else:
91
+ residual = x
92
+ x = self.convs(x) + residual
93
+
94
+ if self.convs.bn1.activation == ACT_LEAKY_RELU:
95
+ return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96
+ elif self.convs.bn1.activation == ACT_ELU:
97
+ return functional.elu(x, inplace=True)
98
+ else:
99
+ return x
100
+
101
+
102
+ class IdentityResidualBlock(nn.Module):
103
+ def __init__(self,
104
+ in_channels,
105
+ channels,
106
+ stride=1,
107
+ dilation=1,
108
+ groups=1,
109
+ norm_act=ABN,
110
+ dropout=None):
111
+ """Configurable identity-mapping residual block
112
+
113
+ Parameters
114
+ ----------
115
+ in_channels : int
116
+ Number of input channels.
117
+ channels : list of int
118
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
119
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120
+ `3 x 3` then `1 x 1` convolutions.
121
+ stride : int
122
+ Stride of the first `3 x 3` convolution
123
+ dilation : int
124
+ Dilation to apply to the `3 x 3` convolutions.
125
+ groups : int
126
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127
+ bottleneck blocks.
128
+ norm_act : callable
129
+ Function to create normalization / activation Module.
130
+ dropout: callable
131
+ Function to create Dropout Module.
132
+ """
133
+ super(IdentityResidualBlock, self).__init__()
134
+
135
+ # Check parameters for inconsistencies
136
+ if len(channels) != 2 and len(channels) != 3:
137
+ raise ValueError("channels must contain either two or three values")
138
+ if len(channels) == 2 and groups != 1:
139
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
140
+
141
+ is_bottleneck = len(channels) == 3
142
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
143
+
144
+ self.bn1 = norm_act(in_channels)
145
+ if not is_bottleneck:
146
+ layers = [
147
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148
+ dilation=dilation)),
149
+ ("bn2", norm_act(channels[0])),
150
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151
+ dilation=dilation))
152
+ ]
153
+ if dropout is not None:
154
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155
+ else:
156
+ layers = [
157
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158
+ ("bn2", norm_act(channels[0])),
159
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160
+ groups=groups, dilation=dilation)),
161
+ ("bn3", norm_act(channels[1])),
162
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163
+ ]
164
+ if dropout is not None:
165
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166
+ self.convs = nn.Sequential(OrderedDict(layers))
167
+
168
+ if need_proj_conv:
169
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170
+
171
+ def forward(self, x):
172
+ if hasattr(self, "proj_conv"):
173
+ bn1 = self.bn1(x)
174
+ shortcut = self.proj_conv(bn1)
175
+ else:
176
+ shortcut = x.clone()
177
+ bn1 = self.bn1(x)
178
+
179
+ out = self.convs(bn1)
180
+ out.add_(shortcut)
181
+
182
+ return out
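`IdentityResidualBlock` with three channel values builds the pre-activation bottleneck variant; a 1x1 projection shortcut is added automatically whenever the stride or channel count changes. Sketch (again with the plain `ABN` norm):

```python
import torch

# 64 -> 128 channels forces the projection shortcut.
block = IdentityResidualBlock(64, [32, 32, 128], stride=1, norm_act=ABN)
out = block(torch.randn(1, 64, 56, 56))
print(out.shape)  # torch.Size([1, 128, 56, 56])
```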
preprocess/humanparsing/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
preprocess/humanparsing/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
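
These bindings are what the Python side imports as the in-place ABN backend. As a hedged sketch (the extension name, source paths, and compiler flag below are assumptions based on this commit's layout and on how the upstream inplace_abn project builds its backend), the sources can be JIT-compiled with `torch.utils.cpp_extension.load`:

```python
import os
from torch.utils.cpp_extension import load

_SRC = "preprocess/humanparsing/modules/src"  # assumed path within this repo
_backend = load(
    name="inplace_abn",
    sources=[os.path.join(_SRC, f) for f in (
        "inplace_abn.cpp",
        "inplace_abn_cpu.cpp",
        "inplace_abn_cuda.cu",
        "inplace_abn_cuda_half.cu",
    )],
    extra_cuda_cflags=["--expt-extended-lambda"],  # needed for the thrust device lambdas
)

# The compiled module exposes exactly the functions registered above:
# _backend.mean_var(x), _backend.forward(...), _backend.edz_eydz(...),
# _backend.backward(...), _backend.leaky_relu_forward(...), _backend.elu_forward(...)
```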
preprocess/humanparsing/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
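
Everything declared above reduces per channel: `get_dims` folds a tensor of shape `(N, C, *spatial)` into `num`, `chn`, `sp`, and the `reduce` helper sums an operator over the batch and spatial positions of one channel plane. A plain-PyTorch reference for the same statistics and normalization, handy for numerically checking the kernels (an illustration, not part of the extension):

```python
import torch

def mean_var_reference(x: torch.Tensor):
    # Per-channel mean/variance over batch and spatial dims, biased like the kernels.
    dims = [0] + list(range(2, x.dim()))
    return x.mean(dim=dims), x.var(dim=dims, unbiased=False)

def forward_reference(x, mean, var, weight, bias, affine=True, eps=1e-5):
    # Mirrors forward_cpu/forward_cuda: scale by rsqrt(var + eps) * (|weight| + eps), then shift.
    shape = [1, -1] + [1] * (x.dim() - 2)
    gamma = weight.abs() + eps if affine else torch.ones_like(var)
    y = (x - mean.view(shape)) * (torch.rsqrt(var + eps) * gamma).view(shape)
    return y + bias.view(shape) if affine else y
```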
preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
preprocess/humanparsing/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
100
+ return {mean, var};
101
+ }
102
+
103
+ /**********
104
+ * forward
105
+ **********/
106
+
107
+ template<typename T>
108
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109
+ bool affine, float eps, int num, int chn, int sp) {
110
+ int plane = blockIdx.x;
111
+
112
+ T _mean = mean[plane];
113
+ T _var = var[plane];
114
+ T _weight = affine ? abs(weight[plane]) + eps : T(1);
115
+ T _bias = affine ? bias[plane] : T(0);
116
+
117
+ T mul = rsqrt(_var + eps) * _weight;
118
+
119
+ for (int batch = 0; batch < num; ++batch) {
120
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121
+ T _x = x[(batch * chn + plane) * sp + n];
122
+ T _y = (_x - _mean) * mul + _bias;
123
+
124
+ x[(batch * chn + plane) * sp + n] = _y;
125
+ }
126
+ }
127
+ }
128
+
129
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130
+ bool affine, float eps) {
131
+ CHECK_CUDA_INPUT(x);
132
+ CHECK_CUDA_INPUT(mean);
133
+ CHECK_CUDA_INPUT(var);
134
+ CHECK_CUDA_INPUT(weight);
135
+ CHECK_CUDA_INPUT(bias);
136
+
137
+ // Extract dimensions
138
+ int64_t num, chn, sp;
139
+ get_dims(x, num, chn, sp);
140
+
141
+ // Run kernel
142
+ dim3 blocks(chn);
143
+ dim3 threads(getNumThreads(sp));
144
+ auto stream = at::cuda::getCurrentCUDAStream();
145
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146
+ forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147
+ x.data<scalar_t>(),
148
+ mean.data<scalar_t>(),
149
+ var.data<scalar_t>(),
150
+ weight.data<scalar_t>(),
151
+ bias.data<scalar_t>(),
152
+ affine, eps, num, chn, sp);
153
+ }));
154
+
155
+ return x;
156
+ }
157
+
158
+ /***********
159
+ * edz_eydz
160
+ ***********/
161
+
162
+ template<typename T>
163
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164
+ T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165
+ int plane = blockIdx.x;
166
+
167
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168
+ T _bias = affine ? bias[plane] : 0.f;
169
+
170
+ Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171
+ __syncthreads();
172
+
173
+ if (threadIdx.x == 0) {
174
+ edz[plane] = res.v1;
175
+ eydz[plane] = res.v2;
176
+ }
177
+ }
178
+
179
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180
+ bool affine, float eps) {
181
+ CHECK_CUDA_INPUT(z);
182
+ CHECK_CUDA_INPUT(dz);
183
+ CHECK_CUDA_INPUT(weight);
184
+ CHECK_CUDA_INPUT(bias);
185
+
186
+ // Extract dimensions
187
+ int64_t num, chn, sp;
188
+ get_dims(z, num, chn, sp);
189
+
190
+ auto edz = at::empty({chn}, z.options());
191
+ auto eydz = at::empty({chn}, z.options());
192
+
193
+ // Run kernel
194
+ dim3 blocks(chn);
195
+ dim3 threads(getNumThreads(sp));
196
+ auto stream = at::cuda::getCurrentCUDAStream();
197
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198
+ edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199
+ z.data<scalar_t>(),
200
+ dz.data<scalar_t>(),
201
+ weight.data<scalar_t>(),
202
+ bias.data<scalar_t>(),
203
+ edz.data<scalar_t>(),
204
+ eydz.data<scalar_t>(),
205
+ affine, eps, num, chn, sp);
206
+ }));
207
+
208
+ return {edz, eydz};
209
+ }
210
+
211
+ /***********
212
+ * backward
213
+ ***********/
214
+
215
+ template<typename T>
216
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217
+ const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218
+ int plane = blockIdx.x;
219
+
220
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221
+ T _bias = affine ? bias[plane] : 0.f;
222
+ T _var = var[plane];
223
+ T _edz = edz[plane];
224
+ T _eydz = eydz[plane];
225
+
226
+ T _mul = _weight * rsqrt(_var + eps);
227
+ T count = T(num * sp);
228
+
229
+ for (int batch = 0; batch < num; ++batch) {
230
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231
+ T _dz = dz[(batch * chn + plane) * sp + n];
232
+ T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233
+
234
+ dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235
+ }
236
+ }
237
+ }
238
+
239
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241
+ CHECK_CUDA_INPUT(z);
242
+ CHECK_CUDA_INPUT(dz);
243
+ CHECK_CUDA_INPUT(var);
244
+ CHECK_CUDA_INPUT(weight);
245
+ CHECK_CUDA_INPUT(bias);
246
+ CHECK_CUDA_INPUT(edz);
247
+ CHECK_CUDA_INPUT(eydz);
248
+
249
+ // Extract dimensions
250
+ int64_t num, chn, sp;
251
+ get_dims(z, num, chn, sp);
252
+
253
+ auto dx = at::zeros_like(z);
254
+
255
+ // Run kernel
256
+ dim3 blocks(chn);
257
+ dim3 threads(getNumThreads(sp));
258
+ auto stream = at::cuda::getCurrentCUDAStream();
259
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260
+ backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261
+ z.data<scalar_t>(),
262
+ dz.data<scalar_t>(),
263
+ var.data<scalar_t>(),
264
+ weight.data<scalar_t>(),
265
+ bias.data<scalar_t>(),
266
+ edz.data<scalar_t>(),
267
+ eydz.data<scalar_t>(),
268
+ dx.data<scalar_t>(),
269
+ affine, eps, num, chn, sp);
270
+ }));
271
+
272
+ return dx;
273
+ }
274
+
275
+ /**************
276
+ * activations
277
+ **************/
278
+
279
+ template<typename T>
280
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281
+ // Create thrust pointers
282
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284
+
285
+ auto stream = at::cuda::getCurrentCUDAStream();
286
+ thrust::transform_if(thrust::cuda::par.on(stream),
287
+ th_dz, th_dz + count, th_z, th_dz,
288
+ [slope] __device__ (const T& dz) { return dz * slope; },
289
+ [] __device__ (const T& z) { return z < 0; });
290
+ thrust::transform_if(thrust::cuda::par.on(stream),
291
+ th_z, th_z + count, th_z,
292
+ [slope] __device__ (const T& z) { return z / slope; },
293
+ [] __device__ (const T& z) { return z < 0; });
294
+ }
295
+
296
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297
+ CHECK_CUDA_INPUT(z);
298
+ CHECK_CUDA_INPUT(dz);
299
+
300
+ int64_t count = z.numel();
301
+
302
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303
+ leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
304
+ }));
305
+ }
306
+
307
+ template<typename T>
308
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309
+ // Create thrust pointers
310
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312
+
313
+ auto stream = at::cuda::getCurrentCUDAStream();
314
+ thrust::transform_if(thrust::cuda::par.on(stream),
315
+ th_dz, th_dz + count, th_z, th_z, th_dz,
316
+ [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317
+ [] __device__ (const T& z) { return z < 0; });
318
+ thrust::transform_if(thrust::cuda::par.on(stream),
319
+ th_z, th_z + count, th_z,
320
+ [] __device__ (const T& z) { return log1p(z); },
321
+ [] __device__ (const T& z) { return z < 0; });
322
+ }
323
+
324
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325
+ CHECK_CUDA_INPUT(z);
326
+ CHECK_CUDA_INPUT(dz);
327
+
328
+ int64_t count = z.numel();
329
+
330
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331
+ elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
332
+ }));
333
+ }
preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda_fp16.h>
4
+
5
+ #include <vector>
6
+
7
+ #include "utils/checks.h"
8
+ #include "utils/cuda.cuh"
9
+ #include "inplace_abn.h"
10
+
11
+ #include <ATen/cuda/CUDAContext.h>
12
+
13
+ // Operations for reduce
14
+ struct SumOpH {
15
+ __device__ SumOpH(const half *t, int c, int s)
16
+ : tensor(t), chn(c), sp(s) {}
17
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
18
+ return __half2float(tensor[(batch * chn + plane) * sp + n]);
19
+ }
20
+ const half *tensor;
21
+ const int chn;
22
+ const int sp;
23
+ };
24
+
25
+ struct VarOpH {
26
+ __device__ VarOpH(float m, const half *t, int c, int s)
27
+ : mean(m), tensor(t), chn(c), sp(s) {}
28
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
29
+ const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
30
+ return (t - mean) * (t - mean);
31
+ }
32
+ const float mean;
33
+ const half *tensor;
34
+ const int chn;
35
+ const int sp;
36
+ };
37
+
38
+ struct GradOpH {
39
+ __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
40
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
41
+ __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
42
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
43
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
44
+ return Pair<float>(_dz, _y * _dz);
45
+ }
46
+ const float weight;
47
+ const float bias;
48
+ const half *z;
49
+ const half *dz;
50
+ const int chn;
51
+ const int sp;
52
+ };
53
+
54
+ /***********
55
+ * mean_var
56
+ ***********/
57
+
58
+ __global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
59
+ int plane = blockIdx.x;
60
+ float norm = 1.f / static_cast<float>(num * sp);
61
+
62
+ float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
63
+ __syncthreads();
64
+ float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
65
+
66
+ if (threadIdx.x == 0) {
67
+ mean[plane] = _mean;
68
+ var[plane] = _var;
69
+ }
70
+ }
71
+
72
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
73
+ CHECK_CUDA_INPUT(x);
74
+
75
+ // Extract dimensions
76
+ int64_t num, chn, sp;
77
+ get_dims(x, num, chn, sp);
78
+
79
+ // Prepare output tensors
80
+ auto mean = at::empty({chn},x.options().dtype(at::kFloat));
81
+ auto var = at::empty({chn},x.options().dtype(at::kFloat));
82
+
83
+ // Run kernel
84
+ dim3 blocks(chn);
85
+ dim3 threads(getNumThreads(sp));
86
+ auto stream = at::cuda::getCurrentCUDAStream();
87
+ mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
88
+ reinterpret_cast<half*>(x.data<at::Half>()),
89
+ mean.data<float>(),
90
+ var.data<float>(),
91
+ num, chn, sp);
92
+
93
+ return {mean, var};
94
+ }
95
+
96
+ /**********
97
+ * forward
98
+ **********/
99
+
100
+ __global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
101
+ bool affine, float eps, int num, int chn, int sp) {
102
+ int plane = blockIdx.x;
103
+
104
+ const float _mean = mean[plane];
105
+ const float _var = var[plane];
106
+ const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
107
+ const float _bias = affine ? bias[plane] : 0.f;
108
+
109
+ const float mul = rsqrt(_var + eps) * _weight;
110
+
111
+ for (int batch = 0; batch < num; ++batch) {
112
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
113
+ half *x_ptr = x + (batch * chn + plane) * sp + n;
114
+ float _x = __half2float(*x_ptr);
115
+ float _y = (_x - _mean) * mul + _bias;
116
+
117
+ *x_ptr = __float2half(_y);
118
+ }
119
+ }
120
+ }
121
+
122
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
123
+ bool affine, float eps) {
124
+ CHECK_CUDA_INPUT(x);
125
+ CHECK_CUDA_INPUT(mean);
126
+ CHECK_CUDA_INPUT(var);
127
+ CHECK_CUDA_INPUT(weight);
128
+ CHECK_CUDA_INPUT(bias);
129
+
130
+ // Extract dimensions
131
+ int64_t num, chn, sp;
132
+ get_dims(x, num, chn, sp);
133
+
134
+ // Run kernel
135
+ dim3 blocks(chn);
136
+ dim3 threads(getNumThreads(sp));
137
+ auto stream = at::cuda::getCurrentCUDAStream();
138
+ forward_kernel_h<<<blocks, threads, 0, stream>>>(
139
+ reinterpret_cast<half*>(x.data<at::Half>()),
140
+ mean.data<float>(),
141
+ var.data<float>(),
142
+ weight.data<float>(),
143
+ bias.data<float>(),
144
+ affine, eps, num, chn, sp);
145
+
146
+ return x;
147
+ }
148
+
149
+ __global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
150
+ float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
151
+ int plane = blockIdx.x;
152
+
153
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
154
+ float _bias = affine ? bias[plane] : 0.f;
155
+
156
+ Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
157
+ __syncthreads();
158
+
159
+ if (threadIdx.x == 0) {
160
+ edz[plane] = res.v1;
161
+ eydz[plane] = res.v2;
162
+ }
163
+ }
164
+
165
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
166
+ bool affine, float eps) {
167
+ CHECK_CUDA_INPUT(z);
168
+ CHECK_CUDA_INPUT(dz);
169
+ CHECK_CUDA_INPUT(weight);
170
+ CHECK_CUDA_INPUT(bias);
171
+
172
+ // Extract dimensions
173
+ int64_t num, chn, sp;
174
+ get_dims(z, num, chn, sp);
175
+
176
+ auto edz = at::empty({chn},z.options().dtype(at::kFloat));
177
+ auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
178
+
179
+ // Run kernel
180
+ dim3 blocks(chn);
181
+ dim3 threads(getNumThreads(sp));
182
+ auto stream = at::cuda::getCurrentCUDAStream();
183
+ edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
184
+ reinterpret_cast<half*>(z.data<at::Half>()),
185
+ reinterpret_cast<half*>(dz.data<at::Half>()),
186
+ weight.data<float>(),
187
+ bias.data<float>(),
188
+ edz.data<float>(),
189
+ eydz.data<float>(),
190
+ affine, eps, num, chn, sp);
191
+
192
+ return {edz, eydz};
193
+ }
194
+
195
+ __global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
196
+ const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
197
+ int plane = blockIdx.x;
198
+
199
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
200
+ float _bias = affine ? bias[plane] : 0.f;
201
+ float _var = var[plane];
202
+ float _edz = edz[plane];
203
+ float _eydz = eydz[plane];
204
+
205
+ float _mul = _weight * rsqrt(_var + eps);
206
+ float count = float(num * sp);
207
+
208
+ for (int batch = 0; batch < num; ++batch) {
209
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
210
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
211
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
212
+
213
+ dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
214
+ }
215
+ }
216
+ }
217
+
218
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
219
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
220
+ CHECK_CUDA_INPUT(z);
221
+ CHECK_CUDA_INPUT(dz);
222
+ CHECK_CUDA_INPUT(var);
223
+ CHECK_CUDA_INPUT(weight);
224
+ CHECK_CUDA_INPUT(bias);
225
+ CHECK_CUDA_INPUT(edz);
226
+ CHECK_CUDA_INPUT(eydz);
227
+
228
+ // Extract dimensions
229
+ int64_t num, chn, sp;
230
+ get_dims(z, num, chn, sp);
231
+
232
+ auto dx = at::zeros_like(z);
233
+
234
+ // Run kernel
235
+ dim3 blocks(chn);
236
+ dim3 threads(getNumThreads(sp));
237
+ auto stream = at::cuda::getCurrentCUDAStream();
238
+ backward_kernel_h<<<blocks, threads, 0, stream>>>(
239
+ reinterpret_cast<half*>(z.data<at::Half>()),
240
+ reinterpret_cast<half*>(dz.data<at::Half>()),
241
+ var.data<float>(),
242
+ weight.data<float>(),
243
+ bias.data<float>(),
244
+ edz.data<float>(),
245
+ eydz.data<float>(),
246
+ reinterpret_cast<half*>(dx.data<at::Half>()),
247
+ affine, eps, num, chn, sp);
248
+
249
+ return dx;
250
+ }
251
+
252
+ __global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
253
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
254
+ float _z = __half2float(z[i]);
255
+ if (_z < 0) {
256
+ dz[i] = __float2half(__half2float(dz[i]) * slope);
257
+ z[i] = __float2half(_z / slope);
258
+ }
259
+ }
260
+ }
261
+
262
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
263
+ CHECK_CUDA_INPUT(z);
264
+ CHECK_CUDA_INPUT(dz);
265
+
266
+ int64_t count = z.numel();
267
+ dim3 threads(getNumThreads(count));
268
+ dim3 blocks = (count + threads.x - 1) / threads.x;
269
+ auto stream = at::cuda::getCurrentCUDAStream();
270
+ leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
271
+ reinterpret_cast<half*>(z.data<at::Half>()),
272
+ reinterpret_cast<half*>(dz.data<at::Half>()),
273
+ slope, count);
274
+ }
275
+
preprocess/humanparsing/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
preprocess/humanparsing/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ /*
6
+ * Functions to share code between CPU and GPU
7
+ */
8
+
9
+ #ifdef __CUDACC__
10
+ // CUDA versions
11
+
12
+ #define HOST_DEVICE __host__ __device__
13
+ #define INLINE_HOST_DEVICE __host__ __device__ inline
14
+ #define FLOOR(x) floor(x)
15
+
16
+ #if __CUDA_ARCH__ >= 600
17
+ // Recent compute capabilities have block-level atomicAdd for all data types, so we use that
18
+ #define ACCUM(x,y) atomicAdd_block(&(x),(y))
19
+ #else
20
+ // Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
21
+ // and use the known atomicCAS-based implementation for double
22
+ template<typename data_t>
23
+ __device__ inline data_t atomic_add(data_t *address, data_t val) {
24
+ return atomicAdd(address, val);
25
+ }
26
+
27
+ template<>
28
+ __device__ inline double atomic_add(double *address, double val) {
29
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
30
+ unsigned long long int old = *address_as_ull, assumed;
31
+ do {
32
+ assumed = old;
33
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
34
+ } while (assumed != old);
35
+ return __longlong_as_double(old);
36
+ }
37
+
38
+ #define ACCUM(x,y) atomic_add(&(x),(y))
39
+ #endif // #if __CUDA_ARCH__ >= 600
40
+
41
+ #else
42
+ // CPU versions
43
+
44
+ #define HOST_DEVICE
45
+ #define INLINE_HOST_DEVICE inline
46
+ #define FLOOR(x) std::floor(x)
47
+ #define ACCUM(x,y) (x) += (y)
48
+
49
+ #endif // #ifdef __CUDACC__
preprocess/humanparsing/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
1
+ #pragma once
2
+
3
+ /*
4
+ * General settings and functions
5
+ */
6
+ const int WARP_SIZE = 32;
7
+ const int MAX_BLOCK_SIZE = 1024;
8
+
9
+ static int getNumThreads(int nElem) {
10
+ int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
11
+ for (int i = 0; i < 6; ++i) {
12
+ if (nElem <= threadSizes[i]) {
13
+ return threadSizes[i];
14
+ }
15
+ }
16
+ return MAX_BLOCK_SIZE;
17
+ }
18
+
19
+ /*
20
+ * Reduction utilities
21
+ */
22
+ template <typename T>
23
+ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
24
+ unsigned int mask = 0xffffffff) {
25
+ #if CUDART_VERSION >= 9000
26
+ return __shfl_xor_sync(mask, value, laneMask, width);
27
+ #else
28
+ return __shfl_xor(value, laneMask, width);
29
+ #endif
30
+ }
31
+
32
+ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
33
+
34
+ template<typename T>
35
+ struct Pair {
36
+ T v1, v2;
37
+ __device__ Pair() {}
38
+ __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
39
+ __device__ Pair(T v) : v1(v), v2(v) {}
40
+ __device__ Pair(int v) : v1(v), v2(v) {}
41
+ __device__ Pair &operator+=(const Pair<T> &a) {
42
+ v1 += a.v1;
43
+ v2 += a.v2;
44
+ return *this;
45
+ }
46
+ };
47
+
48
+ template<typename T>
49
+ static __device__ __forceinline__ T warpSum(T val) {
50
+ #if __CUDA_ARCH__ >= 300
51
+ for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
52
+ val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
53
+ }
54
+ #else
55
+ __shared__ T values[MAX_BLOCK_SIZE];
56
+ values[threadIdx.x] = val;
57
+ __threadfence_block();
58
+ const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
59
+ for (int i = 1; i < WARP_SIZE; i++) {
60
+ val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
61
+ }
62
+ #endif
63
+ return val;
64
+ }
65
+
66
+ template<typename T>
67
+ static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
68
+ value.v1 = warpSum(value.v1);
69
+ value.v2 = warpSum(value.v2);
70
+ return value;
71
+ }
preprocess/humanparsing/networks/AugmentCE2P.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : AugmentCE2P.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import pdb
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ # Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
21
+ # By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
22
+ from modules import InPlaceABNSync
23
+ import numpy as np
24
+
25
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
26
+
27
+ affine_par = True
28
+
29
+ pretrained_settings = {
30
+ 'resnet101': {
31
+ 'imagenet': {
32
+ 'input_space': 'BGR',
33
+ 'input_size': [3, 224, 224],
34
+ 'input_range': [0, 1],
35
+ 'mean': [0.406, 0.456, 0.485],
36
+ 'std': [0.225, 0.224, 0.229],
37
+ 'num_classes': 1000
38
+ }
39
+ },
40
+ }
41
+
42
+
43
+ def conv3x3(in_planes, out_planes, stride=1):
44
+ "3x3 convolution with padding"
45
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
46
+ padding=1, bias=False)
47
+
48
+
49
+ class Bottleneck(nn.Module):
50
+ expansion = 4
51
+
52
+ def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
53
+ super(Bottleneck, self).__init__()
54
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
55
+ self.bn1 = BatchNorm2d(planes)
56
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
57
+ padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
58
+ self.bn2 = BatchNorm2d(planes)
59
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
60
+ self.bn3 = BatchNorm2d(planes * 4)
61
+ self.relu = nn.ReLU(inplace=False)
62
+ self.relu_inplace = nn.ReLU(inplace=True)
63
+ self.downsample = downsample
64
+ self.dilation = dilation
65
+ self.stride = stride
66
+
67
+ def forward(self, x):
68
+ residual = x
69
+
70
+ out = self.conv1(x)
71
+ out = self.bn1(out)
72
+ out = self.relu(out)
73
+
74
+ out = self.conv2(out)
75
+ out = self.bn2(out)
76
+ out = self.relu(out)
77
+
78
+ out = self.conv3(out)
79
+ out = self.bn3(out)
80
+
81
+ if self.downsample is not None:
82
+ residual = self.downsample(x)
83
+
84
+ out = out + residual
85
+ out = self.relu_inplace(out)
86
+
87
+ return out
88
+
89
+
90
+ class CostomAdaptiveAvgPool2D(nn.Module):
91
+
92
+ def __init__(self, output_size):
93
+
94
+ super(CostomAdaptiveAvgPool2D, self).__init__()
95
+
96
+ self.output_size = output_size
97
+
98
+ def forward(self, x):
99
+
100
+ H_in, W_in = x.shape[-2:]
101
+ H_out, W_out = self.output_size
102
+
103
+ out_i = []
104
+ for i in range(H_out):
105
+ out_j = []
106
+ for j in range(W_out):
107
+ hs = int(np.floor(i * H_in / H_out))
108
+ he = int(np.ceil((i + 1) * H_in / H_out))
109
+
110
+ ws = int(np.floor(j * W_in / W_out))
111
+ we = int(np.ceil((j + 1) * W_in / W_out))
112
+
113
+ # print(hs, he, ws, we)
114
+ kernel_size = [he - hs, we - ws]
115
+
116
+ out = F.avg_pool2d(x[:, :, hs:he, ws:we], kernel_size)
117
+ out_j.append(out)
118
+
119
+ out_j = torch.concat(out_j, -1)
120
+ out_i.append(out_j)
121
+
122
+ out_i = torch.concat(out_i, -2)
123
+ return out_i
124
+
125
+
126
+ class PSPModule(nn.Module):
127
+ """
128
+ Reference:
129
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
130
+ """
131
+
132
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
133
+ super(PSPModule, self).__init__()
134
+
135
+ self.stages = []
136
+ tmp = []
137
+ for size in sizes:
138
+ if size == 3 or size == 6:
139
+ tmp.append(self._make_stage_custom(features, out_features, size))
140
+ else:
141
+ tmp.append(self._make_stage(features, out_features, size))
142
+ self.stages = nn.ModuleList(tmp)
143
+ # self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
144
+ self.bottleneck = nn.Sequential(
145
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
146
+ bias=False),
147
+ InPlaceABNSync(out_features),
148
+ )
149
+
150
+ def _make_stage(self, features, out_features, size):
151
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
152
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
153
+ bn = InPlaceABNSync(out_features)
154
+ return nn.Sequential(prior, conv, bn)
155
+
156
+ def _make_stage_custom(self, features, out_features, size):
157
+ prior = CostomAdaptiveAvgPool2D(output_size=(size, size))
158
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
159
+ bn = InPlaceABNSync(out_features)
160
+ return nn.Sequential(prior, conv, bn)
161
+
162
+ def forward(self, feats):
163
+ h, w = feats.size(2), feats.size(3)
164
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
165
+ self.stages] + [feats]
166
+ bottle = self.bottleneck(torch.cat(priors, 1))
167
+ return bottle
168
+
169
+
170
+ class ASPPModule(nn.Module):
171
+ """
172
+ Reference:
173
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
174
+ """
175
+
176
+ def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
177
+ super(ASPPModule, self).__init__()
178
+
179
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
180
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
181
+ bias=False),
182
+ InPlaceABNSync(inner_features))
183
+ self.conv2 = nn.Sequential(
184
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
185
+ InPlaceABNSync(inner_features))
186
+ self.conv3 = nn.Sequential(
187
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
188
+ InPlaceABNSync(inner_features))
189
+ self.conv4 = nn.Sequential(
190
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
191
+ InPlaceABNSync(inner_features))
192
+ self.conv5 = nn.Sequential(
193
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
194
+ InPlaceABNSync(inner_features))
195
+
196
+ self.bottleneck = nn.Sequential(
197
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
198
+ InPlaceABNSync(out_features),
199
+ nn.Dropout2d(0.1)
200
+ )
201
+
202
+ def forward(self, x):
203
+ _, _, h, w = x.size()
204
+
205
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
206
+
207
+ feat2 = self.conv2(x)
208
+ feat3 = self.conv3(x)
209
+ feat4 = self.conv4(x)
210
+ feat5 = self.conv5(x)
211
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
212
+
213
+ bottle = self.bottleneck(out)
214
+ return bottle
215
+
216
+
217
+ class Edge_Module(nn.Module):
218
+ """
219
+ Edge Learning Branch
220
+ """
221
+
222
+ def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
223
+ super(Edge_Module, self).__init__()
224
+
225
+ self.conv1 = nn.Sequential(
226
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
227
+ InPlaceABNSync(mid_fea)
228
+ )
229
+ self.conv2 = nn.Sequential(
230
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
231
+ InPlaceABNSync(mid_fea)
232
+ )
233
+ self.conv3 = nn.Sequential(
234
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
235
+ InPlaceABNSync(mid_fea)
236
+ )
237
+ self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
238
+ self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
239
+
240
+ def forward(self, x1, x2, x3):
241
+ _, _, h, w = x1.size()
242
+
243
+ edge1_fea = self.conv1(x1)
244
+ edge1 = self.conv4(edge1_fea)
245
+ edge2_fea = self.conv2(x2)
246
+ edge2 = self.conv4(edge2_fea)
247
+ edge3_fea = self.conv3(x3)
248
+ edge3 = self.conv4(edge3_fea)
249
+
250
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
251
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
252
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
253
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
254
+
255
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
256
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
257
+ edge = self.conv5(edge)
258
+
259
+ return edge, edge_fea
260
+
261
+
262
+ class Decoder_Module(nn.Module):
263
+ """
264
+ Parsing Branch Decoder Module.
265
+ """
266
+
267
+ def __init__(self, num_classes):
268
+ super(Decoder_Module, self).__init__()
269
+ self.conv1 = nn.Sequential(
270
+ nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
271
+ InPlaceABNSync(256)
272
+ )
273
+ self.conv2 = nn.Sequential(
274
+ nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
275
+ InPlaceABNSync(48)
276
+ )
277
+ self.conv3 = nn.Sequential(
278
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
279
+ InPlaceABNSync(256),
280
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
281
+ InPlaceABNSync(256)
282
+ )
283
+
284
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
285
+
286
+ def forward(self, xt, xl):
287
+ _, _, h, w = xl.size()
288
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
289
+ xl = self.conv2(xl)
290
+ x = torch.cat([xt, xl], dim=1)
291
+ x = self.conv3(x)
292
+ seg = self.conv4(x)
293
+ return seg, x
294
+
295
+
296
+ class ResNet(nn.Module):
297
+ def __init__(self, block, layers, num_classes):
298
+ self.inplanes = 128
299
+ super(ResNet, self).__init__()
300
+ self.conv1 = conv3x3(3, 64, stride=2)
301
+ self.bn1 = BatchNorm2d(64)
302
+ self.relu1 = nn.ReLU(inplace=False)
303
+ self.conv2 = conv3x3(64, 64)
304
+ self.bn2 = BatchNorm2d(64)
305
+ self.relu2 = nn.ReLU(inplace=False)
306
+ self.conv3 = conv3x3(64, 128)
307
+ self.bn3 = BatchNorm2d(128)
308
+ self.relu3 = nn.ReLU(inplace=False)
309
+
310
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
311
+
312
+ self.layer1 = self._make_layer(block, 64, layers[0])
313
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
314
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
315
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
316
+
317
+ self.context_encoding = PSPModule(2048, 512)
318
+
319
+ self.edge = Edge_Module()
320
+ self.decoder = Decoder_Module(num_classes)
321
+
322
+ self.fushion = nn.Sequential(
323
+ nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
324
+ InPlaceABNSync(256),
325
+ nn.Dropout2d(0.1),
326
+ nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
327
+ )
328
+
329
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
330
+ downsample = None
331
+ if stride != 1 or self.inplanes != planes * block.expansion:
332
+ downsample = nn.Sequential(
333
+ nn.Conv2d(self.inplanes, planes * block.expansion,
334
+ kernel_size=1, stride=stride, bias=False),
335
+ BatchNorm2d(planes * block.expansion, affine=affine_par))
336
+
337
+ layers = []
338
+ generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
339
+ layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
340
+ multi_grid=generate_multi_grid(0, multi_grid)))
341
+ self.inplanes = planes * block.expansion
342
+ for i in range(1, blocks):
343
+ layers.append(
344
+ block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
345
+
346
+ return nn.Sequential(*layers)
347
+
348
+ def forward(self, x):
349
+ x = self.relu1(self.bn1(self.conv1(x)))
350
+ x = self.relu2(self.bn2(self.conv2(x)))
351
+ x = self.relu3(self.bn3(self.conv3(x)))
352
+ x = self.maxpool(x)
353
+ x2 = self.layer1(x)
354
+ x3 = self.layer2(x2)
355
+ x4 = self.layer3(x3)
356
+ x5 = self.layer4(x4)
357
+ x = self.context_encoding(x5)
358
+ parsing_result, parsing_fea = self.decoder(x, x2)
359
+ # Edge Branch
360
+ edge_result, edge_fea = self.edge(x2, x3, x4)
361
+ # Fusion Branch
362
+ x = torch.cat([parsing_fea, edge_fea], dim=1)
363
+ fusion_result = self.fushion(x)
364
+ return [[parsing_result, fusion_result], edge_result]
365
+
366
+
367
+ def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
368
+ model.input_space = settings['input_space']
369
+ model.input_size = settings['input_size']
370
+ model.input_range = settings['input_range']
371
+ model.mean = settings['mean']
372
+ model.std = settings['std']
373
+
374
+ if pretrained is not None:
375
+ saved_state_dict = torch.load(pretrained)
376
+ new_params = model.state_dict().copy()
377
+ for i in saved_state_dict:
378
+ i_parts = i.split('.')
379
+ if not i_parts[0] == 'fc':
380
+ new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
381
+ model.load_state_dict(new_params)
382
+
383
+
384
+ def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
385
+ model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
386
+ settings = pretrained_settings['resnet101']['imagenet']
387
+ initialize_pretrained_model(model, settings, pretrained)
388
+ return model
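
A quick way to exercise the network defined in this file, assuming the InPlaceABN extension from `modules/` has already been built and a GPU is available; the input resolution below is illustrative, and `pretrained=None` simply skips the ImageNet initialization:

```python
import torch
from networks.AugmentCE2P import resnet101  # assumes preprocess/humanparsing is on sys.path

model = resnet101(num_classes=20, pretrained=None).cuda().eval()

with torch.no_grad():
    x = torch.randn(1, 3, 473, 473, device="cuda")
    (parsing, fusion), edge = model(x)  # [[parsing_result, fusion_result], edge_result]

# parsing/fusion are per-class logits at 1/4 of the input resolution,
# edge is the 2-channel map from the edge branch.
print(parsing.shape, fusion.shape, edge.shape)
```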
preprocess/humanparsing/networks/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
1
+ from __future__ import absolute_import
2
+ from networks.AugmentCE2P import resnet101
3
+
4
+ __factory = {
5
+ 'resnet101': resnet101,
6
+ }
7
+
8
+
9
+ def init_model(name, *args, **kwargs):
10
+ if name not in __factory.keys():
11
+ raise KeyError("Unknown model arch: {}".format(name))
12
+ return __factory[name](*args, **kwargs)
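
`init_model` is a thin lookup over the registry above; extra positional and keyword arguments are forwarded to the selected factory, so the call below is equivalent to invoking `resnet101` directly (argument values are illustrative):

```python
from networks import init_model

model = init_model('resnet101', num_classes=20, pretrained=None)
```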
preprocess/humanparsing/networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : mobilenetv2.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import math
16
+ import functools
17
+
18
+ from modules import InPlaceABN, InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['mobilenetv2']
23
+
24
+
25
+ def conv_bn(inp, oup, stride):
26
+ return nn.Sequential(
27
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
28
+ BatchNorm2d(oup),
29
+ nn.ReLU6(inplace=True)
30
+ )
31
+
32
+
33
+ def conv_1x1_bn(inp, oup):
34
+ return nn.Sequential(
35
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
36
+ BatchNorm2d(oup),
37
+ nn.ReLU6(inplace=True)
38
+ )
39
+
40
+
41
+ class InvertedResidual(nn.Module):
42
+ def __init__(self, inp, oup, stride, expand_ratio):
43
+ super(InvertedResidual, self).__init__()
44
+ self.stride = stride
45
+ assert stride in [1, 2]
46
+
47
+ hidden_dim = round(inp * expand_ratio)
48
+ self.use_res_connect = self.stride == 1 and inp == oup
49
+
50
+ if expand_ratio == 1:
51
+ self.conv = nn.Sequential(
52
+ # dw
53
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
54
+ BatchNorm2d(hidden_dim),
55
+ nn.ReLU6(inplace=True),
56
+ # pw-linear
57
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
58
+ BatchNorm2d(oup),
59
+ )
60
+ else:
61
+ self.conv = nn.Sequential(
62
+ # pw
63
+ nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
64
+ BatchNorm2d(hidden_dim),
65
+ nn.ReLU6(inplace=True),
66
+ # dw
67
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
68
+ BatchNorm2d(hidden_dim),
69
+ nn.ReLU6(inplace=True),
70
+ # pw-linear
71
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
72
+ BatchNorm2d(oup),
73
+ )
74
+
75
+ def forward(self, x):
76
+ if self.use_res_connect:
77
+ return x + self.conv(x)
78
+ else:
79
+ return self.conv(x)
80
+
81
+
82
+ class MobileNetV2(nn.Module):
83
+ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
84
+ super(MobileNetV2, self).__init__()
85
+ block = InvertedResidual
86
+ input_channel = 32
87
+ last_channel = 1280
88
+ interverted_residual_setting = [
89
+ # t, c, n, s
90
+ [1, 16, 1, 1],
91
+ [6, 24, 2, 2], # layer 2
92
+ [6, 32, 3, 2], # layer 3
93
+ [6, 64, 4, 2],
94
+ [6, 96, 3, 1], # layer 4
95
+ [6, 160, 3, 2],
96
+ [6, 320, 1, 1], # layer 5
97
+ ]
98
+
99
+ # building first layer
100
+ assert input_size % 32 == 0
101
+ input_channel = int(input_channel * width_mult)
102
+ self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
103
+ self.features = [conv_bn(3, input_channel, 2)]
104
+ # building inverted residual blocks
105
+ for t, c, n, s in interverted_residual_setting:
106
+ output_channel = int(c * width_mult)
107
+ for i in range(n):
108
+ if i == 0:
109
+ self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
110
+ else:
111
+ self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
112
+ input_channel = output_channel
113
+ # building last several layers
114
+ self.features.append(conv_1x1_bn(input_channel, self.last_channel))
115
+ # make it nn.Sequential
116
+ self.features = nn.Sequential(*self.features)
117
+
118
+ # building classifier
119
+ self.classifier = nn.Sequential(
120
+ nn.Dropout(0.2),
121
+ nn.Linear(self.last_channel, n_class),
122
+ )
123
+
124
+ self._initialize_weights()
125
+
126
+ def forward(self, x):
127
+ x = self.features(x)
128
+ x = x.mean(3).mean(2)
129
+ x = self.classifier(x)
130
+ return x
131
+
132
+ def _initialize_weights(self):
133
+ for m in self.modules():
134
+ if isinstance(m, nn.Conv2d):
135
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136
+ m.weight.data.normal_(0, math.sqrt(2. / n))
137
+ if m.bias is not None:
138
+ m.bias.data.zero_()
139
+ elif isinstance(m, BatchNorm2d):
140
+ m.weight.data.fill_(1)
141
+ m.bias.data.zero_()
142
+ elif isinstance(m, nn.Linear):
143
+ n = m.weight.size(1)
144
+ m.weight.data.normal_(0, 0.01)
145
+ m.bias.data.zero_()
146
+
147
+
148
+ def mobilenetv2(pretrained=False, **kwargs):
149
+ """Constructs a MobileNet_V2 model.
150
+ Args:
151
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
152
+ """
153
+ model = MobileNetV2(n_class=1000, **kwargs)
154
+ if pretrained:
155
+ model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
156
+ return model
preprocess/humanparsing/networks/backbone/resnet.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnet.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import torch.nn as nn
16
+ import math
17
+ from torch.utils.model_zoo import load_url
18
+
19
+ from modules import InPlaceABNSync
20
+
21
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
22
+
23
+ __all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101'] # resnet101 is coming soon!
24
+
25
+ model_urls = {
26
+ 'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
27
+ 'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
28
+ 'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
29
+ }
30
+
31
+
32
+ def conv3x3(in_planes, out_planes, stride=1):
33
+ "3x3 convolution with padding"
34
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
35
+ padding=1, bias=False)
36
+
37
+
38
+ class BasicBlock(nn.Module):
39
+ expansion = 1
40
+
41
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = conv3x3(inplanes, planes, stride)
44
+ self.bn1 = BatchNorm2d(planes)
45
+ self.relu = nn.ReLU(inplace=True)
46
+ self.conv2 = conv3x3(planes, planes)
47
+ self.bn2 = BatchNorm2d(planes)
48
+ self.downsample = downsample
49
+ self.stride = stride
50
+
51
+ def forward(self, x):
52
+ residual = x
53
+
54
+ out = self.conv1(x)
55
+ out = self.bn1(out)
56
+ out = self.relu(out)
57
+
58
+ out = self.conv2(out)
59
+ out = self.bn2(out)
60
+
61
+ if self.downsample is not None:
62
+ residual = self.downsample(x)
63
+
64
+ out += residual
65
+ out = self.relu(out)
66
+
67
+ return out
68
+
69
+
70
+ class Bottleneck(nn.Module):
71
+ expansion = 4
72
+
73
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
74
+ super(Bottleneck, self).__init__()
75
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
76
+ self.bn1 = BatchNorm2d(planes)
77
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
78
+ padding=1, bias=False)
79
+ self.bn2 = BatchNorm2d(planes)
80
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
81
+ self.bn3 = BatchNorm2d(planes * 4)
82
+ self.relu = nn.ReLU(inplace=True)
83
+ self.downsample = downsample
84
+ self.stride = stride
85
+
86
+ def forward(self, x):
87
+ residual = x
88
+
89
+ out = self.conv1(x)
90
+ out = self.bn1(out)
91
+ out = self.relu(out)
92
+
93
+ out = self.conv2(out)
94
+ out = self.bn2(out)
95
+ out = self.relu(out)
96
+
97
+ out = self.conv3(out)
98
+ out = self.bn3(out)
99
+
100
+ if self.downsample is not None:
101
+ residual = self.downsample(x)
102
+
103
+ out += residual
104
+ out = self.relu(out)
105
+
106
+ return out
107
+
108
+
109
+ class ResNet(nn.Module):
110
+
111
+ def __init__(self, block, layers, num_classes=1000):
112
+ self.inplanes = 128
113
+ super(ResNet, self).__init__()
114
+ self.conv1 = conv3x3(3, 64, stride=2)
115
+ self.bn1 = BatchNorm2d(64)
116
+ self.relu1 = nn.ReLU(inplace=True)
117
+ self.conv2 = conv3x3(64, 64)
118
+ self.bn2 = BatchNorm2d(64)
119
+ self.relu2 = nn.ReLU(inplace=True)
120
+ self.conv3 = conv3x3(64, 128)
121
+ self.bn3 = BatchNorm2d(128)
122
+ self.relu3 = nn.ReLU(inplace=True)
123
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
124
+
125
+ self.layer1 = self._make_layer(block, 64, layers[0])
126
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
127
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
128
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
129
+ self.avgpool = nn.AvgPool2d(7, stride=1)
130
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
131
+
132
+ for m in self.modules():
133
+ if isinstance(m, nn.Conv2d):
134
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
135
+ m.weight.data.normal_(0, math.sqrt(2. / n))
136
+ elif isinstance(m, BatchNorm2d):
137
+ m.weight.data.fill_(1)
138
+ m.bias.data.zero_()
139
+
140
+ def _make_layer(self, block, planes, blocks, stride=1):
141
+ downsample = None
142
+ if stride != 1 or self.inplanes != planes * block.expansion:
143
+ downsample = nn.Sequential(
144
+ nn.Conv2d(self.inplanes, planes * block.expansion,
145
+ kernel_size=1, stride=stride, bias=False),
146
+ BatchNorm2d(planes * block.expansion),
147
+ )
148
+
149
+ layers = []
150
+ layers.append(block(self.inplanes, planes, stride, downsample))
151
+ self.inplanes = planes * block.expansion
152
+ for i in range(1, blocks):
153
+ layers.append(block(self.inplanes, planes))
154
+
155
+ return nn.Sequential(*layers)
156
+
157
+ def forward(self, x):
158
+ x = self.relu1(self.bn1(self.conv1(x)))
159
+ x = self.relu2(self.bn2(self.conv2(x)))
160
+ x = self.relu3(self.bn3(self.conv3(x)))
161
+ x = self.maxpool(x)
162
+
163
+ x = self.layer1(x)
164
+ x = self.layer2(x)
165
+ x = self.layer3(x)
166
+ x = self.layer4(x)
167
+
168
+ x = self.avgpool(x)
169
+ x = x.view(x.size(0), -1)
170
+ x = self.fc(x)
171
+
172
+ return x
173
+
174
+
175
+ def resnet18(pretrained=False, **kwargs):
176
+ """Constructs a ResNet-18 model.
177
+ Args:
178
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
179
+ """
180
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
181
+ if pretrained:
182
+ model.load_state_dict(load_url(model_urls['resnet18']))
183
+ return model
184
+
185
+
186
+ def resnet50(pretrained=False, **kwargs):
187
+ """Constructs a ResNet-50 model.
188
+ Args:
189
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
190
+ """
191
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
192
+ if pretrained:
193
+ model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
194
+ return model
195
+
196
+
197
+ def resnet101(pretrained=False, **kwargs):
198
+ """Constructs a ResNet-101 model.
199
+ Args:
200
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
201
+ """
202
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
203
+ if pretrained:
204
+ model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
205
+ return model
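Unlike torchvision's ResNet, the variant above replaces the single 7x7 stem with three stacked 3x3 convolutions. A minimal sketch of that stem, using plain `nn.BatchNorm2d` in place of `InPlaceABNSync` purely so it runs without the compiled extension:

```python
import torch
import torch.nn as nn

# Deep stem as in the ResNet above: conv3x3(s2) -> conv3x3 -> conv3x3 -> maxpool(s2).
# nn.BatchNorm2d stands in for InPlaceABNSync here (illustration only).
stem = nn.Sequential(
    nn.Conv2d(3, 64, 3, stride=2, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
    nn.Conv2d(64, 64, 3, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
    nn.Conv2d(64, 128, 3, padding=1, bias=False), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

x = torch.randn(1, 3, 512, 512)
print(stem(x).shape)  # torch.Size([1, 128, 128, 128]) -- 4x downsampled before layer1
```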
preprocess/humanparsing/networks/backbone/resnext.py ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnext.py.py
8
+ @Time : 8/11/19 8:58 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import functools
14
+ import torch.nn as nn
15
+ import math
16
+ from torch.utils.model_zoo import load_url
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['ResNeXt', 'resnext101'] # support resnext 101
23
+
24
+ model_urls = {
25
+ 'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
26
+ 'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
27
+ }
28
+
29
+
30
+ def conv3x3(in_planes, out_planes, stride=1):
31
+ "3x3 convolution with padding"
32
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
33
+ padding=1, bias=False)
34
+
35
+
36
+ class GroupBottleneck(nn.Module):
37
+ expansion = 2
38
+
39
+ def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
40
+ super(GroupBottleneck, self).__init__()
41
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
42
+ self.bn1 = BatchNorm2d(planes)
43
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
44
+ padding=1, groups=groups, bias=False)
45
+ self.bn2 = BatchNorm2d(planes)
46
+ self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
47
+ self.bn3 = BatchNorm2d(planes * 2)
48
+ self.relu = nn.ReLU(inplace=True)
49
+ self.downsample = downsample
50
+ self.stride = stride
51
+
52
+ def forward(self, x):
53
+ residual = x
54
+
55
+ out = self.conv1(x)
56
+ out = self.bn1(out)
57
+ out = self.relu(out)
58
+
59
+ out = self.conv2(out)
60
+ out = self.bn2(out)
61
+ out = self.relu(out)
62
+
63
+ out = self.conv3(out)
64
+ out = self.bn3(out)
65
+
66
+ if self.downsample is not None:
67
+ residual = self.downsample(x)
68
+
69
+ out += residual
70
+ out = self.relu(out)
71
+
72
+ return out
73
+
74
+
75
+ class ResNeXt(nn.Module):
76
+
77
+ def __init__(self, block, layers, groups=32, num_classes=1000):
78
+ self.inplanes = 128
79
+ super(ResNeXt, self).__init__()
80
+ self.conv1 = conv3x3(3, 64, stride=2)
81
+ self.bn1 = BatchNorm2d(64)
82
+ self.relu1 = nn.ReLU(inplace=True)
83
+ self.conv2 = conv3x3(64, 64)
84
+ self.bn2 = BatchNorm2d(64)
85
+ self.relu2 = nn.ReLU(inplace=True)
86
+ self.conv3 = conv3x3(64, 128)
87
+ self.bn3 = BatchNorm2d(128)
88
+ self.relu3 = nn.ReLU(inplace=True)
89
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
90
+
91
+ self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
92
+ self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
93
+ self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
94
+ self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
95
+ self.avgpool = nn.AvgPool2d(7, stride=1)
96
+ self.fc = nn.Linear(1024 * block.expansion, num_classes)
97
+
98
+ for m in self.modules():
99
+ if isinstance(m, nn.Conv2d):
100
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
101
+ m.weight.data.normal_(0, math.sqrt(2. / n))
102
+ elif isinstance(m, BatchNorm2d):
103
+ m.weight.data.fill_(1)
104
+ m.bias.data.zero_()
105
+
106
+ def _make_layer(self, block, planes, blocks, stride=1, groups=1):
107
+ downsample = None
108
+ if stride != 1 or self.inplanes != planes * block.expansion:
109
+ downsample = nn.Sequential(
110
+ nn.Conv2d(self.inplanes, planes * block.expansion,
111
+ kernel_size=1, stride=stride, bias=False),
112
+ BatchNorm2d(planes * block.expansion),
113
+ )
114
+
115
+ layers = []
116
+ layers.append(block(self.inplanes, planes, stride, groups, downsample))
117
+ self.inplanes = planes * block.expansion
118
+ for i in range(1, blocks):
119
+ layers.append(block(self.inplanes, planes, groups=groups))
120
+
121
+ return nn.Sequential(*layers)
122
+
123
+ def forward(self, x):
124
+ x = self.relu1(self.bn1(self.conv1(x)))
125
+ x = self.relu2(self.bn2(self.conv2(x)))
126
+ x = self.relu3(self.bn3(self.conv3(x)))
127
+ x = self.maxpool(x)
128
+
129
+ x = self.layer1(x)
130
+ x = self.layer2(x)
131
+ x = self.layer3(x)
132
+ x = self.layer4(x)
133
+
134
+ x = self.avgpool(x)
135
+ x = x.view(x.size(0), -1)
136
+ x = self.fc(x)
137
+
138
+ return x
139
+
140
+
141
+ def resnext101(pretrained=False, **kwargs):
142
+ """Constructs a ResNet-101 model.
143
+ Args:
144
+ pretrained (bool): If True, returns a model pre-trained on Places
145
+ """
146
+ model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
147
+ if pretrained:
148
+ model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
149
+ return model
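The only structural difference from the ResNet above is the `groups=32` argument in `GroupBottleneck`'s 3x3 convolution. A small, self-contained check of what that buys in parameters (the 256-channel size is illustrative, not taken from the network):

```python
import torch.nn as nn

# A grouped 3x3 convolution splits the 256 channels into 32 independent groups,
# so its weight tensor is 32x smaller than the dense equivalent.
dense = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
grouped = nn.Conv2d(256, 256, kernel_size=3, padding=1, groups=32, bias=False)

print(sum(p.numel() for p in dense.parameters()))    # 589824
print(sum(p.numel() for p in grouped.parameters()))  # 18432
```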
preprocess/humanparsing/networks/context_encoding/aspp.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : aspp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class ASPPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
25
+ """
26
+ def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
27
+ super(ASPPModule, self).__init__()
28
+
29
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
30
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
31
+ bias=False),
32
+ InPlaceABNSync(inner_features))
33
+ self.conv2 = nn.Sequential(
34
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
35
+ InPlaceABNSync(inner_features))
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
38
+ InPlaceABNSync(inner_features))
39
+ self.conv4 = nn.Sequential(
40
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
41
+ InPlaceABNSync(inner_features))
42
+ self.conv5 = nn.Sequential(
43
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
44
+ InPlaceABNSync(inner_features))
45
+
46
+ self.bottleneck = nn.Sequential(
47
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
48
+ InPlaceABNSync(out_features),
49
+ nn.Dropout2d(0.1)
50
+ )
51
+
52
+ def forward(self, x):
53
+ _, _, h, w = x.size()
54
+
55
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
56
+
57
+ feat2 = self.conv2(x)
58
+ feat3 = self.conv3(x)
59
+ feat4 = self.conv4(x)
60
+ feat5 = self.conv5(x)
61
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
62
+
63
+ bottle = self.bottleneck(out)
64
+ return bottle
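Each dilated branch above uses `padding=dilation`, which is what keeps all five branch outputs the same spatial size so they can be concatenated. A tiny check with plain `nn.Conv2d` (sizes are arbitrary):

```python
import torch
import torch.nn as nn

# For a 3x3 kernel, the dilated receptive field spans 2*d + 1 pixels,
# so padding by d exactly preserves the input resolution.
x = torch.randn(1, 64, 32, 32)
for d in (12, 24, 36):
    conv = nn.Conv2d(64, 64, kernel_size=3, padding=d, dilation=d, bias=False)
    print(d, tuple(conv(x).shape))  # all (1, 64, 32, 32)
```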
preprocess/humanparsing/networks/context_encoding/ocnet.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : ocnet.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.autograd import Variable
19
+ from torch.nn import functional as F
20
+
21
+ from modules import InPlaceABNSync
22
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
23
+
24
+
25
+ class _SelfAttentionBlock(nn.Module):
26
+ '''
27
+ The basic implementation for self-attention block/non-local block
28
+ Input:
29
+ N X C X H X W
30
+ Parameters:
31
+ in_channels : the dimension of the input feature map
32
+ key_channels : the dimension after the key/query transform
33
+ value_channels : the dimension after the value transform
34
+ scale : choose the scale to downsample the input feature maps (save memory cost)
35
+ Return:
36
+ N X C X H X W
37
+ position-aware context features (without concatenation or addition with the input)
38
+ '''
39
+
40
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
41
+ super(_SelfAttentionBlock, self).__init__()
42
+ self.scale = scale
43
+ self.in_channels = in_channels
44
+ self.out_channels = out_channels
45
+ self.key_channels = key_channels
46
+ self.value_channels = value_channels
47
+ if out_channels == None:
48
+ self.out_channels = in_channels
49
+ self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
50
+ self.f_key = nn.Sequential(
51
+ nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
52
+ kernel_size=1, stride=1, padding=0),
53
+ InPlaceABNSync(self.key_channels),
54
+ )
55
+ self.f_query = self.f_key
56
+ self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
57
+ kernel_size=1, stride=1, padding=0)
58
+ self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
59
+ kernel_size=1, stride=1, padding=0)
60
+ nn.init.constant_(self.W.weight, 0)
61
+ nn.init.constant_(self.W.bias, 0)
62
+
63
+ def forward(self, x):
64
+ batch_size, h, w = x.size(0), x.size(2), x.size(3)
65
+ if self.scale > 1:
66
+ x = self.pool(x)
67
+
68
+ value = self.f_value(x).view(batch_size, self.value_channels, -1)
69
+ value = value.permute(0, 2, 1)
70
+ query = self.f_query(x).view(batch_size, self.key_channels, -1)
71
+ query = query.permute(0, 2, 1)
72
+ key = self.f_key(x).view(batch_size, self.key_channels, -1)
73
+
74
+ sim_map = torch.matmul(query, key)
75
+ sim_map = (self.key_channels ** -.5) * sim_map
76
+ sim_map = F.softmax(sim_map, dim=-1)
77
+
78
+ context = torch.matmul(sim_map, value)
79
+ context = context.permute(0, 2, 1).contiguous()
80
+ context = context.view(batch_size, self.value_channels, *x.size()[2:])
81
+ context = self.W(context)
82
+ if self.scale > 1:
83
+ context = F.upsample(input=context, size=(h, w), mode='bilinear', align_corners=True)
84
+ return context
85
+
86
+
87
+ class SelfAttentionBlock2D(_SelfAttentionBlock):
88
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
89
+ super(SelfAttentionBlock2D, self).__init__(in_channels,
90
+ key_channels,
91
+ value_channels,
92
+ out_channels,
93
+ scale)
94
+
95
+
96
+ class BaseOC_Module(nn.Module):
97
+ """
98
+ Implementation of the BaseOC module
99
+ Parameters:
100
+ in_features / out_features: the channels of the input / output feature maps.
101
+ dropout: we choose 0.05 as the default value.
102
+ size: you can apply multiple sizes. Here we only use one size.
103
+ Return:
104
+ features fused with Object context information.
105
+ """
106
+
107
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
108
+ super(BaseOC_Module, self).__init__()
109
+ self.stages = []
110
+ self.stages = nn.ModuleList(
111
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
112
+ self.conv_bn_dropout = nn.Sequential(
113
+ nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
114
+ InPlaceABNSync(out_channels),
115
+ nn.Dropout2d(dropout)
116
+ )
117
+
118
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
119
+ return SelfAttentionBlock2D(in_channels,
120
+ key_channels,
121
+ value_channels,
122
+ output_channels,
123
+ size)
124
+
125
+ def forward(self, feats):
126
+ priors = [stage(feats) for stage in self.stages]
127
+ context = priors[0]
128
+ for i in range(1, len(priors)):
129
+ context += priors[i]
130
+ output = self.conv_bn_dropout(torch.cat([context, feats], 1))
131
+ return output
132
+
133
+
134
+ class BaseOC_Context_Module(nn.Module):
135
+ """
136
+ Output only the context features.
137
+ Parameters:
138
+ in_features / out_features: the channels of the input / output feature maps.
139
+ dropout: specify the dropout ratio
140
+ fusion: We provide two different fusion method, "concat" or "add"
141
+ size: we find that directly learning the attention weights on even 1/8 feature maps is hard.
142
+ Return:
143
+ features after "concat" or "add"
144
+ """
145
+
146
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
147
+ super(BaseOC_Context_Module, self).__init__()
148
+ self.stages = []
149
+ self.stages = nn.ModuleList(
150
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
151
+ self.conv_bn_dropout = nn.Sequential(
152
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
153
+ InPlaceABNSync(out_channels),
154
+ )
155
+
156
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
157
+ return SelfAttentionBlock2D(in_channels,
158
+ key_channels,
159
+ value_channels,
160
+ output_channels,
161
+ size)
162
+
163
+ def forward(self, feats):
164
+ priors = [stage(feats) for stage in self.stages]
165
+ context = priors[0]
166
+ for i in range(1, len(priors)):
167
+ context += priors[i]
168
+ output = self.conv_bn_dropout(context)
169
+ return output
170
+
171
+
172
+ class ASP_OC_Module(nn.Module):
173
+ def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
174
+ super(ASP_OC_Module, self).__init__()
175
+ self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
176
+ InPlaceABNSync(out_features),
177
+ BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
178
+ key_channels=out_features // 2, value_channels=out_features,
179
+ dropout=0, sizes=([2])))
180
+ self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
181
+ InPlaceABNSync(out_features))
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
184
+ InPlaceABNSync(out_features))
185
+ self.conv4 = nn.Sequential(
186
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
187
+ InPlaceABNSync(out_features))
188
+ self.conv5 = nn.Sequential(
189
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
190
+ InPlaceABNSync(out_features))
191
+
192
+ self.conv_bn_dropout = nn.Sequential(
193
+ nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
194
+ InPlaceABNSync(out_features),
195
+ nn.Dropout2d(0.1)
196
+ )
197
+
198
+ def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
199
+ assert (len(feat1) == len(feat2))
200
+ z = []
201
+ for i in range(len(feat1)):
202
+ z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
203
+ return z
204
+
205
+ def forward(self, x):
206
+ if isinstance(x, Variable):
207
+ _, _, h, w = x.size()
208
+ elif isinstance(x, tuple) or isinstance(x, list):
209
+ _, _, h, w = x[0].size()
210
+ else:
211
+ raise RuntimeError('unknown input type')
212
+
213
+ feat1 = self.context(x)
214
+ feat2 = self.conv2(x)
215
+ feat3 = self.conv3(x)
216
+ feat4 = self.conv4(x)
217
+ feat5 = self.conv5(x)
218
+
219
+ if isinstance(x, Variable):
220
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
221
+ elif isinstance(x, tuple) or isinstance(x, list):
222
+ out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
223
+ else:
224
+ raise RuntimeError('unknown input type')
225
+ output = self.conv_bn_dropout(out)
226
+ return output
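For readers new to the OC block: the cost of `_SelfAttentionBlock` is dominated by an (H·W)×(H·W) similarity matrix, which is why the block can max-pool its input first (`scale > 1`). A shape-only sketch with made-up sizes:

```python
import torch

# Shapes only: query/key/value projections flattened over the spatial grid.
B, key_ch, value_ch, H, W = 1, 128, 256, 24, 16
query = torch.randn(B, H * W, key_ch)
key = torch.randn(B, key_ch, H * W)
value = torch.randn(B, H * W, value_ch)

sim = torch.softmax((key_ch ** -0.5) * torch.matmul(query, key), dim=-1)  # (B, HW, HW)
context = torch.matmul(sim, value)                                        # (B, HW, value_ch)
print(sim.shape, context.shape)  # torch.Size([1, 384, 384]) torch.Size([1, 384, 256])
```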
preprocess/humanparsing/networks/context_encoding/psp.py ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : psp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class PSPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
25
+ """
26
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
27
+ super(PSPModule, self).__init__()
28
+
29
+ self.stages = []
30
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
31
+ self.bottleneck = nn.Sequential(
32
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
33
+ bias=False),
34
+ InPlaceABNSync(out_features),
35
+ )
36
+
37
+ def _make_stage(self, features, out_features, size):
38
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
39
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
40
+ bn = InPlaceABNSync(out_features)
41
+ return nn.Sequential(prior, conv, bn)
42
+
43
+ def forward(self, feats):
44
+ h, w = feats.size(2), feats.size(3)
45
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
46
+ self.stages] + [feats]
47
+ bottle = self.bottleneck(torch.cat(priors, 1))
48
+ return bottle
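One PSP stage, reduced to functional ops so the pooling-then-upsampling idea is visible in isolation (channel and grid sizes are illustrative; the module above also projects each pooled map through a 1x1 conv before upsampling):

```python
import torch
import torch.nn.functional as F

feats = torch.randn(1, 512, 32, 24)
priors = [
    F.interpolate(F.adaptive_avg_pool2d(feats, size), size=feats.shape[2:],
                  mode='bilinear', align_corners=True)
    for size in (1, 2, 3, 6)
]
print([tuple(p.shape) for p in priors])  # each (1, 512, 32, 24), ready to concat with feats
```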
preprocess/humanparsing/parsing_api.py ADDED
@@ -0,0 +1,191 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
5
+ sys.path.insert(0, str(PROJECT_ROOT))
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+ import torchvision.transforms as transforms
11
+ from datasets.simple_extractor_dataset import SimpleFolderDataset
12
+ from PIL import Image
13
+ from utils.transforms import transform_logits
14
+
15
+
16
+ def get_palette(num_cls):
17
+ """ Returns the color map for visualizing the segmentation mask.
18
+ Args:
19
+ num_cls: Number of classes
20
+ Returns:
21
+ The color map
22
+ """
23
+ n = num_cls
24
+ palette = [0] * (n * 3)
25
+ for j in range(0, n):
26
+ lab = j
27
+ palette[j * 3 + 0] = 0
28
+ palette[j * 3 + 1] = 0
29
+ palette[j * 3 + 2] = 0
30
+ i = 0
31
+ while lab:
32
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
33
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
34
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
35
+ i += 1
36
+ lab >>= 3
37
+ return palette
38
+
39
+
40
+ def delete_irregular(logits_result):
41
+ parsing_result = np.argmax(logits_result, axis=2)
42
+ upper_cloth = np.where(parsing_result == 4, 255, 0)
43
+ contours, hierarchy = cv2.findContours(upper_cloth.astype(np.uint8),
44
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
45
+ area = []
46
+ for i in range(len(contours)):
47
+ a = cv2.contourArea(contours[i], True)
48
+ area.append(abs(a))
49
+ if len(area) != 0:
50
+ top = area.index(max(area))
51
+ M = cv2.moments(contours[top])
52
+ cY = int(M["m01"] / M["m00"])
53
+
54
+ dresses = np.where(parsing_result == 7, 255, 0)
55
+ contours_dress, hierarchy_dress = cv2.findContours(dresses.astype(np.uint8),
56
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
57
+ area_dress = []
58
+ for j in range(len(contours_dress)):
59
+ a_d = cv2.contourArea(contours_dress[j], True)
60
+ area_dress.append(abs(a_d))
61
+ if len(area_dress) != 0:
62
+ top_dress = area_dress.index(max(area_dress))
63
+ M_dress = cv2.moments(contours_dress[top_dress])
64
+ cY_dress = int(M_dress["m01"] / M_dress["m00"])
65
+ wear_type = "dresses"
66
+ if len(area) != 0:
67
+ if len(area_dress) != 0 and cY_dress > cY:
68
+ irregular_list = np.array([4, 5, 6])
69
+ logits_result[:, :, irregular_list] = -1
70
+ else:
71
+ irregular_list = np.array([5, 6, 7, 8, 9, 10, 12, 13])
72
+ logits_result[:cY, :, irregular_list] = -1
73
+ wear_type = "cloth_pant"
74
+ parsing_result = np.argmax(logits_result, axis=2)
75
+ # pad border
76
+ parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
77
+ return parsing_result, wear_type
78
+
79
+
80
+ def hole_fill(img):
81
+ img_copy = img.copy()
82
+ mask = np.zeros((img.shape[0] + 2, img.shape[1] + 2), dtype=np.uint8)
83
+ cv2.floodFill(img, mask, (0, 0), 255)
84
+ img_inverse = cv2.bitwise_not(img)
85
+ dst = cv2.bitwise_or(img_copy, img_inverse)
86
+ return dst
87
+
88
+
89
+ def refine_mask(mask):
90
+ contours, hierarchy = cv2.findContours(mask.astype(np.uint8),
91
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
92
+ area = []
93
+ for j in range(len(contours)):
94
+ a_d = cv2.contourArea(contours[j], True)
95
+ area.append(abs(a_d))
96
+ refine_mask = np.zeros_like(mask).astype(np.uint8)
97
+ if len(area) != 0:
98
+ i = area.index(max(area))
99
+ cv2.drawContours(refine_mask, contours, i, color=255, thickness=-1)
100
+ # keep large area in skin case
101
+ for j in range(len(area)):
102
+ if j != i and area[i] > 2000:
103
+ cv2.drawContours(refine_mask, contours, j, color=255, thickness=-1)
104
+ return refine_mask
105
+
106
+
107
+ def refine_hole(parsing_result_filled, parsing_result, arm_mask):
108
+ filled_hole = cv2.bitwise_and(np.where(parsing_result_filled == 4, 255, 0),
109
+ np.where(parsing_result != 4, 255, 0)) - arm_mask * 255
110
+ contours, hierarchy = cv2.findContours(filled_hole, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
111
+ refine_hole_mask = np.zeros_like(parsing_result).astype(np.uint8)
112
+ for i in range(len(contours)):
113
+ a = cv2.contourArea(contours[i], True)
114
+ # keep hole > 2000 pixels
115
+ if abs(a) > 2000:
116
+ cv2.drawContours(refine_hole_mask, contours, i, color=255, thickness=-1)
117
+ return refine_hole_mask + arm_mask
118
+
119
+
120
+ def onnx_inference(session, lip_session, input_dir):
121
+ transform = transforms.Compose([
122
+ transforms.ToTensor(),
123
+ transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
124
+ ])
125
+ dataset = SimpleFolderDataset(root=input_dir, input_size=[512, 512], transform=transform)
126
+ # dataloader = DataLoader(dataset)
127
+ with torch.no_grad():
128
+ # for _, batch in enumerate(tqdm(dataloader, disable=True)):
129
+ image, meta = dataset[0]
130
+ image = image.unsqueeze(0)
131
+
132
+ # image, meta = batch
133
+ c = meta['center']
134
+ h = meta['height']
135
+ w = meta['width']
136
+ s = meta['scale']
137
+ output = session.run(None, {"input.1": image.numpy().astype(np.float32)})
138
+ upsample = torch.nn.Upsample(size=[512, 512], mode='bilinear', align_corners=True)
139
+ upsample_output = upsample(torch.from_numpy(output[1][0]).unsqueeze(0))
140
+ upsample_output = upsample_output.squeeze()
141
+ upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
142
+ logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=[512, 512])
143
+ parsing_result = np.argmax(logits_result, axis=2)
144
+ parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
145
+ # try holefilling the clothes part
146
+ arm_mask = (parsing_result == 14).astype(np.float32) \
147
+ + (parsing_result == 15).astype(np.float32)
148
+ upper_cloth_mask = (parsing_result == 4).astype(np.float32) + arm_mask
149
+ img = np.where(upper_cloth_mask, 255, 0)
150
+ dst = hole_fill(img.astype(np.uint8))
151
+ parsing_result_filled = dst / 255 * 4
152
+ parsing_result_woarm = np.where(parsing_result_filled == 4, parsing_result_filled, parsing_result)
153
+ # add back arm and refined hole between arm and cloth
154
+ refine_hole_mask = refine_hole(parsing_result_filled.astype(np.uint8), parsing_result.astype(np.uint8),
155
+ arm_mask.astype(np.uint8))
156
+ parsing_result = np.where(refine_hole_mask, parsing_result, parsing_result_woarm)
157
+ # remove padding
158
+ parsing_result = parsing_result[1:-1, 1:-1]
159
+
160
+ dataset_lip = SimpleFolderDataset(root=input_dir, input_size=[473, 473], transform=transform)
161
+ # dataloader_lip = DataLoader(dataset_lip)
162
+ with torch.no_grad():
163
+ # for _, batch in enumerate(tqdm(dataloader_lip, disable=True)):
164
+
165
+ image, meta = dataset_lip[0]
166
+ image = image.unsqueeze(0)
167
+
168
+ # image, meta = batch
169
+ c = meta['center']
170
+ s = meta['scale']
171
+ w = meta['width']
172
+ h = meta['height']
173
+
174
+ output_lip = lip_session.run(None, {"input.1": image.numpy().astype(np.float32)})
175
+ upsample = torch.nn.Upsample(size=[473, 473], mode='bilinear', align_corners=True)
176
+ upsample_output_lip = upsample(torch.from_numpy(output_lip[1][0]).unsqueeze(0))
177
+ upsample_output_lip = upsample_output_lip.squeeze()
178
+ upsample_output_lip = upsample_output_lip.permute(1, 2, 0) # CHW -> HWC
179
+ logits_result_lip = transform_logits(upsample_output_lip.data.cpu().numpy(), c, s, w, h,
180
+ input_size=[473, 473])
181
+ parsing_result_lip = np.argmax(logits_result_lip, axis=2)
182
+ # add neck parsing result
183
+ neck_mask = np.logical_and(np.logical_not((parsing_result_lip == 13).astype(np.float32)),
184
+ (parsing_result == 11).astype(np.float32))
185
+ parsing_result = np.where(neck_mask, 18, parsing_result)
186
+ palette = get_palette(19)
187
+ output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
188
+ output_img.putpalette(palette)
189
+ face_mask = torch.from_numpy((parsing_result == 11).astype(np.float32))
190
+
191
+ return output_img, face_mask
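A hedged sketch of consuming `onnx_inference`'s outputs: the first return value is a palettised PIL image whose pixel values are class indices (the code above treats 4 as upper clothes, 11 as face, and writes 18 for the neck region), so per-class masks are a single comparison. The helper name below is hypothetical.

```python
import numpy as np

def class_mask(parsing_img, class_id):
    """Return an HxW float32 mask for one parsing label from the palettised output."""
    labels = np.array(parsing_img)  # mode-P PIL image -> HxW index array
    return (labels == class_id).astype(np.float32)

# e.g. after: output_img, face_mask = onnx_inference(session, lip_session, input_dir)
# upper_cloth = class_mask(output_img, 4)
# neck        = class_mask(output_img, 18)
```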
preprocess/humanparsing/run_parsing.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ import pdb
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import onnxruntime as ort
7
+
8
+ PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
9
+ sys.path.insert(0, str(PROJECT_ROOT))
10
+ import torch
11
+ from parsing_api import onnx_inference
12
+
13
+
14
+ class Parsing:
15
+ def __init__(self, gpu_id: int):
16
+ self.gpu_id = gpu_id
17
+ # torch.cuda.set_device(gpu_id)
18
+ session_options = ort.SessionOptions()
19
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
20
+ session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
21
+ #### jho modified >>>>
22
+ providers = [
23
+ ('CUDAExecutionProvider', {
24
+ 'device_id': gpu_id,
25
+ }),
26
+ 'CPUExecutionProvider',
27
+ ]
28
+ self.session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_atr.onnx'),
29
+ sess_options=session_options, providers=providers)
30
+ self.lip_session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_lip.onnx'),
31
+ sess_options=session_options, providers=providers)
32
+ #### jho modified <<<<
33
+ # session_options.add_session_config_entry('gpu_id', str(gpu_id))
34
+ # self.session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_atr.onnx'),
35
+ # sess_options=session_options, providers=['CUDAExecutionProvider'])
36
+ # self.lip_session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_lip.onnx'),
37
+ # sess_options=session_options, providers=['CUDAExecutionProvider'])
38
+ print(f"parsing init done (gpu: {gpu_id})")
39
+
40
+ def __call__(self, input_image):
41
+ torch.cuda.set_device(self.gpu_id)
42
+ parsed_image, face_mask = onnx_inference(self.session, self.lip_session, input_image)
43
+ return parsed_image, face_mask
44
+
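A usage sketch for the wrapper above, with caveats: it assumes the two ONNX checkpoints exist under `checkpoints/humanparsing/`, that a CUDA device is available (`__call__` calls `torch.cuda.set_device`), and that the argument, despite its name, is the directory read by `SimpleFolderDataset` inside `onnx_inference`. The example path is hypothetical.

```python
# Hypothetical usage; requires the ONNX checkpoints and a CUDA-capable machine.
from run_parsing import Parsing

parsing = Parsing(gpu_id=0)
parsed_image, face_mask = parsing("./examples/person")  # folder containing one image
parsed_image.save("parsed.png")                         # palettised label map
```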
preprocess/humanparsing/utils/__init__.py ADDED
File without changes
preprocess/humanparsing/utils/consistency_loss.py ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : kl_loss.py
8
+ @Time : 7/23/19 4:02 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+ from datasets.target_generation import generate_edge_tensor
17
+
18
+
19
+ class ConsistencyLoss(nn.Module):
20
+ def __init__(self, ignore_index=255):
21
+ super(ConsistencyLoss, self).__init__()
22
+ self.ignore_index=ignore_index
23
+
24
+ def forward(self, parsing, edge, label):
25
+ parsing_pre = torch.argmax(parsing, dim=1)
26
+ parsing_pre[label==self.ignore_index]=self.ignore_index
27
+ generated_edge = generate_edge_tensor(parsing_pre)
28
+ edge_pre = torch.argmax(edge, dim=1)
29
+ v_generate_edge = generated_edge[label!=255]
30
+ v_edge_pre = edge_pre[label!=255]
31
+ v_edge_pre = v_edge_pre.type(torch.cuda.FloatTensor)
32
+ positive_union = (v_generate_edge==1)&(v_edge_pre==1) # only the positive values count
33
+ return F.smooth_l1_loss(v_generate_edge[positive_union].squeeze(0), v_edge_pre[positive_union].squeeze(0))
preprocess/humanparsing/utils/criterion.py ADDED
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : criterion.py
8
+ @Time : 8/30/19 8:59 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import torch
16
+ import numpy as np
17
+ from torch.nn import functional as F
18
+ from .lovasz_softmax import LovaszSoftmax
19
+ from .kl_loss import KLDivergenceLoss
20
+ from .consistency_loss import ConsistencyLoss
21
+
22
+ NUM_CLASSES = 20
23
+
24
+
25
+ class CriterionAll(nn.Module):
26
+ def __init__(self, use_class_weight=False, ignore_index=255, lambda_1=1, lambda_2=1, lambda_3=1,
27
+ num_classes=20):
28
+ super(CriterionAll, self).__init__()
29
+ self.ignore_index = ignore_index
30
+ self.use_class_weight = use_class_weight
31
+ self.criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
32
+ self.lovasz = LovaszSoftmax(ignore_index=ignore_index)
33
+ self.kldiv = KLDivergenceLoss(ignore_index=ignore_index)
34
+ self.reg = ConsistencyLoss(ignore_index=ignore_index)
35
+ self.lamda_1 = lambda_1
36
+ self.lamda_2 = lambda_2
37
+ self.lamda_3 = lambda_3
38
+ self.num_classes = num_classes
39
+
40
+ def parsing_loss(self, preds, target, cycle_n=None):
41
+ """
42
+ Loss function definition.
43
+
44
+ Args:
45
+ preds: [[parsing result1, parsing result2],[edge result]]
46
+ target: [parsing label, edge label]
47
+ soft_preds: [[parsing result1, parsing result2],[edge result]]
48
+ Returns:
49
+ Calculated Loss.
50
+ """
51
+ h, w = target[0].size(1), target[0].size(2)
52
+
53
+ pos_num = torch.sum(target[1] == 1, dtype=torch.float)
54
+ neg_num = torch.sum(target[1] == 0, dtype=torch.float)
55
+
56
+ weight_pos = neg_num / (pos_num + neg_num)
57
+ weight_neg = pos_num / (pos_num + neg_num)
58
+ weights = torch.tensor([weight_neg, weight_pos]) # edge loss weight
59
+
60
+ loss = 0
61
+
62
+ # loss for segmentation
63
+ preds_parsing = preds[0]
64
+ for pred_parsing in preds_parsing:
65
+ scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
66
+ mode='bilinear', align_corners=True)
67
+
68
+ loss += 0.5 * self.lamda_1 * self.lovasz(scale_pred, target[0])
69
+ if target[2] is None:
70
+ loss += 0.5 * self.lamda_1 * self.criterion(scale_pred, target[0])
71
+ else:
72
+ soft_scale_pred = F.interpolate(input=target[2], size=(h, w),
73
+ mode='bilinear', align_corners=True)
74
+ soft_scale_pred = moving_average(soft_scale_pred, to_one_hot(target[0], num_cls=self.num_classes),
75
+ 1.0 / (cycle_n + 1.0))
76
+ loss += 0.5 * self.lamda_1 * self.kldiv(scale_pred, soft_scale_pred, target[0])
77
+
78
+ # loss for edge
79
+ preds_edge = preds[1]
80
+ for pred_edge in preds_edge:
81
+ scale_pred = F.interpolate(input=pred_edge, size=(h, w),
82
+ mode='bilinear', align_corners=True)
83
+ if target[3] is None:
84
+ loss += self.lamda_2 * F.cross_entropy(scale_pred, target[1],
85
+ weights.cuda(), ignore_index=self.ignore_index)
86
+ else:
87
+ soft_scale_edge = F.interpolate(input=target[3], size=(h, w),
88
+ mode='bilinear', align_corners=True)
89
+ soft_scale_edge = moving_average(soft_scale_edge, to_one_hot(target[1], num_cls=2),
90
+ 1.0 / (cycle_n + 1.0))
91
+ loss += self.lamda_2 * self.kldiv(scale_pred, soft_scale_edge, target[0])
92
+
93
+ # consistency regularization
94
+ preds_parsing = preds[0]
95
+ preds_edge = preds[1]
96
+ for pred_parsing in preds_parsing:
97
+ scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
98
+ mode='bilinear', align_corners=True)
99
+ scale_edge = F.interpolate(input=preds_edge[0], size=(h, w),
100
+ mode='bilinear', align_corners=True)
101
+ loss += self.lamda_3 * self.reg(scale_pred, scale_edge, target[0])
102
+
103
+ return loss
104
+
105
+ def forward(self, preds, target, cycle_n=None):
106
+ loss = self.parsing_loss(preds, target, cycle_n)
107
+ return loss
108
+
109
+ def _generate_weights(self, masks, num_classes):
110
+ """
111
+ masks: torch.Tensor with shape [B, H, W]
112
+ """
113
+ masks_label = masks.data.cpu().numpy().astype(np.int64)
114
+ pixel_nums = []
115
+ tot_pixels = 0
116
+ for i in range(num_classes):
117
+ pixel_num_of_cls_i = np.sum(masks_label == i).astype(np.float)
118
+ pixel_nums.append(pixel_num_of_cls_i)
119
+ tot_pixels += pixel_num_of_cls_i
120
+ weights = []
121
+ for i in range(num_classes):
122
+ weights.append(
123
+ (tot_pixels - pixel_nums[i]) / tot_pixels / (num_classes - 1)
124
+ )
125
+ weights = np.array(weights, dtype=np.float)
126
+ # weights = torch.from_numpy(weights).float().to(masks.device)
127
+ return weights
128
+
129
+
130
+ def moving_average(target1, target2, alpha=1.0):
131
+ target = 0
132
+ target += (1.0 - alpha) * target1
133
+ target += target2 * alpha
134
+ return target
135
+
136
+
137
+ def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
138
+ b, h, w = tensor.shape
139
+ tensor[tensor == ignore_index] = 0
140
+ onehot_tensor = torch.zeros(b, num_cls, h, w).cuda()
141
+ onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1)
142
+ return onehot_tensor
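The self-correction update buried in `parsing_loss` is easier to see on a toy tensor: the soft target is a running average of the previous soft prediction and the one-hot ground truth, weighted by `alpha = 1 / (cycle_n + 1)` exactly as in `moving_average` above. Numbers below are made up.

```python
import torch

cycle_n = 2
alpha = 1.0 / (cycle_n + 1.0)

soft_pred = torch.tensor([0.7, 0.2, 0.1])    # previous cycle's soft label (illustrative)
one_hot_gt = torch.tensor([1.0, 0.0, 0.0])   # ground truth as one-hot

new_soft = (1.0 - alpha) * soft_pred + alpha * one_hot_gt   # == moving_average(soft_pred, one_hot_gt, alpha)
print(new_soft)  # tensor([0.8000, 0.1333, 0.0667])
```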
preprocess/humanparsing/utils/encoding.py ADDED
@@ -0,0 +1,187 @@
1
+ ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2
+ ## Created by: Hang Zhang
3
+ ## ECE Department, Rutgers University
4
+ ## Email: zhang.hang@rutgers.edu
5
+ ## Copyright (c) 2017
6
+ ##
7
+ ## This source code is licensed under the MIT-style license found in the
8
+ ## LICENSE file in the root directory of this source tree
9
+ ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10
+
11
+ """Encoding Data Parallel"""
12
+ import threading
13
+ import torch
14
+ from torch.autograd import Variable, Function
15
+ import torch.cuda.comm as comm
16
+ from torch.nn.parallel.data_parallel import DataParallel
17
+ from torch.nn.parallel.parallel_apply import get_a_var
18
+ from torch.nn.parallel._functions import Broadcast
19
+
20
+ torch_ver = torch.__version__[:3]
21
+
22
+ __all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion', 'patch_replication_callback']
23
+
24
+ def allreduce(*inputs):
25
+ """Cross GPU all reduce autograd operation for calculate mean and
26
+ variance in SyncBN.
27
+ """
28
+ return AllReduce.apply(*inputs)
29
+
30
+ class AllReduce(Function):
31
+ @staticmethod
32
+ def forward(ctx, num_inputs, *inputs):
33
+ ctx.num_inputs = num_inputs
34
+ ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
35
+ inputs = [inputs[i:i + num_inputs]
36
+ for i in range(0, len(inputs), num_inputs)]
37
+ # sort before reduce sum
38
+ inputs = sorted(inputs, key=lambda i: i[0].get_device())
39
+ results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
40
+ outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
41
+ return tuple([t for tensors in outputs for t in tensors])
42
+
43
+ @staticmethod
44
+ def backward(ctx, *inputs):
45
+ inputs = [i.data for i in inputs]
46
+ inputs = [inputs[i:i + ctx.num_inputs]
47
+ for i in range(0, len(inputs), ctx.num_inputs)]
48
+ results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
49
+ outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
50
+ return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
51
+
52
+ class Reduce(Function):
53
+ @staticmethod
54
+ def forward(ctx, *inputs):
55
+ ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
56
+ inputs = sorted(inputs, key=lambda i: i.get_device())
57
+ return comm.reduce_add(inputs)
58
+
59
+ @staticmethod
60
+ def backward(ctx, gradOutput):
61
+ return Broadcast.apply(ctx.target_gpus, gradOutput)
62
+
63
+
64
+ class DataParallelModel(DataParallel):
65
+ """Implements data parallelism at the module level.
66
+
67
+ This container parallelizes the application of the given module by
68
+ splitting the input across the specified devices by chunking in the
69
+ batch dimension.
70
+ In the forward pass, the module is replicated on each device,
71
+ and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module.
72
+ Note that the outputs are not gathered, please use compatible
73
+ :class:`encoding.parallel.DataParallelCriterion`.
74
+
75
+ The batch size should be larger than the number of GPUs used. It should
76
+ also be an integer multiple of the number of GPUs so that each chunk is
77
+ the same size (so that each GPU processes the same number of samples).
78
+
79
+ Args:
80
+ module: module to be parallelized
81
+ device_ids: CUDA devices (default: all devices)
82
+
83
+ Reference:
84
+ Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
85
+ Amit Agrawal. “Context Encoding for Semantic Segmentation.
86
+ *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
87
+
88
+ Example::
89
+
90
+ >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
91
+ >>> y = net(x)
92
+ """
93
+ def gather(self, outputs, output_device):
94
+ return outputs
95
+
96
+ def replicate(self, module, device_ids):
97
+ modules = super(DataParallelModel, self).replicate(module, device_ids)
98
+ return modules
99
+
100
+
101
+ class DataParallelCriterion(DataParallel):
102
+ """
103
+ Calculate loss in multiple-GPUs, which balance the memory usage for
104
+ Semantic Segmentation.
105
+
106
+ The targets are split across the specified devices by chunking in
107
+ the batch dimension. Please use together with :class:`encoding.parallel.DataParallelModel`.
108
+
109
+ Reference:
110
+ Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
111
+ Amit Agrawal. “Context Encoding for Semantic Segmentation.
112
+ *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
113
+
114
+ Example::
115
+
116
+ >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
117
+ >>> criterion = encoding.nn.DataParallelCriterion(criterion, device_ids=[0, 1, 2])
118
+ >>> y = net(x)
119
+ >>> loss = criterion(y, target)
120
+ """
121
+ def forward(self, inputs, *targets, **kwargs):
122
+ # input should already be scattered
123
+ # scattering the targets instead
124
+ if not self.device_ids:
125
+ return self.module(inputs, *targets, **kwargs)
126
+ targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
127
+ if len(self.device_ids) == 1:
128
+ return self.module(inputs, *targets[0], **kwargs[0])
129
+ replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
130
+ outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
131
+ return Reduce.apply(*outputs) / len(outputs)
132
+
133
+
134
+ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
135
+ assert len(modules) == len(inputs)
136
+ assert len(targets) == len(inputs)
137
+ if kwargs_tup:
138
+ assert len(modules) == len(kwargs_tup)
139
+ else:
140
+ kwargs_tup = ({},) * len(modules)
141
+ if devices is not None:
142
+ assert len(modules) == len(devices)
143
+ else:
144
+ devices = [None] * len(modules)
145
+
146
+ lock = threading.Lock()
147
+ results = {}
148
+ if torch_ver != "0.3":
149
+ grad_enabled = torch.is_grad_enabled()
150
+
151
+ def _worker(i, module, input, target, kwargs, device=None):
152
+ if torch_ver != "0.3":
153
+ torch.set_grad_enabled(grad_enabled)
154
+ if device is None:
155
+ device = get_a_var(input).get_device()
156
+ try:
157
+ if not isinstance(input, tuple):
158
+ input = (input,)
159
+ with torch.cuda.device(device):
160
+ output = module(*(input + target), **kwargs)
161
+ with lock:
162
+ results[i] = output
163
+ except Exception as e:
164
+ with lock:
165
+ results[i] = e
166
+
167
+ if len(modules) > 1:
168
+ threads = [threading.Thread(target=_worker,
169
+ args=(i, module, input, target,
170
+ kwargs, device),)
171
+ for i, (module, input, target, kwargs, device) in
172
+ enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
173
+
174
+ for thread in threads:
175
+ thread.start()
176
+ for thread in threads:
177
+ thread.join()
178
+ else:
179
+ _worker(0, modules[0], inputs[0], targets[0], kwargs_tup[0], devices[0])
180
+
181
+ outputs = []
182
+ for i in range(len(inputs)):
183
+ output = results[i]
184
+ if isinstance(output, Exception):
185
+ raise output
186
+ outputs.append(output)
187
+ return outputs
preprocess/humanparsing/utils/kl_loss.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : kl_loss.py
8
+ @Time : 7/23/19 4:02 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+
16
+
17
+ def flatten_probas(input, target, labels, ignore=255):
18
+ """
19
+ Flattens predictions in the batch.
20
+ """
21
+ B, C, H, W = input.size()
22
+ input = input.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
23
+ target = target.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
24
+ labels = labels.view(-1)
25
+ if ignore is None:
26
+ return input, target
27
+ valid = (labels != ignore)
28
+ vinput = input[valid.nonzero().squeeze()]
29
+ vtarget = target[valid.nonzero().squeeze()]
30
+ return vinput, vtarget
31
+
32
+
33
+ class KLDivergenceLoss(nn.Module):
34
+ def __init__(self, ignore_index=255, T=1):
35
+ super(KLDivergenceLoss, self).__init__()
36
+ self.ignore_index=ignore_index
37
+ self.T = T
38
+
39
+ def forward(self, input, target, label):
40
+ log_input_prob = F.log_softmax(input / self.T, dim=1)
41
+ target_porb = F.softmax(target / self.T, dim=1)
42
+ loss = F.kl_div(*flatten_probas(log_input_prob, target_porb, label, ignore=self.ignore_index))
43
+ return self.T*self.T*loss # balanced
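The temperature trick in `KLDivergenceLoss`, shown on a single made-up logit vector: log-softmax the student and softmax the teacher at temperature `T`, take the KL divergence, and scale by `T*T` so gradients keep roughly the same magnitude as at `T = 1`.

```python
import torch
import torch.nn.functional as F

T = 2.0
student = torch.tensor([[2.0, 0.5, -1.0]])   # raw logits, illustrative
teacher = torch.tensor([[1.5, 1.0, -0.5]])

loss = T * T * F.kl_div(F.log_softmax(student / T, dim=1),
                        F.softmax(teacher / T, dim=1),
                        reduction='batchmean')
print(loss)
```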
preprocess/humanparsing/utils/lovasz_softmax.py ADDED
@@ -0,0 +1,279 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : lovasz_softmax.py
8
+ @Time : 8/30/19 7:12 PM
9
+ @Desc : Lovasz-Softmax and Jaccard hinge loss in PyTorch
10
+ Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
11
+ @License : This source code is licensed under the license found in the
12
+ LICENSE file in the root directory of this source tree.
13
+ """
14
+
15
+ from __future__ import print_function, division
16
+
17
+ import torch
18
+ from torch.autograd import Variable
19
+ import torch.nn.functional as F
20
+ import numpy as np
21
+ from torch import nn
22
+
23
+ try:
24
+ from itertools import ifilterfalse
25
+ except ImportError: # py3k
26
+ from itertools import filterfalse as ifilterfalse
27
+
28
+
29
+ def lovasz_grad(gt_sorted):
30
+ """
31
+ Computes gradient of the Lovasz extension w.r.t sorted errors
32
+ See Alg. 1 in paper
33
+ """
34
+ p = len(gt_sorted)
35
+ gts = gt_sorted.sum()
36
+ intersection = gts - gt_sorted.float().cumsum(0)
37
+ union = gts + (1 - gt_sorted).float().cumsum(0)
38
+ jaccard = 1. - intersection / union
39
+ if p > 1: # cover 1-pixel case
40
+ jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
41
+ return jaccard
42
+
43
+
44
+ def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True):
45
+ """
46
+ IoU for foreground class
47
+ binary: 1 foreground, 0 background
48
+ """
49
+ if not per_image:
50
+ preds, labels = (preds,), (labels,)
51
+ ious = []
52
+ for pred, label in zip(preds, labels):
53
+ intersection = ((label == 1) & (pred == 1)).sum()
54
+ union = ((label == 1) | ((pred == 1) & (label != ignore))).sum()
55
+ if not union:
56
+ iou = EMPTY
57
+ else:
58
+ iou = float(intersection) / float(union)
59
+ ious.append(iou)
60
+ iou = mean(ious) # mean accross images if per_image
61
+ return 100 * iou
62
+
63
+
64
+ def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
65
+ """
66
+ Array of IoU for each (non ignored) class
67
+ """
68
+ if not per_image:
69
+ preds, labels = (preds,), (labels,)
70
+ ious = []
71
+ for pred, label in zip(preds, labels):
72
+ iou = []
73
+ for i in range(C):
74
+ if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes)
75
+ intersection = ((label == i) & (pred == i)).sum()
76
+ union = ((label == i) | ((pred == i) & (label != ignore))).sum()
77
+ if not union:
78
+ iou.append(EMPTY)
79
+ else:
80
+ iou.append(float(intersection) / float(union))
81
+ ious.append(iou)
82
+ ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image
83
+ return 100 * np.array(ious)
84
+
85
+
86
+ # --------------------------- BINARY LOSSES ---------------------------
87
+
88
+
89
+ def lovasz_hinge(logits, labels, per_image=True, ignore=None):
90
+ """
91
+ Binary Lovasz hinge loss
92
+ logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
93
+ labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
94
+ per_image: compute the loss per image instead of per batch
95
+ ignore: void class id
96
+ """
97
+ if per_image:
98
+ loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
99
+ for log, lab in zip(logits, labels))
100
+ else:
101
+ loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
102
+ return loss
103
+
104
+
105
+ def lovasz_hinge_flat(logits, labels):
106
+ """
107
+ Binary Lovasz hinge loss
108
+ logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
109
+ labels: [P] Tensor, binary ground truth labels (0 or 1)
110
+ ignore: label to ignore
111
+ """
112
+ if len(labels) == 0:
113
+ # only void pixels, the gradients should be 0
114
+ return logits.sum() * 0.
115
+ signs = 2. * labels.float() - 1.
116
+ errors = (1. - logits * Variable(signs))
117
+ errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
118
+ perm = perm.data
119
+ gt_sorted = labels[perm]
120
+ grad = lovasz_grad(gt_sorted)
121
+ loss = torch.dot(F.relu(errors_sorted), Variable(grad))
122
+ return loss
123
+
124
+
125
+ def flatten_binary_scores(scores, labels, ignore=None):
126
+ """
127
+ Flattens predictions in the batch (binary case)
128
+ Remove labels equal to 'ignore'
129
+ """
130
+ scores = scores.view(-1)
131
+ labels = labels.view(-1)
132
+ if ignore is None:
133
+ return scores, labels
134
+ valid = (labels != ignore)
135
+ vscores = scores[valid]
136
+ vlabels = labels[valid]
137
+ return vscores, vlabels
138
+
139
+
140
+ class StableBCELoss(torch.nn.modules.Module):
141
+ def __init__(self):
142
+ super(StableBCELoss, self).__init__()
143
+
144
+ def forward(self, input, target):
145
+ neg_abs = - input.abs()
146
+ loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
147
+ return loss.mean()
148
+
149
+
150
+ def binary_xloss(logits, labels, ignore=None):
151
+ """
152
+ Binary Cross entropy loss
153
+ logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
154
+ labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
155
+ ignore: void class id
156
+ """
157
+ logits, labels = flatten_binary_scores(logits, labels, ignore)
158
+ loss = StableBCELoss()(logits, Variable(labels.float()))
159
+ return loss
160
+
161
+
162
+ # --------------------------- MULTICLASS LOSSES ---------------------------
163
+
164
+
165
+ def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=255, weighted=None):
166
+ """
167
+ Multi-class Lovasz-Softmax loss
168
+ probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
169
+ A [B, H, W] input is treated as a single-channel (sigmoid) output.
170
+ labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
171
+ classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
172
+ per_image: compute the loss per image instead of per batch
173
+ ignore: void class labels
174
+ """
175
+ if per_image:
176
+ loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes, weighted=weighted)
177
+ for prob, lab in zip(probas, labels))
178
+ else:
179
+ loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes, weighted=weighted)
180
+ return loss
181
+
182
+
183
+ def lovasz_softmax_flat(probas, labels, classes='present', weighted=None):
184
+ """
185
+ Multi-class Lovasz-Softmax loss
186
+ probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
187
+ labels: [P] Tensor, ground truth labels (between 0 and C - 1)
188
+ classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
189
+ """
190
+ if probas.numel() == 0:
191
+ # only void pixels, the gradients should be 0
192
+ return probas * 0.
193
+ C = probas.size(1)
194
+ losses = []
195
+ class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
196
+ for c in class_to_sum:
197
+ fg = (labels == c).float() # foreground for class c
198
+ if (classes == 'present' and fg.sum() == 0):
199
+ continue
200
+ if C == 1:
201
+ if len(classes) > 1:
202
+ raise ValueError('Sigmoid output possible only with 1 class')
203
+ class_pred = probas[:, 0]
204
+ else:
205
+ class_pred = probas[:, c]
206
+ errors = (Variable(fg) - class_pred).abs()
207
+ errors_sorted, perm = torch.sort(errors, 0, descending=True)
208
+ perm = perm.data
209
+ fg_sorted = fg[perm]
210
+ if weighted is not None:
211
+ losses.append(weighted[c]*torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
212
+ else:
213
+ losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
214
+ return mean(losses)
215
+
216
+
217
+ def flatten_probas(probas, labels, ignore=None):
218
+ """
219
+ Flattens predictions in the batch
220
+ """
221
+ if probas.dim() == 3:
222
+ # assumes output of a sigmoid layer
223
+ B, H, W = probas.size()
224
+ probas = probas.view(B, 1, H, W)
225
+ B, C, H, W = probas.size()
226
+ probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
227
+ labels = labels.view(-1)
228
+ if ignore is None:
229
+ return probas, labels
230
+ valid = (labels != ignore)
231
+ vprobas = probas[valid.nonzero().squeeze()]
232
+ vlabels = labels[valid]
233
+ return vprobas, vlabels
234
+
235
+
236
+ def xloss(logits, labels, ignore=None):
237
+ """
238
+ Cross entropy loss
239
+ """
240
+ return F.cross_entropy(logits, Variable(labels), ignore_index=255)
241
+
242
+
243
+ # --------------------------- HELPER FUNCTIONS ---------------------------
244
+ def isnan(x):
245
+ return x != x
246
+
247
+
248
+ def mean(l, ignore_nan=False, empty=0):
249
+ """
250
+ nanmean compatible with generators.
251
+ """
252
+ l = iter(l)
253
+ if ignore_nan:
254
+ l = ifilterfalse(isnan, l)
255
+ try:
256
+ n = 1
257
+ acc = next(l)
258
+ except StopIteration:
259
+ if empty == 'raise':
260
+ raise ValueError('Empty mean')
261
+ return empty
262
+ for n, v in enumerate(l, 2):
263
+ acc += v
264
+ if n == 1:
265
+ return acc
266
+ return acc / n
267
+
268
+ # --------------------------- Class ---------------------------
269
+ class LovaszSoftmax(nn.Module):
270
+ def __init__(self, per_image=False, ignore_index=255, weighted=None):
271
+ super(LovaszSoftmax, self).__init__()
272
+ self.lovasz_softmax = lovasz_softmax
273
+ self.per_image = per_image
274
+ self.ignore_index = ignore_index
275
+ self.weighted = weighted
276
+
277
+ def forward(self, pred, label):
278
+ pred = F.softmax(pred, dim=1)
279
+ return self.lovasz_softmax(pred, label, per_image=self.per_image, ignore=self.ignore_index, weighted=self.weighted)
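A minimal usage sketch for the LovaszSoftmax module above; the batch size and the 20-class setting are assumptions for illustration:

    import torch

    criterion = LovaszSoftmax(per_image=False, ignore_index=255)
    logits = torch.randn(2, 20, 64, 48, requires_grad=True)   # [B, C, H, W] raw scores
    labels = torch.randint(0, 20, (2, 64, 48))                # [B, H, W] class indices
    loss = criterion(logits, labels)                          # softmax is applied inside forward()
    loss.backward()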
preprocess/humanparsing/utils/miou.py ADDED
@@ -0,0 +1,155 @@
1
+ import cv2
2
+ import os
3
+ import numpy as np
4
+
5
+ from collections import OrderedDict
6
+ from PIL import Image as PILImage
7
+ from utils.transforms import transform_parsing
8
+
9
+ LABELS = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', \
10
+ 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg',
11
+ 'Right-leg', 'Left-shoe', 'Right-shoe']
12
+
13
+
14
+ # LABELS = ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs']
15
+
16
+ def get_palette(num_cls):
17
+ """ Returns the color map for visualizing the segmentation mask.
18
+ Args:
19
+ num_cls: Number of classes
20
+ Returns:
21
+ The color map
22
+ """
23
+
24
+ n = num_cls
25
+ palette = [0] * (n * 3)
26
+ for j in range(0, n):
27
+ lab = j
28
+ palette[j * 3 + 0] = 0
29
+ palette[j * 3 + 1] = 0
30
+ palette[j * 3 + 2] = 0
31
+ i = 0
32
+ while lab:
33
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
34
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
35
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
36
+ i += 1
37
+ lab >>= 3
38
+ return palette
39
+
40
+
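The palette above is typically attached to a parsing mask with Pillow; a short sketch (the 20-class mask and the output path are illustrative assumptions):

    import numpy as np
    from PIL import Image as PILImage

    parsing = np.random.randint(0, 20, (512, 384)).astype(np.uint8)  # [H, W] class ids
    vis = PILImage.fromarray(parsing)          # 'L' mode, one class id per pixel
    vis.putpalette(get_palette(20))            # attach the color map
    vis.save('parsing_vis.png')                # hypothetical output path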
41
+ def get_confusion_matrix(gt_label, pred_label, num_classes):
42
+ """
43
+ Calculate the confusion matrix from the given ground truth and predicted labels
44
+ :param gt_label: the ground truth label
45
+ :param pred_label: the pred label
46
+ :param num_classes: the number of classes
47
+ :return: the confusion matrix
48
+ """
49
+ index = (gt_label * num_classes + pred_label).astype('int32')
50
+ label_count = np.bincount(index)
51
+ confusion_matrix = np.zeros((num_classes, num_classes))
52
+
53
+ for i_label in range(num_classes):
54
+ for i_pred_label in range(num_classes):
55
+ cur_index = i_label * num_classes + i_pred_label
56
+ if cur_index < len(label_count):
57
+ confusion_matrix[i_label, i_pred_label] = label_count[cur_index]
58
+
59
+ return confusion_matrix
60
+
61
+
62
+ def compute_mean_ioU(preds, scales, centers, num_classes, datadir, input_size=[473, 473], dataset='val'):
63
+ val_file = os.path.join(datadir, dataset + '_id.txt')
64
+ val_id = [i_id.strip() for i_id in open(val_file)]
65
+
66
+ confusion_matrix = np.zeros((num_classes, num_classes))
67
+
68
+ for i, pred_out in enumerate(preds):
69
+ im_name = val_id[i]
70
+ gt_path = os.path.join(datadir, dataset + '_segmentations', im_name + '.png')
71
+ gt = np.array(PILImage.open(gt_path))
72
+ h, w = gt.shape
73
+ s = scales[i]
74
+ c = centers[i]
75
+ pred = transform_parsing(pred_out, c, s, w, h, input_size)
76
+
77
+ gt = np.asarray(gt, dtype=np.int32)
78
+ pred = np.asarray(pred, dtype=np.int32)
79
+
80
+ ignore_index = gt != 255
81
+
82
+ gt = gt[ignore_index]
83
+ pred = pred[ignore_index]
84
+
85
+ confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
86
+
87
+ pos = confusion_matrix.sum(1)
88
+ res = confusion_matrix.sum(0)
89
+ tp = np.diag(confusion_matrix)
90
+
91
+ pixel_accuracy = (tp.sum() / pos.sum()) * 100
92
+ mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
93
+ IoU_array = (tp / np.maximum(1.0, pos + res - tp))
94
+ IoU_array = IoU_array * 100
95
+ mean_IoU = IoU_array.mean()
96
+ print('Pixel accuracy: %f \n' % pixel_accuracy)
97
+ print('Mean accuracy: %f \n' % mean_accuracy)
98
+ print('Mean IU: %f \n' % mean_IoU)
99
+ name_value = []
100
+
101
+ for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
102
+ name_value.append((label, iou))
103
+
104
+ name_value.append(('Pixel accuracy', pixel_accuracy))
105
+ name_value.append(('Mean accuracy', mean_accuracy))
106
+ name_value.append(('Mean IU', mean_IoU))
107
+ name_value = OrderedDict(name_value)
108
+ return name_value
109
+
110
+
111
+ def compute_mean_ioU_file(preds_dir, num_classes, datadir, dataset='val'):
112
+ list_path = os.path.join(datadir, dataset + '_id.txt')
113
+ val_id = [i_id.strip() for i_id in open(list_path)]
114
+
115
+ confusion_matrix = np.zeros((num_classes, num_classes))
116
+
117
+ for i, im_name in enumerate(val_id):
118
+ gt_path = os.path.join(datadir, 'segmentations', im_name + '.png')
119
+ gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
120
+
121
+ pred_path = os.path.join(preds_dir, im_name + '.png')
122
+ pred = np.asarray(PILImage.open(pred_path))
123
+
124
+ gt = np.asarray(gt, dtype=np.int32)
125
+ pred = np.asarray(pred, dtype=np.int32)
126
+
127
+ ignore_index = gt != 255
128
+
129
+ gt = gt[ignore_index]
130
+ pred = pred[ignore_index]
131
+
132
+ confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
133
+
134
+ pos = confusion_matrix.sum(1)
135
+ res = confusion_matrix.sum(0)
136
+ tp = np.diag(confusion_matrix)
137
+
138
+ pixel_accuracy = (tp.sum() / pos.sum()) * 100
139
+ mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
140
+ IoU_array = (tp / np.maximum(1.0, pos + res - tp))
141
+ IoU_array = IoU_array * 100
142
+ mean_IoU = IoU_array.mean()
143
+ print('Pixel accuracy: %f \n' % pixel_accuracy)
144
+ print('Mean accuracy: %f \n' % mean_accuracy)
145
+ print('Mean IU: %f \n' % mean_IoU)
146
+ name_value = []
147
+
148
+ for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
149
+ name_value.append((label, iou))
150
+
151
+ name_value.append(('Pixel accuracy', pixel_accuracy))
152
+ name_value.append(('Mean accuracy', mean_accuracy))
153
+ name_value.append(('Mean IU', mean_IoU))
154
+ name_value = OrderedDict(name_value)
155
+ return name_value
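A small self-contained sketch of how get_confusion_matrix() above turns into per-class IoU (the 3-class arrays are made up for illustration):

    import numpy as np

    gt   = np.array([0, 0, 1, 2, 2, 1], dtype=np.int32)
    pred = np.array([0, 1, 1, 2, 2, 2], dtype=np.int32)

    cm  = get_confusion_matrix(gt, pred, num_classes=3)
    pos = cm.sum(1)                                  # ground-truth pixels per class
    res = cm.sum(0)                                  # predicted pixels per class
    tp  = np.diag(cm)                                # correctly classified pixels per class
    print(tp / np.maximum(1.0, pos + res - tp))      # per-class IoU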
preprocess/humanparsing/utils/schp.py ADDED
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : schp.py
8
+ @Time : 4/8/19 2:11 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import torch
16
+ import modules
17
+
18
+ def moving_average(net1, net2, alpha=1):
19
+ for param1, param2 in zip(net1.parameters(), net2.parameters()):
20
+ param1.data *= (1.0 - alpha)
21
+ param1.data += param2.data * alpha
22
+
23
+
24
+ def _check_bn(module, flag):
25
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
26
+ flag[0] = True
27
+
28
+
29
+ def check_bn(model):
30
+ flag = [False]
31
+ model.apply(lambda module: _check_bn(module, flag))
32
+ return flag[0]
33
+
34
+
35
+ def reset_bn(module):
36
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
37
+ module.running_mean = torch.zeros_like(module.running_mean)
38
+ module.running_var = torch.ones_like(module.running_var)
39
+
40
+
41
+ def _get_momenta(module, momenta):
42
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
43
+ momenta[module] = module.momentum
44
+
45
+
46
+ def _set_momenta(module, momenta):
47
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
48
+ module.momentum = momenta[module]
49
+
50
+
51
+ def bn_re_estimate(loader, model):
52
+ if not check_bn(model):
53
+ print('No batch norm layer detected')
54
+ return
55
+ model.train()
56
+ momenta = {}
57
+ model.apply(reset_bn)
58
+ model.apply(lambda module: _get_momenta(module, momenta))
59
+ n = 0
60
+ for i_iter, batch in enumerate(loader):
61
+ images, labels, _ = batch
62
+ b = images.data.size(0)
63
+ momentum = b / (n + b)
64
+ for module in momenta.keys():
65
+ module.momentum = momentum
66
+ model(images)
67
+ n += b
68
+ model.apply(lambda module: _set_momenta(module, momenta))
69
+
70
+
71
+ def save_schp_checkpoint(states, is_best_parsing, output_dir, filename='schp_checkpoint.pth.tar'):
72
+ save_path = os.path.join(output_dir, filename)
73
+ if os.path.exists(save_path):
74
+ os.remove(save_path)
75
+ torch.save(states, save_path)
76
+ if is_best_parsing and 'state_dict' in states:
77
+ best_save_path = os.path.join(output_dir, 'model_parsing_best.pth.tar')
78
+ if os.path.exists(best_save_path):
79
+ os.remove(best_save_path)
80
+ torch.save(states, best_save_path)
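A hedged sketch of moving_average() on two toy modules; the averaging schedule used here (alpha = 1 / (n_cycles + 1)) is an assumption for illustration, not taken from this repository:

    import torch.nn as nn

    net      = nn.Linear(4, 2)     # stand-in for the freshly trained model
    schp_net = nn.Linear(4, 2)     # stand-in for the running-average (SCHP) model
    n_cycles = 2
    moving_average(schp_net, net, alpha=1.0 / (n_cycles + 1))  # blend schp_net towards net
    # bn_re_estimate(train_loader, schp_net) would then refresh BN statistics,
    # assuming a train_loader yielding (images, labels, meta) batches.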
preprocess/humanparsing/utils/soft_dice_loss.py ADDED
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : soft_dice_loss.py
8
+ @Time : 8/13/19 5:09 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ from __future__ import print_function, division
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+
20
+ try:
21
+ from itertools import ifilterfalse
22
+ except ImportError: # py3k
23
+ from itertools import filterfalse as ifilterfalse
24
+
25
+
26
+ def tversky_loss(probas, labels, alpha=0.5, beta=0.5, epsilon=1e-6):
27
+ '''
28
+ Tversky loss function.
29
+ probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
30
+ labels: [P] Tensor, ground truth labels (between 0 and C - 1)
31
+
32
+ Same as soft dice loss when alpha=beta=0.5.
33
+ Same as Jaccord loss when alpha=beta=1.0.
34
+ See `Tversky loss function for image segmentation using 3D fully convolutional deep networks`
35
+ https://arxiv.org/pdf/1706.05721.pdf
36
+ '''
37
+ C = probas.size(1)
38
+ losses = []
39
+ for c in list(range(C)):
40
+ fg = (labels == c).float()
41
+ if fg.sum() == 0:
42
+ continue
43
+ class_pred = probas[:, c]
44
+ p0 = class_pred
45
+ p1 = 1 - class_pred
46
+ g0 = fg
47
+ g1 = 1 - fg
48
+ numerator = torch.sum(p0 * g0)
49
+ denominator = numerator + alpha * torch.sum(p0 * g1) + beta * torch.sum(p1 * g0)
50
+ losses.append(1 - ((numerator) / (denominator + epsilon)))
51
+ return mean(losses)
52
+
53
+
54
+ def flatten_probas(probas, labels, ignore=255):
55
+ """
56
+ Flattens predictions in the batch
57
+ """
58
+ B, C, H, W = probas.size()
59
+ probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
60
+ labels = labels.view(-1)
61
+ if ignore is None:
62
+ return probas, labels
63
+ valid = (labels != ignore)
64
+ vprobas = probas[valid.nonzero().squeeze()]
65
+ vlabels = labels[valid]
66
+ return vprobas, vlabels
67
+
68
+
69
+ def isnan(x):
70
+ return x != x
71
+
72
+
73
+ def mean(l, ignore_nan=False, empty=0):
74
+ """
75
+ nanmean compatible with generators.
76
+ """
77
+ l = iter(l)
78
+ if ignore_nan:
79
+ l = ifilterfalse(isnan, l)
80
+ try:
81
+ n = 1
82
+ acc = next(l)
83
+ except StopIteration:
84
+ if empty == 'raise':
85
+ raise ValueError('Empty mean')
86
+ return empty
87
+ for n, v in enumerate(l, 2):
88
+ acc += v
89
+ if n == 1:
90
+ return acc
91
+ return acc / n
92
+
93
+
94
+ class SoftDiceLoss(nn.Module):
95
+ def __init__(self, ignore_index=255):
96
+ super(SoftDiceLoss, self).__init__()
97
+ self.ignore_index = ignore_index
98
+
99
+ def forward(self, pred, label):
100
+ pred = F.softmax(pred, dim=1)
101
+ return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=0.5, beta=0.5)
102
+
103
+
104
+ class SoftJaccordLoss(nn.Module):
105
+ def __init__(self, ignore_index=255):
106
+ super(SoftJaccordLoss, self).__init__()
107
+ self.ignore_index = ignore_index
108
+
109
+ def forward(self, pred, label):
110
+ pred = F.softmax(pred, dim=1)
111
+ return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=1.0, beta=1.0)
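A minimal usage sketch for SoftDiceLoss above; the shapes and class count are illustrative assumptions:

    import torch

    dice = SoftDiceLoss(ignore_index=255)
    logits = torch.randn(2, 20, 32, 24, requires_grad=True)   # [B, C, H, W] raw scores
    labels = torch.randint(0, 20, (2, 32, 24))                # [B, H, W] class indices
    loss = dice(logits, labels)    # softmax, flatten, then Tversky with alpha=beta=0.5
    loss.backward()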
preprocess/humanparsing/utils/transforms.py ADDED
@@ -0,0 +1,167 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5
+ # ------------------------------------------------------------------------------
6
+
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+
11
+ import numpy as np
12
+ import cv2
13
+ import torch
14
+
15
+ class BRG2Tensor_transform(object):
16
+ def __call__(self, pic):
17
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
18
+ if isinstance(img, torch.ByteTensor):
19
+ return img.float()
20
+ else:
21
+ return img
22
+
23
+ class BGR2RGB_transform(object):
24
+ def __call__(self, tensor):
25
+ return tensor[[2,1,0],:,:]
26
+
27
+ def flip_back(output_flipped, matched_parts):
28
+ '''
29
+ ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width)
30
+ '''
31
+ assert output_flipped.ndim == 4,\
32
+ 'output_flipped should be [batch_size, num_joints, height, width]'
33
+
34
+ output_flipped = output_flipped[:, :, :, ::-1]
35
+
36
+ for pair in matched_parts:
37
+ tmp = output_flipped[:, pair[0], :, :].copy()
38
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
39
+ output_flipped[:, pair[1], :, :] = tmp
40
+
41
+ return output_flipped
42
+
43
+
44
+ def fliplr_joints(joints, joints_vis, width, matched_parts):
45
+ """
46
+ flip coords
47
+ """
48
+ # Flip horizontal
49
+ joints[:, 0] = width - joints[:, 0] - 1
50
+
51
+ # Change left-right parts
52
+ for pair in matched_parts:
53
+ joints[pair[0], :], joints[pair[1], :] = \
54
+ joints[pair[1], :], joints[pair[0], :].copy()
55
+ joints_vis[pair[0], :], joints_vis[pair[1], :] = \
56
+ joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
57
+
58
+ return joints*joints_vis, joints_vis
59
+
60
+
61
+ def transform_preds(coords, center, scale, input_size):
62
+ target_coords = np.zeros(coords.shape)
63
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
64
+ for p in range(coords.shape[0]):
65
+ target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
66
+ return target_coords
67
+
68
+ def transform_parsing(pred, center, scale, width, height, input_size):
69
+
70
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
71
+ target_pred = cv2.warpAffine(
72
+ pred,
73
+ trans,
74
+ (int(width), int(height)),
75
+ flags=cv2.INTER_NEAREST,
76
+ borderMode=cv2.BORDER_CONSTANT,
77
+ borderValue=(0))
78
+
79
+ return target_pred
80
+
81
+ def transform_logits(logits, center, scale, width, height, input_size):
82
+
83
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
84
+ channel = logits.shape[2]
85
+ target_logits = []
86
+ for i in range(channel):
87
+ target_logit = cv2.warpAffine(
88
+ logits[:,:,i],
89
+ trans,
90
+ (int(width), int(height)),
91
+ flags=cv2.INTER_LINEAR,
92
+ borderMode=cv2.BORDER_CONSTANT,
93
+ borderValue=(0))
94
+ target_logits.append(target_logit)
95
+ target_logits = np.stack(target_logits,axis=2)
96
+
97
+ return target_logits
98
+
99
+
100
+ def get_affine_transform(center,
101
+ scale,
102
+ rot,
103
+ output_size,
104
+ shift=np.array([0, 0], dtype=np.float32),
105
+ inv=0):
106
+ if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
107
+ print(scale)
108
+ scale = np.array([scale, scale])
109
+
110
+ scale_tmp = scale
111
+
112
+ src_w = scale_tmp[0]
113
+ dst_w = output_size[1]
114
+ dst_h = output_size[0]
115
+
116
+ rot_rad = np.pi * rot / 180
117
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
118
+ dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
119
+
120
+ src = np.zeros((3, 2), dtype=np.float32)
121
+ dst = np.zeros((3, 2), dtype=np.float32)
122
+ src[0, :] = center + scale_tmp * shift
123
+ src[1, :] = center + src_dir + scale_tmp * shift
124
+ dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
125
+ dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
126
+
127
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
128
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
129
+
130
+ if inv:
131
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
132
+ else:
133
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
134
+
135
+ return trans
136
+
137
+
138
+ def affine_transform(pt, t):
139
+ new_pt = np.array([pt[0], pt[1], 1.]).T
140
+ new_pt = np.dot(t, new_pt)
141
+ return new_pt[:2]
142
+
143
+
144
+ def get_3rd_point(a, b):
145
+ direct = a - b
146
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
147
+
148
+
149
+ def get_dir(src_point, rot_rad):
150
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
151
+
152
+ src_result = [0, 0]
153
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
154
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
155
+
156
+ return src_result
157
+
158
+
159
+ def crop(img, center, scale, output_size, rot=0):
160
+ trans = get_affine_transform(center, scale, rot, output_size)
161
+
162
+ dst_img = cv2.warpAffine(img,
163
+ trans,
164
+ (int(output_size[1]), int(output_size[0])),
165
+ flags=cv2.INTER_LINEAR)
166
+
167
+ return dst_img
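A short sketch of the affine helpers above: build the crop transform for a person box and warp an image to the parser's input size (all values below are illustrative assumptions):

    import numpy as np
    import cv2

    img    = np.zeros((1024, 768, 3), dtype=np.uint8)        # stand-in for a photo
    center = np.array([384.0, 512.0], dtype=np.float32)      # box centre (x, y)
    scale  = np.array([768.0, 1024.0], dtype=np.float32)     # box size used as the scale
    input_size = (473, 473)                                   # (h, w) fed to the network

    trans   = get_affine_transform(center, scale, 0, input_size)
    cropped = cv2.warpAffine(img, trans, (input_size[1], input_size[0]), flags=cv2.INTER_LINEAR)
    # transform_parsing()/transform_logits() apply the inverse mapping to bring
    # predictions back to the original resolution.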