Benrise committed on
Commit
9b63413
1 Parent(s): 39ecbc6

Add VITON implementation with UI

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +2 -0
  3. README.md +28 -8
  4. app.py +150 -0
  5. configs/VITONHD.yaml +32 -0
  6. lib/caption.py +19 -0
  7. lib/mask.py +64 -0
  8. lib/pose.py +36 -0
  9. preprocess/__init__.py +0 -0
  10. preprocess/humanparsing/__init__.py +0 -0
  11. preprocess/humanparsing/datasets/__init__.py +0 -0
  12. preprocess/humanparsing/datasets/datasets.py +201 -0
  13. preprocess/humanparsing/datasets/simple_extractor_dataset.py +89 -0
  14. preprocess/humanparsing/datasets/target_generation.py +40 -0
  15. preprocess/humanparsing/modules/__init__.py +5 -0
  16. preprocess/humanparsing/modules/bn.py +132 -0
  17. preprocess/humanparsing/modules/deeplab.py +84 -0
  18. preprocess/humanparsing/modules/dense.py +42 -0
  19. preprocess/humanparsing/modules/functions.py +245 -0
  20. preprocess/humanparsing/modules/misc.py +21 -0
  21. preprocess/humanparsing/modules/residual.py +182 -0
  22. preprocess/humanparsing/modules/src/checks.h +15 -0
  23. preprocess/humanparsing/modules/src/inplace_abn.cpp +95 -0
  24. preprocess/humanparsing/modules/src/inplace_abn.h +88 -0
  25. preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp +119 -0
  26. preprocess/humanparsing/modules/src/inplace_abn_cuda.cu +333 -0
  27. preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu +275 -0
  28. preprocess/humanparsing/modules/src/utils/checks.h +15 -0
  29. preprocess/humanparsing/modules/src/utils/common.h +49 -0
  30. preprocess/humanparsing/modules/src/utils/cuda.cuh +71 -0
  31. preprocess/humanparsing/networks/AugmentCE2P.py +388 -0
  32. preprocess/humanparsing/networks/__init__.py +12 -0
  33. preprocess/humanparsing/networks/backbone/mobilenetv2.py +156 -0
  34. preprocess/humanparsing/networks/backbone/resnet.py +205 -0
  35. preprocess/humanparsing/networks/backbone/resnext.py +149 -0
  36. preprocess/humanparsing/networks/context_encoding/aspp.py +64 -0
  37. preprocess/humanparsing/networks/context_encoding/ocnet.py +226 -0
  38. preprocess/humanparsing/networks/context_encoding/psp.py +48 -0
  39. preprocess/humanparsing/parsing_api.py +191 -0
  40. preprocess/humanparsing/run_parsing.py +44 -0
  41. preprocess/humanparsing/utils/__init__.py +0 -0
  42. preprocess/humanparsing/utils/consistency_loss.py +33 -0
  43. preprocess/humanparsing/utils/criterion.py +142 -0
  44. preprocess/humanparsing/utils/encoding.py +187 -0
  45. preprocess/humanparsing/utils/kl_loss.py +43 -0
  46. preprocess/humanparsing/utils/lovasz_softmax.py +279 -0
  47. preprocess/humanparsing/utils/miou.py +155 -0
  48. preprocess/humanparsing/utils/schp.py +80 -0
  49. preprocess/humanparsing/utils/soft_dice_loss.py +111 -0
  50. preprocess/humanparsing/utils/transforms.py +167 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.jpg filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ __pycache__
2
+ checkpoints
README.md CHANGED
@@ -1,14 +1,34 @@
1
  ---
2
- title: VITON HD
3
- emoji: 🌍
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
- pinned: false
10
- license: cc-by-nc-sa-4.0
11
- short_description: Virtual try-on
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Virtual Try-On
3
+ emoji: 👗
4
+ colorFrom: pink
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.34.2
8
  app_file: app.py
9
+ pinned: true
10
  ---
11
 
12
+ # Virtual Try-On Demo
13
+ This repository is a working demo implementation of [PromptDresser](https://arxiv.org/abs/2412.16978).
14
+
15
+ > **PromptDresser: Improving the Quality and Controllability of Virtual Try-On via Generative Textual Prompt and Prompt-aware Mask**<br>
16
+ > [Jeongho Kim](https://scholar.google.co.kr/citations?user=4SCCBFwAAAAJ&hl=ko), [Hoiyeong Jin](https://scholar.google.com/citations?user=Jp-zhtUAAAAJ&hl=en), [Sunghyun Park](https://psh01087.github.io/), [Jaegul Choo](https://sites.google.com/site/jaegulchoo/)
17
+
18
+ [[arXiv Paper](https://arxiv.org/abs/2412.16978)]&nbsp;
19
+
20
+ ## Citation
21
+ ```
22
+ @misc{kim2024promptdresserimprovingqualitycontrollability,
23
+ title={PromptDresser: Improving the Quality and Controllability of Virtual Try-On via Generative Textual Prompt and Prompt-aware Mask},
24
+ author={Jeongho Kim and Hoiyeong Jin and Sunghyun Park and Jaegul Choo},
25
+ year={2024},
26
+ eprint={2412.16978},
27
+ archivePrefix={arXiv},
28
+ primaryClass={cs.CV},
29
+ url={https://arxiv.org/abs/2412.16978},
30
+ }
31
+ ```
32
+
33
+ ## License
34
+ Licensed under the CC BY-NC-SA 4.0 license (https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode).
app.py ADDED
@@ -0,0 +1,150 @@
1
+ import os
2
+ import torch
3
+ import gradio as gr
4
+ import tempfile
5
+ from huggingface_hub import hf_hub_download
6
+ from diffusers import AutoencoderKL, DDPMScheduler
7
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
8
+
9
+ from promptdresser.models.unet import UNet2DConditionModel
10
+ from promptdresser.models.cloth_encoder import ClothEncoder
11
+ from promptdresser.pipelines.sdxl import PromptDresser
12
+ from lib.caption import generate_caption
13
+ from lib.mask import generate_clothing_mask
14
+ from lib.pose import generate_openpose
15
+
16
+
17
+ device = "cuda" if torch.cuda.is_available() else "cpu"
18
+ weight_dtype = torch.float16 if device == "cuda" else torch.float32
19
+
20
+ def load_models():
21
+ print("⚙️ Loading models...")
22
+
23
+ noise_scheduler = DDPMScheduler.from_pretrained(
24
+ "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
25
+ subfolder="scheduler"
26
+ )
27
+ tokenizer = CLIPTokenizer.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="tokenizer")
28
+ text_encoder = CLIPTextModel.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="text_encoder")
29
+ tokenizer_2 = CLIPTokenizer.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="tokenizer_2")
30
+ text_encoder_2 = CLIPTextModelWithProjection.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="text_encoder_2")
31
+ vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix")
32
+ unet = UNet2DConditionModel.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", subfolder="unet")
33
+ cloth_encoder = ClothEncoder.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet")
34
+
35
+ unet_checkpoint_path = hf_hub_download(
36
+ repo_id="Benrise/VITON-HD",
37
+ filename="VITONHD/model/pytorch_model.bin",
38
+ cache_dir="checkpoints"
39
+ )
40
+ unet.load_state_dict(torch.load(unet_checkpoint_path))
41
+
42
+ models = {
43
+ "unet": unet.to(device, dtype=weight_dtype),
44
+ "vae": vae.to(device, dtype=weight_dtype),
45
+ "text_encoder": text_encoder.to(device, dtype=weight_dtype),
46
+ "text_encoder_2": text_encoder_2.to(device, dtype=weight_dtype),
47
+ "cloth_encoder": cloth_encoder.to(device, dtype=weight_dtype),
48
+ "noise_scheduler": noise_scheduler,
49
+ "tokenizer": tokenizer,
50
+ "tokenizer_2": tokenizer_2
51
+ }
52
+
53
+ pipeline = PromptDresser(
54
+ vae=models["vae"],
55
+ text_encoder=models["text_encoder"],
56
+ text_encoder_2=models["text_encoder_2"],
57
+ tokenizer=models["tokenizer"],
58
+ tokenizer_2=models["tokenizer_2"],
59
+ unet=models["unet"],
60
+ scheduler=models["noise_scheduler"],
61
+ ).to(device, dtype=weight_dtype)
62
+
63
+ return {**models, "pipeline": pipeline}
64
+
65
+ models = load_models()
66
+ pipeline = models["pipeline"]
67
+
68
+ def generate_vton(person_image, cloth_image, outfit_prompt="", clothing_prompt=""):
69
+ with tempfile.TemporaryDirectory() as tmp_dir:
70
+ person_path = os.path.join(tmp_dir, "person.png")
71
+ cloth_path = os.path.join(tmp_dir, "cloth.png")
72
+
73
+ person_image.save(person_path)
74
+ cloth_image.save(cloth_path)
75
+
76
+ mask_path = os.path.join(tmp_dir, "mask.png")
77
+ pose_path = os.path.join(tmp_dir, "pose.png")
78
+
79
+ mask_image = generate_clothing_mask(person_path, label=4, output_path=mask_path, show_result=False)
80
+ pose_image = generate_openpose(person_path, output_image_path=pose_path, show_result=False)
81
+
82
+ auto_outfit_prompt = generate_caption(person_path, device)
83
+ auto_clothing_prompt = generate_caption(cloth_path, device)
84
+
85
+ final_outfit_prompt = outfit_prompt or auto_outfit_prompt
86
+ final_clothing_prompt = clothing_prompt or auto_clothing_prompt
87
+
88
+ with torch.autocast(device):
89
+ result = pipeline(
90
+ image=person_image,
91
+ mask_image=mask_image,
92
+ pose_image=pose_image,
93
+ cloth_encoder=models["cloth_encoder"],
94
+ cloth_encoder_image=cloth_image,
95
+ prompt=final_outfit_prompt,
96
+ prompt_clothing=final_clothing_prompt,
97
+ height=1024,
98
+ width=768,
99
+ guidance_scale=2.0,
100
+ guidance_scale_img=4.5,
101
+ guidance_scale_text=7.5,
102
+ num_inference_steps=30,
103
+ strength=1,
104
+ interm_cloth_start_ratio=0.5,
105
+ generator=None,
106
+ ).images[0]
107
+
108
+ return result
109
+
110
+ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container") as demo:
111
+ gr.Markdown("# 🧥 Virtual Try-On")
112
+ gr.Markdown("Upload a photo of the person and of the garment for a virtual try-on")
113
+
114
+ with gr.Row():
115
+ with gr.Column():
116
+ person_input = gr.Image(label="Person photo", type="pil", sources=["upload"])
117
+ cloth_input = gr.Image(label="Garment photo", type="pil", sources=["upload"])
118
+ outfit_prompt = gr.Textbox(label="Outfit description (optional)", placeholder="e.g. man in casual outfit")
119
+ clothing_prompt = gr.Textbox(label="Clothing description (optional)", placeholder="e.g. red t-shirt with print")
120
+ generate_btn = gr.Button("Generate try-on", variant="primary")
121
+
122
+ gr.Examples(
123
+ examples=[
124
+ ["./test/person2.png", "./test/00008_00.jpg", "man in skirt", "black longsleeve"]
125
+ ],
126
+ inputs=[person_input, cloth_input, outfit_prompt, clothing_prompt],
127
+ label="Examples for quick testing"
128
+ )
129
+
130
+ with gr.Column():
131
+ output_image = gr.Image(label="Try-on result", interactive=False)
132
+
133
+ generate_btn.click(
134
+ fn=generate_vton,
135
+ inputs=[person_input, cloth_input, outfit_prompt, clothing_prompt],
136
+ outputs=output_image
137
+ )
138
+
139
+ gr.Markdown("### Instructions:")
140
+ gr.Markdown("1. Upload a clear, full-body photo of the person\n"
141
+ "2. Upload a photo of the garment on a white background\n"
142
+ "3. Optionally refine the outfit or clothing description\n"
143
+ "4. Click the 'Generate try-on' button")
144
+
145
+ if __name__ == "__main__":
146
+ demo.queue(max_size=3).launch(
147
+ server_name="0.0.0.0" if os.getenv("SPACE_ID") else None,
148
+ share=os.getenv("GRADIO_SHARE") == "True",
149
+ debug=True
150
+ )
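For reference, `generate_vton` can also be called outside the Gradio UI. A minimal sketch, reusing the example images that app.py ships for the demo (any RGB person/garment photos would do):

```python
from PIL import Image

# Example inputs taken from the gr.Examples entry above.
person = Image.open("./test/person2.png").convert("RGB")
cloth = Image.open("./test/00008_00.jpg").convert("RGB")

# Prompts are optional: when omitted, captions are auto-generated with the GIT captioner.
result = generate_vton(person, cloth,
                       outfit_prompt="man in casual outfit",
                       clothing_prompt="black longsleeve")
result.save("tryon_result.png")
```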
configs/VITONHD.yaml ADDED
@@ -0,0 +1,32 @@
1
+ no_pose: True
2
+ use_jointcond: True
3
+ no_ipadapter: True
4
+
5
+ use_interm_cloth_mask: True
6
+ interm_cloth_start_ratio: 0.5
7
+
8
+ dataset:
9
+ dataset_name: "VITONHDDataset"
10
+ data_root_dir: "./DATA/zalando-hd-resized"
11
+ img_spatial_transform_lst:
12
+ - "randomresizedcrop"
13
+ - "randomaffine"
14
+ cloth_spatial_transform_lst:
15
+ - "randomresizedcrop"
16
+ - "randomaffine"
17
+ img_cloth_spatial_transform_lst:
18
+ - "hflip"
19
+ color_transform_lst:
20
+ - "colorjitter"
21
+ i_drop_rate: 0.05
22
+ pose_type: "densepose"
23
+ train_folder_name: train_fine
24
+ test_folder_name: test_fine
25
+ prompt_version: v12
26
+ text_file_postfix: "gpt4o.json"
27
+ train_folder_name_for_interm_cloth_mask: train_coarse
28
+ test_folder_name_for_interm_cloth_mask: test_coarse
29
+ use_rand_dilate: True
30
+
31
+ rand_dilate_miniter: 0
32
+ rand_dilate_maxiter: 200
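The code that consumes this config is not part of this commit, so the loader below is only an illustrative sketch (the PromptDresser training code may use its own config machinery); it just shows that the keys mirror the values hard-coded in app.py:

```python
import yaml

# Illustration only: plain PyYAML load of the committed config.
with open("configs/VITONHD.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["interm_cloth_start_ratio"])        # 0.5, the value also passed to the pipeline in app.py
print(cfg["dataset"]["pose_type"])            # "densepose"
print(cfg["dataset"]["img_spatial_transform_lst"])  # ["randomresizedcrop", "randomaffine"]
```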
lib/caption.py ADDED
@@ -0,0 +1,19 @@
1
+ from PIL import Image
2
+ from transformers import AutoProcessor, AutoModelForCausalLM
3
+
4
+
5
+ def generate_caption(image_path, device="cuda"):
6
+ print("Generating caption...")
7
+ processor = AutoProcessor.from_pretrained("microsoft/git-base", use_fast=False)
8
+ model = AutoModelForCausalLM.from_pretrained("microsoft/git-base").to(device)
9
+ image = Image.open(image_path).convert("RGB")
10
+
11
+ inputs = processor(images=image, return_tensors="pt").to(device)
12
+ generated_ids = model.generate(
13
+ pixel_values=inputs.pixel_values,
14
+ max_length=50,
15
+ pad_token_id=processor.tokenizer.pad_token_id
16
+ )
17
+ caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
18
+ print("Generated caption:", caption)
19
+ return caption
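A usage sketch for `generate_caption` ("person.png" is a placeholder path). Note that the GIT processor and model are re-instantiated on every call; caching them at module level would avoid repeated loading inside the Gradio handler.

```python
# Hypothetical usage; "person.png" is a placeholder path.
caption = generate_caption("person.png", device="cuda")
print(caption)  # a short natural-language description of the photo
```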
lib/mask.py ADDED
@@ -0,0 +1,64 @@
1
+ from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
2
+ from PIL import Image
3
+ import numpy as np
4
+ import requests
5
+ import torch.nn.functional as F
6
+ import torch
7
+ import os
8
+
9
+ def generate_clothing_mask(
10
+ image_path: str,
11
+ label: int,
12
+ output_path: str = "./output_mask.png",
13
+ model_name: str = "mattmdjaga/segformer_b2_clothes",
14
+ show_result: bool = False
15
+ ) -> Image.Image:
16
+ """
17
+ Generates a binary mask for the given clothing class and saves it.
18
+
19
+ Args:
20
+ image_path: Path to an image file or a URL
21
+ label: Class index to segment (0-17)
22
+ output_path: Path where the mask is saved
23
+ model_name: Hugging Face model name
24
+ show_result: Show the result with matplotlib
25
+
26
+ Returns:
27
+ PIL.Image: Binary mask (white = selected class, black = everything else)
28
+ """
29
+
30
+ processor = SegformerImageProcessor.from_pretrained(model_name)
31
+ model = AutoModelForSemanticSegmentation.from_pretrained(model_name)
32
+
33
+ if image_path.startswith(('http://', 'https://')):
34
+ image = Image.open(requests.get(image_path, stream=True).raw)
35
+ else:
36
+ image = Image.open(image_path)
37
+
38
+ if image.mode != 'RGB':
39
+ image = image.convert('RGB')
40
+
41
+ image_np = np.array(image)
42
+ if len(image_np.shape) != 3 or image_np.shape[2] != 3:
43
+ raise ValueError("Изображение должно быть в формате RGB (H, W, 3)")
44
+
45
+ inputs = processor(images=image, return_tensors="pt")
46
+ with torch.no_grad():
47
+ outputs = model(**inputs)
48
+
49
+ logits = outputs.logits
50
+ upsampled_logits = F.interpolate(
51
+ logits,
52
+ size=image.size[::-1],
53
+ mode="bilinear",
54
+ align_corners=False,
55
+ )
56
+
57
+ pred_seg = upsampled_logits.argmax(dim=1)[0]
58
+ mask = (pred_seg == label).numpy().astype('uint8') * 255
59
+ mask_image = Image.fromarray(mask)
60
+
61
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
62
+ mask_image.save(output_path)
63
+
64
+ return mask_image
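A usage sketch matching the call in app.py. Label 4 is the "Upper-clothes" class in the published label map of mattmdjaga/segformer_b2_clothes (an assumption worth checking against the model card if a different garment class is needed):

```python
# "person.png" is a placeholder path.
mask = generate_clothing_mask("person.png", label=4, output_path="./output/mask.png")
# The mask has the same resolution as the input: white pixels mark the
# upper-clothes region that the inpainting pipeline is allowed to repaint.
print(mask.size)
```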
lib/pose.py ADDED
@@ -0,0 +1,36 @@
1
+ from controlnet_aux import OpenposeDetector
2
+ from PIL import Image
3
+ import torch
4
+
5
+
6
+ def generate_openpose(
7
+ input_image_path: str,
8
+ output_image_path: str = None,
9
+ device: str = "cuda" if torch.cuda.is_available() else "cpu",
10
+ show_result: bool = False
11
+ ) -> Image.Image:
12
+ """
13
+ Generates an OpenPose pose map from the input image.
14
+
15
+ Parameters:
16
+ input_image_path (str): Path to the source image
17
+ output_image_path (str, optional): Path for saving the result; if None, nothing is saved.
18
+ device (str): Device used for processing ('cuda' or 'cpu')
19
+ show_result (bool): Whether to display the result immediately
20
+
21
+ Returns:
22
+ Image.Image: Image containing the OpenPose pose map
23
+ """
24
+ openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet").to(device)
25
+
26
+ image = Image.open(input_image_path).convert("RGB")
27
+
28
+ openpose_map = openpose(image)
29
+
30
+ if output_image_path:
31
+ openpose_map.save(output_image_path)
32
+
33
+ if show_result:
34
+ openpose_map.show()
35
+
36
+ return openpose_map  # return the generated pose map, not the original input image
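Usage sketch (placeholder paths); the detector weights are downloaded from lllyasviel/ControlNet on first use:

```python
# "person.png" is a placeholder path.
pose_map = generate_openpose("person.png", output_image_path="./output/pose.png")
# pose_map is a PIL image with the detected body skeleton rendered on a black background.
pose_map.size
```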
preprocess/__init__.py ADDED
File without changes
preprocess/humanparsing/__init__.py ADDED
File without changes
preprocess/humanparsing/datasets/__init__.py ADDED
File without changes
preprocess/humanparsing/datasets/datasets.py ADDED
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : datasets.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import numpy as np
16
+ import random
17
+ import torch
18
+ import cv2
19
+ from torch.utils import data
20
+ from utils.transforms import get_affine_transform
21
+
22
+
23
+ class LIPDataSet(data.Dataset):
24
+ def __init__(self, root, dataset, crop_size=[473, 473], scale_factor=0.25,
25
+ rotation_factor=30, ignore_label=255, transform=None):
26
+ self.root = root
27
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
28
+ self.crop_size = np.asarray(crop_size)
29
+ self.ignore_label = ignore_label
30
+ self.scale_factor = scale_factor
31
+ self.rotation_factor = rotation_factor
32
+ self.flip_prob = 0.5
33
+ self.transform = transform
34
+ self.dataset = dataset
35
+
36
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
37
+ train_list = [i_id.strip() for i_id in open(list_path)]
38
+
39
+ self.train_list = train_list
40
+ self.number_samples = len(self.train_list)
41
+
42
+ def __len__(self):
43
+ return self.number_samples
44
+
45
+ def _box2cs(self, box):
46
+ x, y, w, h = box[:4]
47
+ return self._xywh2cs(x, y, w, h)
48
+
49
+ def _xywh2cs(self, x, y, w, h):
50
+ center = np.zeros((2), dtype=np.float32)
51
+ center[0] = x + w * 0.5
52
+ center[1] = y + h * 0.5
53
+ if w > self.aspect_ratio * h:
54
+ h = w * 1.0 / self.aspect_ratio
55
+ elif w < self.aspect_ratio * h:
56
+ w = h * self.aspect_ratio
57
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
58
+ return center, scale
59
+
60
+ def __getitem__(self, index):
61
+ train_item = self.train_list[index]
62
+
63
+ im_path = os.path.join(self.root, self.dataset + '_images', train_item + '.jpg')
64
+ parsing_anno_path = os.path.join(self.root, self.dataset + '_segmentations', train_item + '.png')
65
+
66
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
67
+ h, w, _ = im.shape
68
+ parsing_anno = np.zeros((h, w), dtype=np.long)
69
+
70
+ # Get person center and scale
71
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
72
+ r = 0
73
+
74
+ if self.dataset != 'test':
75
+ # Get pose annotation
76
+ parsing_anno = cv2.imread(parsing_anno_path, cv2.IMREAD_GRAYSCALE)
77
+ if self.dataset == 'train' or self.dataset == 'trainval':
78
+ sf = self.scale_factor
79
+ rf = self.rotation_factor
80
+ s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
81
+ r = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if random.random() <= 0.6 else 0
82
+
83
+ if random.random() <= self.flip_prob:
84
+ im = im[:, ::-1, :]
85
+ parsing_anno = parsing_anno[:, ::-1]
86
+ person_center[0] = im.shape[1] - person_center[0] - 1
87
+ right_idx = [15, 17, 19]
88
+ left_idx = [14, 16, 18]
89
+ for i in range(0, 3):
90
+ right_pos = np.where(parsing_anno == right_idx[i])
91
+ left_pos = np.where(parsing_anno == left_idx[i])
92
+ parsing_anno[right_pos[0], right_pos[1]] = left_idx[i]
93
+ parsing_anno[left_pos[0], left_pos[1]] = right_idx[i]
94
+
95
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
96
+ input = cv2.warpAffine(
97
+ im,
98
+ trans,
99
+ (int(self.crop_size[1]), int(self.crop_size[0])),
100
+ flags=cv2.INTER_LINEAR,
101
+ borderMode=cv2.BORDER_CONSTANT,
102
+ borderValue=(0, 0, 0))
103
+
104
+ if self.transform:
105
+ input = self.transform(input)
106
+
107
+ meta = {
108
+ 'name': train_item,
109
+ 'center': person_center,
110
+ 'height': h,
111
+ 'width': w,
112
+ 'scale': s,
113
+ 'rotation': r
114
+ }
115
+
116
+ if self.dataset == 'val' or self.dataset == 'test':
117
+ return input, meta
118
+ else:
119
+ label_parsing = cv2.warpAffine(
120
+ parsing_anno,
121
+ trans,
122
+ (int(self.crop_size[1]), int(self.crop_size[0])),
123
+ flags=cv2.INTER_NEAREST,
124
+ borderMode=cv2.BORDER_CONSTANT,
125
+ borderValue=(255))
126
+
127
+ label_parsing = torch.from_numpy(label_parsing)
128
+
129
+ return input, label_parsing, meta
130
+
131
+
132
+ class LIPDataValSet(data.Dataset):
133
+ def __init__(self, root, dataset='val', crop_size=[473, 473], transform=None, flip=False):
134
+ self.root = root
135
+ self.crop_size = crop_size
136
+ self.transform = transform
137
+ self.flip = flip
138
+ self.dataset = dataset
139
+ self.root = root
140
+ self.aspect_ratio = crop_size[1] * 1.0 / crop_size[0]
141
+ self.crop_size = np.asarray(crop_size)
142
+
143
+ list_path = os.path.join(self.root, self.dataset + '_id.txt')
144
+ val_list = [i_id.strip() for i_id in open(list_path)]
145
+
146
+ self.val_list = val_list
147
+ self.number_samples = len(self.val_list)
148
+
149
+ def __len__(self):
150
+ return len(self.val_list)
151
+
152
+ def _box2cs(self, box):
153
+ x, y, w, h = box[:4]
154
+ return self._xywh2cs(x, y, w, h)
155
+
156
+ def _xywh2cs(self, x, y, w, h):
157
+ center = np.zeros((2), dtype=np.float32)
158
+ center[0] = x + w * 0.5
159
+ center[1] = y + h * 0.5
160
+ if w > self.aspect_ratio * h:
161
+ h = w * 1.0 / self.aspect_ratio
162
+ elif w < self.aspect_ratio * h:
163
+ w = h * self.aspect_ratio
164
+ scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
165
+
166
+ return center, scale
167
+
168
+ def __getitem__(self, index):
169
+ val_item = self.val_list[index]
170
+ # Load training image
171
+ im_path = os.path.join(self.root, self.dataset + '_images', val_item + '.jpg')
172
+ im = cv2.imread(im_path, cv2.IMREAD_COLOR)
173
+ h, w, _ = im.shape
174
+ # Get person center and scale
175
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
176
+ r = 0
177
+ trans = get_affine_transform(person_center, s, r, self.crop_size)
178
+ input = cv2.warpAffine(
179
+ im,
180
+ trans,
181
+ (int(self.crop_size[1]), int(self.crop_size[0])),
182
+ flags=cv2.INTER_LINEAR,
183
+ borderMode=cv2.BORDER_CONSTANT,
184
+ borderValue=(0, 0, 0))
185
+ input = self.transform(input)
186
+ flip_input = input.flip(dims=[-1])
187
+ if self.flip:
188
+ batch_input_im = torch.stack([input, flip_input])
189
+ else:
190
+ batch_input_im = input
191
+
192
+ meta = {
193
+ 'name': val_item,
194
+ 'center': person_center,
195
+ 'height': h,
196
+ 'width': w,
197
+ 'scale': s,
198
+ 'rotation': r
199
+ }
200
+
201
+ return batch_input_im, meta
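The `_xywh2cs` helper pads the person box to the crop's aspect ratio before the affine warp. A standalone sketch of the same arithmetic (not part of the original file):

```python
import numpy as np

def xywh2cs(x, y, w, h, aspect_ratio=1.0):
    # Mirrors LIPDataSet._xywh2cs: box center plus a scale padded to the
    # target aspect ratio (a 473x473 crop gives aspect_ratio == 1.0).
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    if w > aspect_ratio * h:
        h = w / aspect_ratio
    elif w < aspect_ratio * h:
        w = h * aspect_ratio
    return center, np.array([w, h], dtype=np.float32)

# A 600x400 image (box [0, 0, 599, 399]) becomes center (299.5, 199.5), scale (599, 599).
print(xywh2cs(0, 0, 599, 399))
```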
preprocess/humanparsing/datasets/simple_extractor_dataset.py ADDED
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : dataset.py
8
+ @Time : 8/30/19 9:12 PM
9
+ @Desc : Dataset Definition
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import pdb
16
+
17
+ import cv2
18
+ import numpy as np
19
+ from PIL import Image
20
+ from torch.utils import data
21
+ from utils.transforms import get_affine_transform
22
+
23
+
24
+ class SimpleFolderDataset(data.Dataset):
25
+ def __init__(self, root, input_size=[512, 512], transform=None):
26
+ self.root = root
27
+ self.input_size = input_size
28
+ self.transform = transform
29
+ self.aspect_ratio = input_size[1] * 1.0 / input_size[0]
30
+ self.input_size = np.asarray(input_size)
31
+ self.is_pil_image = False
32
+ if isinstance(root, Image.Image):
33
+ self.file_list = [root]
34
+ self.is_pil_image = True
35
+ elif os.path.isfile(root):
36
+ self.file_list = [os.path.basename(root)]
37
+ self.root = os.path.dirname(root)
38
+ else:
39
+ self.file_list = os.listdir(self.root)
40
+
41
+ def __len__(self):
42
+ return len(self.file_list)
43
+
44
+ def _box2cs(self, box):
45
+ x, y, w, h = box[:4]
46
+ return self._xywh2cs(x, y, w, h)
47
+
48
+ def _xywh2cs(self, x, y, w, h):
49
+ center = np.zeros((2), dtype=np.float32)
50
+ center[0] = x + w * 0.5
51
+ center[1] = y + h * 0.5
52
+ if w > self.aspect_ratio * h:
53
+ h = w * 1.0 / self.aspect_ratio
54
+ elif w < self.aspect_ratio * h:
55
+ w = h * self.aspect_ratio
56
+ scale = np.array([w, h], dtype=np.float32)
57
+ return center, scale
58
+
59
+ def __getitem__(self, index):
60
+ if self.is_pil_image:
61
+ img = np.asarray(self.file_list[index])[:, :, [2, 1, 0]]
62
+ else:
63
+ img_name = self.file_list[index]
64
+ img_path = os.path.join(self.root, img_name)
65
+ img = cv2.imread(img_path, cv2.IMREAD_COLOR)
66
+ h, w, _ = img.shape
67
+
68
+ # Get person center and scale
69
+ person_center, s = self._box2cs([0, 0, w - 1, h - 1])
70
+ r = 0
71
+ trans = get_affine_transform(person_center, s, r, self.input_size)
72
+ input = cv2.warpAffine(
73
+ img,
74
+ trans,
75
+ (int(self.input_size[1]), int(self.input_size[0])),
76
+ flags=cv2.INTER_LINEAR,
77
+ borderMode=cv2.BORDER_CONSTANT,
78
+ borderValue=(0, 0, 0))
79
+
80
+ input = self.transform(input)
81
+ meta = {
82
+ 'center': person_center,
83
+ 'height': h,
84
+ 'width': w,
85
+ 'scale': s,
86
+ 'rotation': r
87
+ }
88
+
89
+ return input, meta
preprocess/humanparsing/datasets/target_generation.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+
5
+ def generate_edge_tensor(label, edge_width=3):
6
+ label = label.type(torch.cuda.FloatTensor)
7
+ if len(label.shape) == 2:
8
+ label = label.unsqueeze(0)
9
+ n, h, w = label.shape
10
+ edge = torch.zeros(label.shape, dtype=torch.float).cuda()
11
+ # right
12
+ edge_right = edge[:, 1:h, :]
13
+ edge_right[(label[:, 1:h, :] != label[:, :h - 1, :]) & (label[:, 1:h, :] != 255)
14
+ & (label[:, :h - 1, :] != 255)] = 1
15
+
16
+ # up
17
+ edge_up = edge[:, :, :w - 1]
18
+ edge_up[(label[:, :, :w - 1] != label[:, :, 1:w])
19
+ & (label[:, :, :w - 1] != 255)
20
+ & (label[:, :, 1:w] != 255)] = 1
21
+
22
+ # upright
23
+ edge_upright = edge[:, :h - 1, :w - 1]
24
+ edge_upright[(label[:, :h - 1, :w - 1] != label[:, 1:h, 1:w])
25
+ & (label[:, :h - 1, :w - 1] != 255)
26
+ & (label[:, 1:h, 1:w] != 255)] = 1
27
+
28
+ # bottomright
29
+ edge_bottomright = edge[:, :h - 1, 1:w]
30
+ edge_bottomright[(label[:, :h - 1, 1:w] != label[:, 1:h, :w - 1])
31
+ & (label[:, :h - 1, 1:w] != 255)
32
+ & (label[:, 1:h, :w - 1] != 255)] = 1
33
+
34
+ kernel = torch.ones((1, 1, edge_width, edge_width), dtype=torch.float).cuda()
35
+ with torch.no_grad():
36
+ edge = edge.unsqueeze(1)
37
+ edge = F.conv2d(edge, kernel, stride=1, padding=1)
38
+ edge[edge!=0] = 1
39
+ edge = edge.squeeze()
40
+ return edge
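`generate_edge_tensor` marks pixels where neighbouring labels differ (ignoring the 255 ignore-label) and then thickens the boundary with a small all-ones convolution. A toy sketch of the expected behaviour (requires a CUDA device, since the function allocates CUDA tensors internally):

```python
import torch

# Two flat regions split down the middle of a 6x6 label map.
label = torch.zeros(6, 6, dtype=torch.long)
label[:, 3:] = 1

edge = generate_edge_tensor(label.cuda(), edge_width=3)
# edge is a 6x6 tensor of 0s and 1s, with 1s along (and around)
# the boundary between the two classes.
print(edge.cpu())
```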
preprocess/humanparsing/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .bn import ABN, InPlaceABN, InPlaceABNSync
2
+ from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
3
+ from .misc import GlobalAvgPool2d, SingleGPU
4
+ from .residual import IdentityResidualBlock
5
+ from .dense import DenseModule
preprocess/humanparsing/modules/bn.py ADDED
@@ -0,0 +1,132 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ try:
6
+ from queue import Queue
7
+ except ImportError:
8
+ from Queue import Queue
9
+
10
+ from .functions import *
11
+
12
+
13
+ class ABN(nn.Module):
14
+ """Activated Batch Normalization
15
+
16
+ This gathers a `BatchNorm2d` and an activation function in a single module
17
+ """
18
+
19
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
20
+ """Creates an Activated Batch Normalization module
21
+
22
+ Parameters
23
+ ----------
24
+ num_features : int
25
+ Number of feature channels in the input and output.
26
+ eps : float
27
+ Small constant to prevent numerical issues.
28
+ momentum : float
29
+ Momentum factor applied to compute running statistics as.
30
+ affine : bool
31
+ If `True` apply learned scale and shift transformation after normalization.
32
+ activation : str
33
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
34
+ slope : float
35
+ Negative slope for the `leaky_relu` activation.
36
+ """
37
+ super(ABN, self).__init__()
38
+ self.num_features = num_features
39
+ self.affine = affine
40
+ self.eps = eps
41
+ self.momentum = momentum
42
+ self.activation = activation
43
+ self.slope = slope
44
+ if self.affine:
45
+ self.weight = nn.Parameter(torch.ones(num_features))
46
+ self.bias = nn.Parameter(torch.zeros(num_features))
47
+ else:
48
+ self.register_parameter('weight', None)
49
+ self.register_parameter('bias', None)
50
+ self.register_buffer('running_mean', torch.zeros(num_features))
51
+ self.register_buffer('running_var', torch.ones(num_features))
52
+ self.reset_parameters()
53
+
54
+ def reset_parameters(self):
55
+ nn.init.constant_(self.running_mean, 0)
56
+ nn.init.constant_(self.running_var, 1)
57
+ if self.affine:
58
+ nn.init.constant_(self.weight, 1)
59
+ nn.init.constant_(self.bias, 0)
60
+
61
+ def forward(self, x):
62
+ x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias,
63
+ self.training, self.momentum, self.eps)
64
+
65
+ if self.activation == ACT_RELU:
66
+ return functional.relu(x, inplace=True)
67
+ elif self.activation == ACT_LEAKY_RELU:
68
+ return functional.leaky_relu(x, negative_slope=self.slope, inplace=True)
69
+ elif self.activation == ACT_ELU:
70
+ return functional.elu(x, inplace=True)
71
+ else:
72
+ return x
73
+
74
+ def __repr__(self):
75
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
76
+ ' affine={affine}, activation={activation}'
77
+ if self.activation == "leaky_relu":
78
+ rep += ', slope={slope})'
79
+ else:
80
+ rep += ')'
81
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
82
+
83
+
84
+ class InPlaceABN(ABN):
85
+ """InPlace Activated Batch Normalization"""
86
+
87
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01):
88
+ """Creates an InPlace Activated Batch Normalization module
89
+
90
+ Parameters
91
+ ----------
92
+ num_features : int
93
+ Number of feature channels in the input and output.
94
+ eps : float
95
+ Small constant to prevent numerical issues.
96
+ momentum : float
97
+ Momentum factor applied to compute running statistics as.
98
+ affine : bool
99
+ If `True` apply learned scale and shift transformation after normalization.
100
+ activation : str
101
+ Name of the activation functions, one of: `leaky_relu`, `elu` or `none`.
102
+ slope : float
103
+ Negative slope for the `leaky_relu` activation.
104
+ """
105
+ super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope)
106
+
107
+ def forward(self, x):
108
+ x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var,
109
+ self.training, self.momentum, self.eps, self.activation, self.slope)
110
+ return x
111
+
112
+
113
+ class InPlaceABNSync(ABN):
114
+ """InPlace Activated Batch Normalization with cross-GPU synchronization
115
+ This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`.
116
+ """
117
+
118
+ def forward(self, x):
119
+ x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var,
120
+ self.training, self.momentum, self.eps, self.activation, self.slope)
121
+ return x
122
+
123
+ def __repr__(self):
124
+ rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \
125
+ ' affine={affine}, activation={activation}'
126
+ if self.activation == "leaky_relu":
127
+ rep += ', slope={slope})'
128
+ else:
129
+ rep += ')'
130
+ return rep.format(name=self.__class__.__name__, **self.__dict__)
131
+
132
+
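`ABN` itself is plain PyTorch (BatchNorm2d followed by the chosen activation), so it slots in as a drop-in replacement for a BatchNorm + activation pair; note that importing this module still triggers the JIT build of the inplace_abn extension through `from .functions import *`. A small sketch:

```python
import torch
import torch.nn as nn

# Drop-in replacement for BatchNorm2d + LeakyReLU.
block = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1, bias=False),
    ABN(16, activation="leaky_relu", slope=0.01),
)
out = block(torch.randn(2, 3, 32, 32))
print(out.shape)  # torch.Size([2, 16, 32, 32])
```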
preprocess/humanparsing/modules/deeplab.py ADDED
@@ -0,0 +1,84 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as functional
4
+
5
+ from models._util import try_index
6
+ from .bn import ABN
7
+
8
+
9
+ class DeeplabV3(nn.Module):
10
+ def __init__(self,
11
+ in_channels,
12
+ out_channels,
13
+ hidden_channels=256,
14
+ dilations=(12, 24, 36),
15
+ norm_act=ABN,
16
+ pooling_size=None):
17
+ super(DeeplabV3, self).__init__()
18
+ self.pooling_size = pooling_size
19
+
20
+ self.map_convs = nn.ModuleList([
21
+ nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
22
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]),
23
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]),
24
+ nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2])
25
+ ])
26
+ self.map_bn = norm_act(hidden_channels * 4)
27
+
28
+ self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False)
29
+ self.global_pooling_bn = norm_act(hidden_channels)
30
+
31
+ self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False)
32
+ self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False)
33
+ self.red_bn = norm_act(out_channels)
34
+
35
+ self.reset_parameters(self.map_bn.activation, self.map_bn.slope)
36
+
37
+ def reset_parameters(self, activation, slope):
38
+ gain = nn.init.calculate_gain(activation, slope)
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ nn.init.xavier_normal_(m.weight.data, gain)
42
+ if hasattr(m, "bias") and m.bias is not None:
43
+ nn.init.constant_(m.bias, 0)
44
+ elif isinstance(m, ABN):
45
+ if hasattr(m, "weight") and m.weight is not None:
46
+ nn.init.constant_(m.weight, 1)
47
+ if hasattr(m, "bias") and m.bias is not None:
48
+ nn.init.constant_(m.bias, 0)
49
+
50
+ def forward(self, x):
51
+ # Map convolutions
52
+ out = torch.cat([m(x) for m in self.map_convs], dim=1)
53
+ out = self.map_bn(out)
54
+ out = self.red_conv(out)
55
+
56
+ # Global pooling
57
+ pool = self._global_pooling(x)
58
+ pool = self.global_pooling_conv(pool)
59
+ pool = self.global_pooling_bn(pool)
60
+ pool = self.pool_red_conv(pool)
61
+ if self.training or self.pooling_size is None:
62
+ pool = pool.repeat(1, 1, x.size(2), x.size(3))
63
+
64
+ out += pool
65
+ out = self.red_bn(out)
66
+ return out
67
+
68
+ def _global_pooling(self, x):
69
+ if self.training or self.pooling_size is None:
70
+ pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1)
71
+ pool = pool.view(x.size(0), x.size(1), 1, 1)
72
+ else:
73
+ pooling_size = (min(try_index(self.pooling_size, 0), x.shape[2]),
74
+ min(try_index(self.pooling_size, 1), x.shape[3]))
75
+ padding = (
76
+ (pooling_size[1] - 1) // 2,
77
+ (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1,
78
+ (pooling_size[0] - 1) // 2,
79
+ (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1
80
+ )
81
+
82
+ pool = functional.avg_pool2d(x, pooling_size, stride=1)
83
+ pool = functional.pad(pool, pad=padding, mode="replicate")
84
+ return pool
preprocess/humanparsing/modules/dense.py ADDED
@@ -0,0 +1,42 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .bn import ABN
7
+
8
+
9
+ class DenseModule(nn.Module):
10
+ def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1):
11
+ super(DenseModule, self).__init__()
12
+ self.in_channels = in_channels
13
+ self.growth = growth
14
+ self.layers = layers
15
+
16
+ self.convs1 = nn.ModuleList()
17
+ self.convs3 = nn.ModuleList()
18
+ for i in range(self.layers):
19
+ self.convs1.append(nn.Sequential(OrderedDict([
20
+ ("bn", norm_act(in_channels)),
21
+ ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False))
22
+ ])))
23
+ self.convs3.append(nn.Sequential(OrderedDict([
24
+ ("bn", norm_act(self.growth * bottleneck_factor)),
25
+ ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False,
26
+ dilation=dilation))
27
+ ])))
28
+ in_channels += self.growth
29
+
30
+ @property
31
+ def out_channels(self):
32
+ return self.in_channels + self.growth * self.layers
33
+
34
+ def forward(self, x):
35
+ inputs = [x]
36
+ for i in range(self.layers):
37
+ x = torch.cat(inputs, dim=1)
38
+ x = self.convs1[i](x)
39
+ x = self.convs3[i](x)
40
+ inputs += [x]
41
+
42
+ return torch.cat(inputs, dim=1)
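`DenseModule` concatenates each layer's `growth` new channels onto its input, so the output width is `in_channels + growth * layers`. A sketch using the plain `ABN` norm defined above:

```python
import torch

dm = DenseModule(64, growth=16, layers=3, norm_act=ABN)
y = dm(torch.randn(1, 64, 28, 28))
print(y.shape, dm.out_channels)  # torch.Size([1, 112, 28, 28]) 112
```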
preprocess/humanparsing/modules/functions.py ADDED
@@ -0,0 +1,245 @@
1
+ import pdb
2
+ from os import path
3
+ import torch
4
+ import torch.distributed as dist
5
+ import torch.autograd as autograd
6
+ import torch.cuda.comm as comm
7
+ from torch.autograd.function import once_differentiable
8
+ from torch.utils.cpp_extension import load
9
+
10
+ _src_path = path.join(path.dirname(path.abspath(__file__)), "src")
11
+ _backend = load(name="inplace_abn",
12
+ extra_cflags=["-O3"],
13
+ sources=[path.join(_src_path, f) for f in [
14
+ "inplace_abn.cpp",
15
+ "inplace_abn_cpu.cpp",
16
+ "inplace_abn_cuda.cu",
17
+ "inplace_abn_cuda_half.cu"
18
+ ]],
19
+ extra_cuda_cflags=["--expt-extended-lambda"])
20
+
21
+ # Activation names
22
+ ACT_RELU = "relu"
23
+ ACT_LEAKY_RELU = "leaky_relu"
24
+ ACT_ELU = "elu"
25
+ ACT_NONE = "none"
26
+
27
+
28
+ def _check(fn, *args, **kwargs):
29
+ success = fn(*args, **kwargs)
30
+ if not success:
31
+ raise RuntimeError("CUDA Error encountered in {}".format(fn))
32
+
33
+
34
+ def _broadcast_shape(x):
35
+ out_size = []
36
+ for i, s in enumerate(x.size()):
37
+ if i != 1:
38
+ out_size.append(1)
39
+ else:
40
+ out_size.append(s)
41
+ return out_size
42
+
43
+
44
+ def _reduce(x):
45
+ if len(x.size()) == 2:
46
+ return x.sum(dim=0)
47
+ else:
48
+ n, c = x.size()[0:2]
49
+ return x.contiguous().view((n, c, -1)).sum(2).sum(0)
50
+
51
+
52
+ def _count_samples(x):
53
+ count = 1
54
+ for i, s in enumerate(x.size()):
55
+ if i != 1:
56
+ count *= s
57
+ return count
58
+
59
+
60
+ def _act_forward(ctx, x):
61
+ if ctx.activation == ACT_LEAKY_RELU:
62
+ _backend.leaky_relu_forward(x, ctx.slope)
63
+ elif ctx.activation == ACT_ELU:
64
+ _backend.elu_forward(x)
65
+ elif ctx.activation == ACT_NONE:
66
+ pass
67
+
68
+
69
+ def _act_backward(ctx, x, dx):
70
+ if ctx.activation == ACT_LEAKY_RELU:
71
+ _backend.leaky_relu_backward(x, dx, ctx.slope)
72
+ elif ctx.activation == ACT_ELU:
73
+ _backend.elu_backward(x, dx)
74
+ elif ctx.activation == ACT_NONE:
75
+ pass
76
+
77
+
78
+ class InPlaceABN(autograd.Function):
79
+ @staticmethod
80
+ def forward(ctx, x, weight, bias, running_mean, running_var,
81
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01):
82
+ # Save context
83
+ ctx.training = training
84
+ ctx.momentum = momentum
85
+ ctx.eps = eps
86
+ ctx.activation = activation
87
+ ctx.slope = slope
88
+ ctx.affine = weight is not None and bias is not None
89
+
90
+ # Prepare inputs
91
+ count = _count_samples(x)
92
+ x = x.contiguous()
93
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
94
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
95
+
96
+ if ctx.training:
97
+ mean, var = _backend.mean_var(x)
98
+
99
+ # Update running stats
100
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
101
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1))
102
+
103
+ # Mark in-place modified tensors
104
+ ctx.mark_dirty(x, running_mean, running_var)
105
+ else:
106
+ mean, var = running_mean.contiguous(), running_var.contiguous()
107
+ ctx.mark_dirty(x)
108
+
109
+ # BN forward + activation
110
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
111
+ _act_forward(ctx, x)
112
+
113
+ # Output
114
+ ctx.var = var
115
+ ctx.save_for_backward(x, var, weight, bias)
116
+ ctx.mark_non_differentiable(running_mean, running_var)
117
+ return x, running_mean, running_var
118
+
119
+ @staticmethod
120
+ @once_differentiable
121
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
122
+ z, var, weight, bias = ctx.saved_tensors
123
+ dz = dz.contiguous()
124
+
125
+ # Undo activation
126
+ _act_backward(ctx, z, dz)
127
+
128
+ if ctx.training:
129
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
130
+ else:
131
+ # TODO: implement simplified CUDA backward for inference mode
132
+ edz = dz.new_zeros(dz.size(1))
133
+ eydz = dz.new_zeros(dz.size(1))
134
+
135
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
136
+ # dweight = eydz * weight.sign() if ctx.affine else None
137
+ dweight = eydz if ctx.affine else None
138
+ if dweight is not None:
139
+ dweight[weight < 0] *= -1
140
+ dbias = edz if ctx.affine else None
141
+
142
+ return dx, dweight, dbias, None, None, None, None, None, None, None
143
+
144
+
145
+ class InPlaceABNSync(autograd.Function):
146
+ @classmethod
147
+ def forward(cls, ctx, x, weight, bias, running_mean, running_var,
148
+ training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True):
149
+ # Save context
150
+ ctx.training = training
151
+ ctx.momentum = momentum
152
+ ctx.eps = eps
153
+ ctx.activation = activation
154
+ ctx.slope = slope
155
+ ctx.affine = weight is not None and bias is not None
156
+
157
+ # Prepare inputs
158
+ ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1
159
+
160
+ # count = _count_samples(x)
161
+ batch_size = x.new_tensor([x.shape[0]], dtype=torch.long)
162
+
163
+ x = x.contiguous()
164
+ weight = weight.contiguous() if ctx.affine else x.new_empty(0)
165
+ bias = bias.contiguous() if ctx.affine else x.new_empty(0)
166
+
167
+ if ctx.training:
168
+ mean, var = _backend.mean_var(x)
169
+ if ctx.world_size > 1:
170
+ # get global batch size
171
+ if equal_batches:
172
+ batch_size *= ctx.world_size
173
+ else:
174
+ dist.all_reduce(batch_size, dist.ReduceOp.SUM)
175
+
176
+ ctx.factor = x.shape[0] / float(batch_size.item())
177
+
178
+ mean_all = mean.clone() * ctx.factor
179
+ dist.all_reduce(mean_all, dist.ReduceOp.SUM)
180
+
181
+ var_all = (var + (mean - mean_all) ** 2) * ctx.factor
182
+ dist.all_reduce(var_all, dist.ReduceOp.SUM)
183
+
184
+ mean = mean_all
185
+ var = var_all
186
+
187
+ # Update running stats
188
+ running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean)
189
+ count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1]
190
+ running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1)))
191
+
192
+ # Mark in-place modified tensors
193
+ ctx.mark_dirty(x, running_mean, running_var)
194
+ else:
195
+ mean, var = running_mean.contiguous(), running_var.contiguous()
196
+ ctx.mark_dirty(x)
197
+
198
+ # BN forward + activation
199
+ _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps)
200
+ _act_forward(ctx, x)
201
+
202
+ # Output
203
+ ctx.var = var
204
+ ctx.save_for_backward(x, var, weight, bias)
205
+ ctx.mark_non_differentiable(running_mean, running_var)
206
+ return x, running_mean, running_var
207
+
208
+ @staticmethod
209
+ @once_differentiable
210
+ def backward(ctx, dz, _drunning_mean, _drunning_var):
211
+ z, var, weight, bias = ctx.saved_tensors
212
+ dz = dz.contiguous()
213
+
214
+ # Undo activation
215
+ _act_backward(ctx, z, dz)
216
+
217
+ if ctx.training:
218
+ edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps)
219
+ edz_local = edz.clone()
220
+ eydz_local = eydz.clone()
221
+
222
+ if ctx.world_size > 1:
223
+ edz *= ctx.factor
224
+ dist.all_reduce(edz, dist.ReduceOp.SUM)
225
+
226
+ eydz *= ctx.factor
227
+ dist.all_reduce(eydz, dist.ReduceOp.SUM)
228
+ else:
229
+ edz_local = edz = dz.new_zeros(dz.size(1))
230
+ eydz_local = eydz = dz.new_zeros(dz.size(1))
231
+
232
+ dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps)
233
+ # dweight = eydz_local * weight.sign() if ctx.affine else None
234
+ dweight = eydz_local if ctx.affine else None
235
+ if dweight is not None:
236
+ dweight[weight < 0] *= -1
237
+ dbias = edz_local if ctx.affine else None
238
+
239
+ return dx, dweight, dbias, None, None, None, None, None, None, None
240
+
241
+
242
+ inplace_abn = InPlaceABN.apply
243
+ inplace_abn_sync = InPlaceABNSync.apply
244
+
245
+ __all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"]
preprocess/humanparsing/modules/misc.py ADDED
@@ -0,0 +1,21 @@
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ class GlobalAvgPool2d(nn.Module):
6
+ def __init__(self):
7
+ """Global average pooling over the input's spatial dimensions"""
8
+ super(GlobalAvgPool2d, self).__init__()
9
+
10
+ def forward(self, inputs):
11
+ in_size = inputs.size()
12
+ return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2)
13
+
14
+ class SingleGPU(nn.Module):
15
+ def __init__(self, module):
16
+ super(SingleGPU, self).__init__()
17
+ self.module=module
18
+
19
+ def forward(self, input):
20
+ return self.module(input.cuda(non_blocking=True))
21
+
preprocess/humanparsing/modules/residual.py ADDED
1
+ from collections import OrderedDict
2
+
3
+ import torch.nn as nn
4
+
5
+ from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE
6
+ import torch.nn.functional as functional
7
+
8
+
9
+ class ResidualBlock(nn.Module):
10
+ """Configurable residual block
11
+
12
+ Parameters
13
+ ----------
14
+ in_channels : int
15
+ Number of input channels.
16
+ channels : list of int
17
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
18
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
19
+ `3 x 3` then `1 x 1` convolutions.
20
+ stride : int
21
+ Stride of the first `3 x 3` convolution
22
+ dilation : int
23
+ Dilation to apply to the `3 x 3` convolutions.
24
+ groups : int
25
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
26
+ bottleneck blocks.
27
+ norm_act : callable
28
+ Function to create normalization / activation Module.
29
+ dropout: callable
30
+ Function to create Dropout Module.
31
+ """
32
+
33
+ def __init__(self,
34
+ in_channels,
35
+ channels,
36
+ stride=1,
37
+ dilation=1,
38
+ groups=1,
39
+ norm_act=ABN,
40
+ dropout=None):
41
+ super(ResidualBlock, self).__init__()
42
+
43
+ # Check parameters for inconsistencies
44
+ if len(channels) != 2 and len(channels) != 3:
45
+ raise ValueError("channels must contain either two or three values")
46
+ if len(channels) == 2 and groups != 1:
47
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
48
+
49
+ is_bottleneck = len(channels) == 3
50
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
51
+
52
+ if not is_bottleneck:
53
+ bn2 = norm_act(channels[1])
54
+ bn2.activation = ACT_NONE
55
+ layers = [
56
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
57
+ dilation=dilation)),
58
+ ("bn1", norm_act(channels[0])),
59
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
60
+ dilation=dilation)),
61
+ ("bn2", bn2)
62
+ ]
63
+ if dropout is not None:
64
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
65
+ else:
66
+ bn3 = norm_act(channels[2])
67
+ bn3.activation = ACT_NONE
68
+ layers = [
69
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)),
70
+ ("bn1", norm_act(channels[0])),
71
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False,
72
+ groups=groups, dilation=dilation)),
73
+ ("bn2", norm_act(channels[1])),
74
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)),
75
+ ("bn3", bn3)
76
+ ]
77
+ if dropout is not None:
78
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
79
+ self.convs = nn.Sequential(OrderedDict(layers))
80
+
81
+ if need_proj_conv:
82
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
83
+ self.proj_bn = norm_act(channels[-1])
84
+ self.proj_bn.activation = ACT_NONE
85
+
86
+ def forward(self, x):
87
+ if hasattr(self, "proj_conv"):
88
+ residual = self.proj_conv(x)
89
+ residual = self.proj_bn(residual)
90
+ else:
91
+ residual = x
92
+ x = self.convs(x) + residual
93
+
94
+ if self.convs.bn1.activation == ACT_LEAKY_RELU:
95
+ return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True)
96
+ elif self.convs.bn1.activation == ACT_ELU:
97
+ return functional.elu(x, inplace=True)
98
+ else:
99
+ return x
100
+
101
+
102
+ class IdentityResidualBlock(nn.Module):
103
+ def __init__(self,
104
+ in_channels,
105
+ channels,
106
+ stride=1,
107
+ dilation=1,
108
+ groups=1,
109
+ norm_act=ABN,
110
+ dropout=None):
111
+ """Configurable identity-mapping residual block
112
+
113
+ Parameters
114
+ ----------
115
+ in_channels : int
116
+ Number of input channels.
117
+ channels : list of int
118
+ Number of channels in the internal feature maps. Can either have two or three elements: if three construct
119
+ a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then
120
+ `3 x 3` then `1 x 1` convolutions.
121
+ stride : int
122
+ Stride of the first `3 x 3` convolution
123
+ dilation : int
124
+ Dilation to apply to the `3 x 3` convolutions.
125
+ groups : int
126
+ Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with
127
+ bottleneck blocks.
128
+ norm_act : callable
129
+ Function to create normalization / activation Module.
130
+ dropout: callable
131
+ Function to create Dropout Module.
132
+ """
133
+ super(IdentityResidualBlock, self).__init__()
134
+
135
+ # Check parameters for inconsistencies
136
+ if len(channels) != 2 and len(channels) != 3:
137
+ raise ValueError("channels must contain either two or three values")
138
+ if len(channels) == 2 and groups != 1:
139
+ raise ValueError("groups > 1 are only valid if len(channels) == 3")
140
+
141
+ is_bottleneck = len(channels) == 3
142
+ need_proj_conv = stride != 1 or in_channels != channels[-1]
143
+
144
+ self.bn1 = norm_act(in_channels)
145
+ if not is_bottleneck:
146
+ layers = [
147
+ ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False,
148
+ dilation=dilation)),
149
+ ("bn2", norm_act(channels[0])),
150
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
151
+ dilation=dilation))
152
+ ]
153
+ if dropout is not None:
154
+ layers = layers[0:2] + [("dropout", dropout())] + layers[2:]
155
+ else:
156
+ layers = [
157
+ ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)),
158
+ ("bn2", norm_act(channels[0])),
159
+ ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False,
160
+ groups=groups, dilation=dilation)),
161
+ ("bn3", norm_act(channels[1])),
162
+ ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False))
163
+ ]
164
+ if dropout is not None:
165
+ layers = layers[0:4] + [("dropout", dropout())] + layers[4:]
166
+ self.convs = nn.Sequential(OrderedDict(layers))
167
+
168
+ if need_proj_conv:
169
+ self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False)
170
+
171
+ def forward(self, x):
172
+ if hasattr(self, "proj_conv"):
173
+ bn1 = self.bn1(x)
174
+ shortcut = self.proj_conv(bn1)
175
+ else:
176
+ shortcut = x.clone()
177
+ bn1 = self.bn1(x)
178
+
179
+ out = self.convs(bn1)
180
+ out.add_(shortcut)
181
+
182
+ return out
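`IdentityResidualBlock` with three channel values builds the pre-activation bottleneck variant; a 1x1 projection shortcut is added automatically whenever the stride or channel count changes. Sketch (again with the plain `ABN` norm):

```python
import torch

# 64 -> 128 channels forces the projection shortcut.
block = IdentityResidualBlock(64, [32, 32, 128], stride=1, norm_act=ABN)
out = block(torch.randn(1, 64, 56, 56))
print(out.shape)  # torch.Size([1, 128, 56, 56])
```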
preprocess/humanparsing/modules/src/checks.h ADDED
@@ -0,0 +1,15 @@
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
preprocess/humanparsing/modules/src/inplace_abn.cpp ADDED
@@ -0,0 +1,95 @@
1
+ #include <torch/extension.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "inplace_abn.h"
6
+
7
+ std::vector<at::Tensor> mean_var(at::Tensor x) {
8
+ if (x.is_cuda()) {
9
+ if (x.type().scalarType() == at::ScalarType::Half) {
10
+ return mean_var_cuda_h(x);
11
+ } else {
12
+ return mean_var_cuda(x);
13
+ }
14
+ } else {
15
+ return mean_var_cpu(x);
16
+ }
17
+ }
18
+
19
+ at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
20
+ bool affine, float eps) {
21
+ if (x.is_cuda()) {
22
+ if (x.type().scalarType() == at::ScalarType::Half) {
23
+ return forward_cuda_h(x, mean, var, weight, bias, affine, eps);
24
+ } else {
25
+ return forward_cuda(x, mean, var, weight, bias, affine, eps);
26
+ }
27
+ } else {
28
+ return forward_cpu(x, mean, var, weight, bias, affine, eps);
29
+ }
30
+ }
31
+
32
+ std::vector<at::Tensor> edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
33
+ bool affine, float eps) {
34
+ if (z.is_cuda()) {
35
+ if (z.type().scalarType() == at::ScalarType::Half) {
36
+ return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps);
37
+ } else {
38
+ return edz_eydz_cuda(z, dz, weight, bias, affine, eps);
39
+ }
40
+ } else {
41
+ return edz_eydz_cpu(z, dz, weight, bias, affine, eps);
42
+ }
43
+ }
44
+
45
+ at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
46
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
47
+ if (z.is_cuda()) {
48
+ if (z.type().scalarType() == at::ScalarType::Half) {
49
+ return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps);
50
+ } else {
51
+ return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps);
52
+ }
53
+ } else {
54
+ return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps);
55
+ }
56
+ }
57
+
58
+ void leaky_relu_forward(at::Tensor z, float slope) {
59
+ at::leaky_relu_(z, slope);
60
+ }
61
+
62
+ void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) {
63
+ if (z.is_cuda()) {
64
+ if (z.type().scalarType() == at::ScalarType::Half) {
65
+ return leaky_relu_backward_cuda_h(z, dz, slope);
66
+ } else {
67
+ return leaky_relu_backward_cuda(z, dz, slope);
68
+ }
69
+ } else {
70
+ return leaky_relu_backward_cpu(z, dz, slope);
71
+ }
72
+ }
73
+
74
+ void elu_forward(at::Tensor z) {
75
+ at::elu_(z);
76
+ }
77
+
78
+ void elu_backward(at::Tensor z, at::Tensor dz) {
79
+ if (z.is_cuda()) {
80
+ return elu_backward_cuda(z, dz);
81
+ } else {
82
+ return elu_backward_cpu(z, dz);
83
+ }
84
+ }
85
+
86
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
87
+ m.def("mean_var", &mean_var, "Mean and variance computation");
88
+ m.def("forward", &forward, "In-place forward computation");
89
+ m.def("edz_eydz", &edz_eydz, "First part of backward computation");
90
+ m.def("backward", &backward, "Second part of backward computation");
91
+ m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation");
92
+ m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion");
93
+ m.def("elu_forward", &elu_forward, "Elu forward computation");
94
+ m.def("elu_backward", &elu_backward, "Elu backward computation and inversion");
95
+ }
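
These bindings are what the Python side imports as the in-place ABN backend. As a hedged sketch (the extension name, source paths, and compiler flag below are assumptions based on this commit's layout and on how the upstream inplace_abn project builds its backend), the sources can be JIT-compiled with `torch.utils.cpp_extension.load`:

```python
import os
from torch.utils.cpp_extension import load

_SRC = "preprocess/humanparsing/modules/src"  # assumed path within this repo
_backend = load(
    name="inplace_abn",
    sources=[os.path.join(_SRC, f) for f in (
        "inplace_abn.cpp",
        "inplace_abn_cpu.cpp",
        "inplace_abn_cuda.cu",
        "inplace_abn_cuda_half.cu",
    )],
    extra_cuda_cflags=["--expt-extended-lambda"],  # needed for the thrust device lambdas
)

# The compiled module exposes exactly the functions registered above:
# _backend.mean_var(x), _backend.forward(...), _backend.edz_eydz(...),
# _backend.backward(...), _backend.leaky_relu_forward(...), _backend.elu_forward(...)
```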
preprocess/humanparsing/modules/src/inplace_abn.h ADDED
@@ -0,0 +1,88 @@
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ #include <vector>
6
+
7
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x);
8
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x);
9
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x);
10
+
11
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
12
+ bool affine, float eps);
13
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
14
+ bool affine, float eps);
15
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
16
+ bool affine, float eps);
17
+
18
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
19
+ bool affine, float eps);
20
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
21
+ bool affine, float eps);
22
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
23
+ bool affine, float eps);
24
+
25
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
26
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
27
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
28
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
29
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
30
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps);
31
+
32
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope);
33
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope);
34
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope);
35
+
36
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz);
37
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz);
38
+
39
+ static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) {
40
+ num = x.size(0);
41
+ chn = x.size(1);
42
+ sp = 1;
43
+ for (int64_t i = 2; i < x.ndimension(); ++i)
44
+ sp *= x.size(i);
45
+ }
46
+
47
+ /*
48
+ * Specialized CUDA reduction functions for BN
49
+ */
50
+ #ifdef __CUDACC__
51
+
52
+ #include "utils/cuda.cuh"
53
+
54
+ template <typename T, typename Op>
55
+ __device__ T reduce(Op op, int plane, int N, int S) {
56
+ T sum = (T)0;
57
+ for (int batch = 0; batch < N; ++batch) {
58
+ for (int x = threadIdx.x; x < S; x += blockDim.x) {
59
+ sum += op(batch, plane, x);
60
+ }
61
+ }
62
+
63
+ // sum over NumThreads within a warp
64
+ sum = warpSum(sum);
65
+
66
+ // 'transpose', and reduce within warp again
67
+ __shared__ T shared[32];
68
+ __syncthreads();
69
+ if (threadIdx.x % WARP_SIZE == 0) {
70
+ shared[threadIdx.x / WARP_SIZE] = sum;
71
+ }
72
+ if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
73
+ // zero out the other entries in shared
74
+ shared[threadIdx.x] = (T)0;
75
+ }
76
+ __syncthreads();
77
+ if (threadIdx.x / WARP_SIZE == 0) {
78
+ sum = warpSum(shared[threadIdx.x]);
79
+ if (threadIdx.x == 0) {
80
+ shared[0] = sum;
81
+ }
82
+ }
83
+ __syncthreads();
84
+
85
+ // Everyone picks it up, should be broadcast into the whole gradInput
86
+ return shared[0];
87
+ }
88
+ #endif
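
Everything declared above reduces per channel: `get_dims` folds a tensor of shape `(N, C, *spatial)` into `num`, `chn`, `sp`, and the `reduce` helper sums an operator over the batch and spatial positions of one channel plane. A plain-PyTorch reference for the same statistics and normalization, handy for numerically checking the kernels (an illustration, not part of the extension):

```python
import torch

def mean_var_reference(x: torch.Tensor):
    # Per-channel mean/variance over batch and spatial dims, biased like the kernels.
    dims = [0] + list(range(2, x.dim()))
    return x.mean(dim=dims), x.var(dim=dims, unbiased=False)

def forward_reference(x, mean, var, weight, bias, affine=True, eps=1e-5):
    # Mirrors forward_cpu/forward_cuda: scale by rsqrt(var + eps) * (|weight| + eps), then shift.
    shape = [1, -1] + [1] * (x.dim() - 2)
    gamma = weight.abs() + eps if affine else torch.ones_like(var)
    y = (x - mean.view(shape)) * (torch.rsqrt(var + eps) * gamma).view(shape)
    return y + bias.view(shape) if affine else y
```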
preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp ADDED
@@ -0,0 +1,119 @@
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <vector>
4
+
5
+ #include "utils/checks.h"
6
+ #include "inplace_abn.h"
7
+
8
+ at::Tensor reduce_sum(at::Tensor x) {
9
+ if (x.ndimension() == 2) {
10
+ return x.sum(0);
11
+ } else {
12
+ auto x_view = x.view({x.size(0), x.size(1), -1});
13
+ return x_view.sum(-1).sum(0);
14
+ }
15
+ }
16
+
17
+ at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
18
+ if (x.ndimension() == 2) {
19
+ return v;
20
+ } else {
21
+ std::vector<int64_t> broadcast_size = {1, -1};
22
+ for (int64_t i = 2; i < x.ndimension(); ++i)
23
+ broadcast_size.push_back(1);
24
+
25
+ return v.view(broadcast_size);
26
+ }
27
+ }
28
+
29
+ int64_t count(at::Tensor x) {
30
+ int64_t count = x.size(0);
31
+ for (int64_t i = 2; i < x.ndimension(); ++i)
32
+ count *= x.size(i);
33
+
34
+ return count;
35
+ }
36
+
37
+ at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) {
38
+ if (affine) {
39
+ return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z);
40
+ } else {
41
+ return z;
42
+ }
43
+ }
44
+
45
+ std::vector<at::Tensor> mean_var_cpu(at::Tensor x) {
46
+ auto num = count(x);
47
+ auto mean = reduce_sum(x) / num;
48
+ auto diff = x - broadcast_to(mean, x);
49
+ auto var = reduce_sum(diff.pow(2)) / num;
50
+
51
+ return {mean, var};
52
+ }
53
+
54
+ at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
55
+ bool affine, float eps) {
56
+ auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var);
57
+ auto mul = at::rsqrt(var + eps) * gamma;
58
+
59
+ x.sub_(broadcast_to(mean, x));
60
+ x.mul_(broadcast_to(mul, x));
61
+ if (affine) x.add_(broadcast_to(bias, x));
62
+
63
+ return x;
64
+ }
65
+
66
+ std::vector<at::Tensor> edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
67
+ bool affine, float eps) {
68
+ auto edz = reduce_sum(dz);
69
+ auto y = invert_affine(z, weight, bias, affine, eps);
70
+ auto eydz = reduce_sum(y * dz);
71
+
72
+ return {edz, eydz};
73
+ }
74
+
75
+ at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
76
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
77
+ auto y = invert_affine(z, weight, bias, affine, eps);
78
+ auto mul = affine ? at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps);
79
+
80
+ auto num = count(z);
81
+ auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz);
82
+ return dx;
83
+ }
84
+
85
+ void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) {
86
+ CHECK_CPU_INPUT(z);
87
+ CHECK_CPU_INPUT(dz);
88
+
89
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] {
90
+ int64_t count = z.numel();
91
+ auto *_z = z.data<scalar_t>();
92
+ auto *_dz = dz.data<scalar_t>();
93
+
94
+ for (int64_t i = 0; i < count; ++i) {
95
+ if (_z[i] < 0) {
96
+ _z[i] *= 1 / slope;
97
+ _dz[i] *= slope;
98
+ }
99
+ }
100
+ }));
101
+ }
102
+
103
+ void elu_backward_cpu(at::Tensor z, at::Tensor dz) {
104
+ CHECK_CPU_INPUT(z);
105
+ CHECK_CPU_INPUT(dz);
106
+
107
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] {
108
+ int64_t count = z.numel();
109
+ auto *_z = z.data<scalar_t>();
110
+ auto *_dz = dz.data<scalar_t>();
111
+
112
+ for (int64_t i = 0; i < count; ++i) {
113
+ if (_z[i] < 0) {
114
+ _z[i] = log1p(_z[i]);
115
+ _dz[i] *= (_z[i] + 1.f);
116
+ }
117
+ }
118
+ }));
119
+ }
preprocess/humanparsing/modules/src/inplace_abn_cuda.cu ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <thrust/device_ptr.h>
4
+ #include <thrust/transform.h>
5
+
6
+ #include <vector>
7
+
8
+ #include "utils/checks.h"
9
+ #include "utils/cuda.cuh"
10
+ #include "inplace_abn.h"
11
+
12
+ #include <ATen/cuda/CUDAContext.h>
13
+
14
+ // Operations for reduce
15
+ template<typename T>
16
+ struct SumOp {
17
+ __device__ SumOp(const T *t, int c, int s)
18
+ : tensor(t), chn(c), sp(s) {}
19
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
20
+ return tensor[(batch * chn + plane) * sp + n];
21
+ }
22
+ const T *tensor;
23
+ const int chn;
24
+ const int sp;
25
+ };
26
+
27
+ template<typename T>
28
+ struct VarOp {
29
+ __device__ VarOp(T m, const T *t, int c, int s)
30
+ : mean(m), tensor(t), chn(c), sp(s) {}
31
+ __device__ __forceinline__ T operator()(int batch, int plane, int n) {
32
+ T val = tensor[(batch * chn + plane) * sp + n];
33
+ return (val - mean) * (val - mean);
34
+ }
35
+ const T mean;
36
+ const T *tensor;
37
+ const int chn;
38
+ const int sp;
39
+ };
40
+
41
+ template<typename T>
42
+ struct GradOp {
43
+ __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s)
44
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
45
+ __device__ __forceinline__ Pair<T> operator()(int batch, int plane, int n) {
46
+ T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight;
47
+ T _dz = dz[(batch * chn + plane) * sp + n];
48
+ return Pair<T>(_dz, _y * _dz);
49
+ }
50
+ const T weight;
51
+ const T bias;
52
+ const T *z;
53
+ const T *dz;
54
+ const int chn;
55
+ const int sp;
56
+ };
57
+
58
+ /***********
59
+ * mean_var
60
+ ***********/
61
+
62
+ template<typename T>
63
+ __global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) {
64
+ int plane = blockIdx.x;
65
+ T norm = T(1) / T(num * sp);
66
+
67
+ T _mean = reduce<T, SumOp<T>>(SumOp<T>(x, chn, sp), plane, num, sp) * norm;
68
+ __syncthreads();
69
+ T _var = reduce<T, VarOp<T>>(VarOp<T>(_mean, x, chn, sp), plane, num, sp) * norm;
70
+
71
+ if (threadIdx.x == 0) {
72
+ mean[plane] = _mean;
73
+ var[plane] = _var;
74
+ }
75
+ }
76
+
77
+ std::vector<at::Tensor> mean_var_cuda(at::Tensor x) {
78
+ CHECK_CUDA_INPUT(x);
79
+
80
+ // Extract dimensions
81
+ int64_t num, chn, sp;
82
+ get_dims(x, num, chn, sp);
83
+
84
+ // Prepare output tensors
85
+ auto mean = at::empty({chn}, x.options());
86
+ auto var = at::empty({chn}, x.options());
87
+
88
+ // Run kernel
89
+ dim3 blocks(chn);
90
+ dim3 threads(getNumThreads(sp));
91
+ auto stream = at::cuda::getCurrentCUDAStream();
92
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] {
93
+ mean_var_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
94
+ x.data<scalar_t>(),
95
+ mean.data<scalar_t>(),
96
+ var.data<scalar_t>(),
97
+ num, chn, sp);
98
+ }));
99
+
100
+ return {mean, var};
101
+ }
102
+
103
+ /**********
104
+ * forward
105
+ **********/
106
+
107
+ template<typename T>
108
+ __global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias,
109
+ bool affine, float eps, int num, int chn, int sp) {
110
+ int plane = blockIdx.x;
111
+
112
+ T _mean = mean[plane];
113
+ T _var = var[plane];
114
+ T _weight = affine ? abs(weight[plane]) + eps : T(1);
115
+ T _bias = affine ? bias[plane] : T(0);
116
+
117
+ T mul = rsqrt(_var + eps) * _weight;
118
+
119
+ for (int batch = 0; batch < num; ++batch) {
120
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
121
+ T _x = x[(batch * chn + plane) * sp + n];
122
+ T _y = (_x - _mean) * mul + _bias;
123
+
124
+ x[(batch * chn + plane) * sp + n] = _y;
125
+ }
126
+ }
127
+ }
128
+
129
+ at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
130
+ bool affine, float eps) {
131
+ CHECK_CUDA_INPUT(x);
132
+ CHECK_CUDA_INPUT(mean);
133
+ CHECK_CUDA_INPUT(var);
134
+ CHECK_CUDA_INPUT(weight);
135
+ CHECK_CUDA_INPUT(bias);
136
+
137
+ // Extract dimensions
138
+ int64_t num, chn, sp;
139
+ get_dims(x, num, chn, sp);
140
+
141
+ // Run kernel
142
+ dim3 blocks(chn);
143
+ dim3 threads(getNumThreads(sp));
144
+ auto stream = at::cuda::getCurrentCUDAStream();
145
+ AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] {
146
+ forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
147
+ x.data<scalar_t>(),
148
+ mean.data<scalar_t>(),
149
+ var.data<scalar_t>(),
150
+ weight.data<scalar_t>(),
151
+ bias.data<scalar_t>(),
152
+ affine, eps, num, chn, sp);
153
+ }));
154
+
155
+ return x;
156
+ }
157
+
158
+ /***********
159
+ * edz_eydz
160
+ ***********/
161
+
162
+ template<typename T>
163
+ __global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias,
164
+ T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) {
165
+ int plane = blockIdx.x;
166
+
167
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
168
+ T _bias = affine ? bias[plane] : 0.f;
169
+
170
+ Pair<T> res = reduce<Pair<T>, GradOp<T>>(GradOp<T>(_weight, _bias, z, dz, chn, sp), plane, num, sp);
171
+ __syncthreads();
172
+
173
+ if (threadIdx.x == 0) {
174
+ edz[plane] = res.v1;
175
+ eydz[plane] = res.v2;
176
+ }
177
+ }
178
+
179
+ std::vector<at::Tensor> edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
180
+ bool affine, float eps) {
181
+ CHECK_CUDA_INPUT(z);
182
+ CHECK_CUDA_INPUT(dz);
183
+ CHECK_CUDA_INPUT(weight);
184
+ CHECK_CUDA_INPUT(bias);
185
+
186
+ // Extract dimensions
187
+ int64_t num, chn, sp;
188
+ get_dims(z, num, chn, sp);
189
+
190
+ auto edz = at::empty({chn}, z.options());
191
+ auto eydz = at::empty({chn}, z.options());
192
+
193
+ // Run kernel
194
+ dim3 blocks(chn);
195
+ dim3 threads(getNumThreads(sp));
196
+ auto stream = at::cuda::getCurrentCUDAStream();
197
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] {
198
+ edz_eydz_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
199
+ z.data<scalar_t>(),
200
+ dz.data<scalar_t>(),
201
+ weight.data<scalar_t>(),
202
+ bias.data<scalar_t>(),
203
+ edz.data<scalar_t>(),
204
+ eydz.data<scalar_t>(),
205
+ affine, eps, num, chn, sp);
206
+ }));
207
+
208
+ return {edz, eydz};
209
+ }
210
+
211
+ /***********
212
+ * backward
213
+ ***********/
214
+
215
+ template<typename T>
216
+ __global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz,
217
+ const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) {
218
+ int plane = blockIdx.x;
219
+
220
+ T _weight = affine ? abs(weight[plane]) + eps : 1.f;
221
+ T _bias = affine ? bias[plane] : 0.f;
222
+ T _var = var[plane];
223
+ T _edz = edz[plane];
224
+ T _eydz = eydz[plane];
225
+
226
+ T _mul = _weight * rsqrt(_var + eps);
227
+ T count = T(num * sp);
228
+
229
+ for (int batch = 0; batch < num; ++batch) {
230
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
231
+ T _dz = dz[(batch * chn + plane) * sp + n];
232
+ T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight;
233
+
234
+ dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul;
235
+ }
236
+ }
237
+ }
238
+
239
+ at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
240
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
241
+ CHECK_CUDA_INPUT(z);
242
+ CHECK_CUDA_INPUT(dz);
243
+ CHECK_CUDA_INPUT(var);
244
+ CHECK_CUDA_INPUT(weight);
245
+ CHECK_CUDA_INPUT(bias);
246
+ CHECK_CUDA_INPUT(edz);
247
+ CHECK_CUDA_INPUT(eydz);
248
+
249
+ // Extract dimensions
250
+ int64_t num, chn, sp;
251
+ get_dims(z, num, chn, sp);
252
+
253
+ auto dx = at::zeros_like(z);
254
+
255
+ // Run kernel
256
+ dim3 blocks(chn);
257
+ dim3 threads(getNumThreads(sp));
258
+ auto stream = at::cuda::getCurrentCUDAStream();
259
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] {
260
+ backward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
261
+ z.data<scalar_t>(),
262
+ dz.data<scalar_t>(),
263
+ var.data<scalar_t>(),
264
+ weight.data<scalar_t>(),
265
+ bias.data<scalar_t>(),
266
+ edz.data<scalar_t>(),
267
+ eydz.data<scalar_t>(),
268
+ dx.data<scalar_t>(),
269
+ affine, eps, num, chn, sp);
270
+ }));
271
+
272
+ return dx;
273
+ }
274
+
275
+ /**************
276
+ * activations
277
+ **************/
278
+
279
+ template<typename T>
280
+ inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) {
281
+ // Create thrust pointers
282
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
283
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
284
+
285
+ auto stream = at::cuda::getCurrentCUDAStream();
286
+ thrust::transform_if(thrust::cuda::par.on(stream),
287
+ th_dz, th_dz + count, th_z, th_dz,
288
+ [slope] __device__ (const T& dz) { return dz * slope; },
289
+ [] __device__ (const T& z) { return z < 0; });
290
+ thrust::transform_if(thrust::cuda::par.on(stream),
291
+ th_z, th_z + count, th_z,
292
+ [slope] __device__ (const T& z) { return z / slope; },
293
+ [] __device__ (const T& z) { return z < 0; });
294
+ }
295
+
296
+ void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) {
297
+ CHECK_CUDA_INPUT(z);
298
+ CHECK_CUDA_INPUT(dz);
299
+
300
+ int64_t count = z.numel();
301
+
302
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
303
+ leaky_relu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), slope, count);
304
+ }));
305
+ }
306
+
307
+ template<typename T>
308
+ inline void elu_backward_impl(T *z, T *dz, int64_t count) {
309
+ // Create thrust pointers
310
+ thrust::device_ptr<T> th_z = thrust::device_pointer_cast(z);
311
+ thrust::device_ptr<T> th_dz = thrust::device_pointer_cast(dz);
312
+
313
+ auto stream = at::cuda::getCurrentCUDAStream();
314
+ thrust::transform_if(thrust::cuda::par.on(stream),
315
+ th_dz, th_dz + count, th_z, th_z, th_dz,
316
+ [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); },
317
+ [] __device__ (const T& z) { return z < 0; });
318
+ thrust::transform_if(thrust::cuda::par.on(stream),
319
+ th_z, th_z + count, th_z,
320
+ [] __device__ (const T& z) { return log1p(z); },
321
+ [] __device__ (const T& z) { return z < 0; });
322
+ }
323
+
324
+ void elu_backward_cuda(at::Tensor z, at::Tensor dz) {
325
+ CHECK_CUDA_INPUT(z);
326
+ CHECK_CUDA_INPUT(dz);
327
+
328
+ int64_t count = z.numel();
329
+
330
+ AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] {
331
+ elu_backward_impl<scalar_t>(z.data<scalar_t>(), dz.data<scalar_t>(), count);
332
+ }));
333
+ }
preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu ADDED
@@ -0,0 +1,275 @@
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+ #include <cuda_fp16.h>
4
+
5
+ #include <vector>
6
+
7
+ #include "utils/checks.h"
8
+ #include "utils/cuda.cuh"
9
+ #include "inplace_abn.h"
10
+
11
+ #include <ATen/cuda/CUDAContext.h>
12
+
13
+ // Operations for reduce
14
+ struct SumOpH {
15
+ __device__ SumOpH(const half *t, int c, int s)
16
+ : tensor(t), chn(c), sp(s) {}
17
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
18
+ return __half2float(tensor[(batch * chn + plane) * sp + n]);
19
+ }
20
+ const half *tensor;
21
+ const int chn;
22
+ const int sp;
23
+ };
24
+
25
+ struct VarOpH {
26
+ __device__ VarOpH(float m, const half *t, int c, int s)
27
+ : mean(m), tensor(t), chn(c), sp(s) {}
28
+ __device__ __forceinline__ float operator()(int batch, int plane, int n) {
29
+ const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]);
30
+ return (t - mean) * (t - mean);
31
+ }
32
+ const float mean;
33
+ const half *tensor;
34
+ const int chn;
35
+ const int sp;
36
+ };
37
+
38
+ struct GradOpH {
39
+ __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s)
40
+ : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {}
41
+ __device__ __forceinline__ Pair<float> operator()(int batch, int plane, int n) {
42
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight;
43
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
44
+ return Pair<float>(_dz, _y * _dz);
45
+ }
46
+ const float weight;
47
+ const float bias;
48
+ const half *z;
49
+ const half *dz;
50
+ const int chn;
51
+ const int sp;
52
+ };
53
+
54
+ /***********
55
+ * mean_var
56
+ ***********/
57
+
58
+ __global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) {
59
+ int plane = blockIdx.x;
60
+ float norm = 1.f / static_cast<float>(num * sp);
61
+
62
+ float _mean = reduce<float, SumOpH>(SumOpH(x, chn, sp), plane, num, sp) * norm;
63
+ __syncthreads();
64
+ float _var = reduce<float, VarOpH>(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm;
65
+
66
+ if (threadIdx.x == 0) {
67
+ mean[plane] = _mean;
68
+ var[plane] = _var;
69
+ }
70
+ }
71
+
72
+ std::vector<at::Tensor> mean_var_cuda_h(at::Tensor x) {
73
+ CHECK_CUDA_INPUT(x);
74
+
75
+ // Extract dimensions
76
+ int64_t num, chn, sp;
77
+ get_dims(x, num, chn, sp);
78
+
79
+ // Prepare output tensors
80
+ auto mean = at::empty({chn},x.options().dtype(at::kFloat));
81
+ auto var = at::empty({chn},x.options().dtype(at::kFloat));
82
+
83
+ // Run kernel
84
+ dim3 blocks(chn);
85
+ dim3 threads(getNumThreads(sp));
86
+ auto stream = at::cuda::getCurrentCUDAStream();
87
+ mean_var_kernel_h<<<blocks, threads, 0, stream>>>(
88
+ reinterpret_cast<half*>(x.data<at::Half>()),
89
+ mean.data<float>(),
90
+ var.data<float>(),
91
+ num, chn, sp);
92
+
93
+ return {mean, var};
94
+ }
95
+
96
+ /**********
97
+ * forward
98
+ **********/
99
+
100
+ __global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias,
101
+ bool affine, float eps, int num, int chn, int sp) {
102
+ int plane = blockIdx.x;
103
+
104
+ const float _mean = mean[plane];
105
+ const float _var = var[plane];
106
+ const float _weight = affine ? abs(weight[plane]) + eps : 1.f;
107
+ const float _bias = affine ? bias[plane] : 0.f;
108
+
109
+ const float mul = rsqrt(_var + eps) * _weight;
110
+
111
+ for (int batch = 0; batch < num; ++batch) {
112
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
113
+ half *x_ptr = x + (batch * chn + plane) * sp + n;
114
+ float _x = __half2float(*x_ptr);
115
+ float _y = (_x - _mean) * mul + _bias;
116
+
117
+ *x_ptr = __float2half(_y);
118
+ }
119
+ }
120
+ }
121
+
122
+ at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias,
123
+ bool affine, float eps) {
124
+ CHECK_CUDA_INPUT(x);
125
+ CHECK_CUDA_INPUT(mean);
126
+ CHECK_CUDA_INPUT(var);
127
+ CHECK_CUDA_INPUT(weight);
128
+ CHECK_CUDA_INPUT(bias);
129
+
130
+ // Extract dimensions
131
+ int64_t num, chn, sp;
132
+ get_dims(x, num, chn, sp);
133
+
134
+ // Run kernel
135
+ dim3 blocks(chn);
136
+ dim3 threads(getNumThreads(sp));
137
+ auto stream = at::cuda::getCurrentCUDAStream();
138
+ forward_kernel_h<<<blocks, threads, 0, stream>>>(
139
+ reinterpret_cast<half*>(x.data<at::Half>()),
140
+ mean.data<float>(),
141
+ var.data<float>(),
142
+ weight.data<float>(),
143
+ bias.data<float>(),
144
+ affine, eps, num, chn, sp);
145
+
146
+ return x;
147
+ }
148
+
149
+ __global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias,
150
+ float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) {
151
+ int plane = blockIdx.x;
152
+
153
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
154
+ float _bias = affine ? bias[plane] : 0.f;
155
+
156
+ Pair<float> res = reduce<Pair<float>, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp);
157
+ __syncthreads();
158
+
159
+ if (threadIdx.x == 0) {
160
+ edz[plane] = res.v1;
161
+ eydz[plane] = res.v2;
162
+ }
163
+ }
164
+
165
+ std::vector<at::Tensor> edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias,
166
+ bool affine, float eps) {
167
+ CHECK_CUDA_INPUT(z);
168
+ CHECK_CUDA_INPUT(dz);
169
+ CHECK_CUDA_INPUT(weight);
170
+ CHECK_CUDA_INPUT(bias);
171
+
172
+ // Extract dimensions
173
+ int64_t num, chn, sp;
174
+ get_dims(z, num, chn, sp);
175
+
176
+ auto edz = at::empty({chn},z.options().dtype(at::kFloat));
177
+ auto eydz = at::empty({chn},z.options().dtype(at::kFloat));
178
+
179
+ // Run kernel
180
+ dim3 blocks(chn);
181
+ dim3 threads(getNumThreads(sp));
182
+ auto stream = at::cuda::getCurrentCUDAStream();
183
+ edz_eydz_kernel_h<<<blocks, threads, 0, stream>>>(
184
+ reinterpret_cast<half*>(z.data<at::Half>()),
185
+ reinterpret_cast<half*>(dz.data<at::Half>()),
186
+ weight.data<float>(),
187
+ bias.data<float>(),
188
+ edz.data<float>(),
189
+ eydz.data<float>(),
190
+ affine, eps, num, chn, sp);
191
+
192
+ return {edz, eydz};
193
+ }
194
+
195
+ __global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz,
196
+ const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) {
197
+ int plane = blockIdx.x;
198
+
199
+ float _weight = affine ? abs(weight[plane]) + eps : 1.f;
200
+ float _bias = affine ? bias[plane] : 0.f;
201
+ float _var = var[plane];
202
+ float _edz = edz[plane];
203
+ float _eydz = eydz[plane];
204
+
205
+ float _mul = _weight * rsqrt(_var + eps);
206
+ float count = float(num * sp);
207
+
208
+ for (int batch = 0; batch < num; ++batch) {
209
+ for (int n = threadIdx.x; n < sp; n += blockDim.x) {
210
+ float _dz = __half2float(dz[(batch * chn + plane) * sp + n]);
211
+ float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight;
212
+
213
+ dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul);
214
+ }
215
+ }
216
+ }
217
+
218
+ at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias,
219
+ at::Tensor edz, at::Tensor eydz, bool affine, float eps) {
220
+ CHECK_CUDA_INPUT(z);
221
+ CHECK_CUDA_INPUT(dz);
222
+ CHECK_CUDA_INPUT(var);
223
+ CHECK_CUDA_INPUT(weight);
224
+ CHECK_CUDA_INPUT(bias);
225
+ CHECK_CUDA_INPUT(edz);
226
+ CHECK_CUDA_INPUT(eydz);
227
+
228
+ // Extract dimensions
229
+ int64_t num, chn, sp;
230
+ get_dims(z, num, chn, sp);
231
+
232
+ auto dx = at::zeros_like(z);
233
+
234
+ // Run kernel
235
+ dim3 blocks(chn);
236
+ dim3 threads(getNumThreads(sp));
237
+ auto stream = at::cuda::getCurrentCUDAStream();
238
+ backward_kernel_h<<<blocks, threads, 0, stream>>>(
239
+ reinterpret_cast<half*>(z.data<at::Half>()),
240
+ reinterpret_cast<half*>(dz.data<at::Half>()),
241
+ var.data<float>(),
242
+ weight.data<float>(),
243
+ bias.data<float>(),
244
+ edz.data<float>(),
245
+ eydz.data<float>(),
246
+ reinterpret_cast<half*>(dx.data<at::Half>()),
247
+ affine, eps, num, chn, sp);
248
+
249
+ return dx;
250
+ }
251
+
252
+ __global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) {
253
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){
254
+ float _z = __half2float(z[i]);
255
+ if (_z < 0) {
256
+ dz[i] = __float2half(__half2float(dz[i]) * slope);
257
+ z[i] = __float2half(_z / slope);
258
+ }
259
+ }
260
+ }
261
+
262
+ void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) {
263
+ CHECK_CUDA_INPUT(z);
264
+ CHECK_CUDA_INPUT(dz);
265
+
266
+ int64_t count = z.numel();
267
+ dim3 threads(getNumThreads(count));
268
+ dim3 blocks = (count + threads.x - 1) / threads.x;
269
+ auto stream = at::cuda::getCurrentCUDAStream();
270
+ leaky_relu_backward_impl_h<<<blocks, threads, 0, stream>>>(
271
+ reinterpret_cast<half*>(z.data<at::Half>()),
272
+ reinterpret_cast<half*>(dz.data<at::Half>()),
273
+ slope, count);
274
+ }
275
+
preprocess/humanparsing/modules/src/utils/checks.h ADDED
@@ -0,0 +1,15 @@
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ // Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT
6
+ #ifndef AT_CHECK
7
+ #define AT_CHECK AT_ASSERT
8
+ #endif
9
+
10
+ #define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor")
11
+ #define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor")
12
+ #define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous")
13
+
14
+ #define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
15
+ #define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x)
preprocess/humanparsing/modules/src/utils/common.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/ATen.h>
4
+
5
+ /*
6
+ * Functions to share code between CPU and GPU
7
+ */
8
+
9
+ #ifdef __CUDACC__
10
+ // CUDA versions
11
+
12
+ #define HOST_DEVICE __host__ __device__
13
+ #define INLINE_HOST_DEVICE __host__ __device__ inline
14
+ #define FLOOR(x) floor(x)
15
+
16
+ #if __CUDA_ARCH__ >= 600
17
+ // Recent compute capabilities have block-level atomicAdd for all data types, so we use that
18
+ #define ACCUM(x,y) atomicAdd_block(&(x),(y))
19
+ #else
20
+ // Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float
21
+ // and use the known atomicCAS-based implementation for double
22
+ template<typename data_t>
23
+ __device__ inline data_t atomic_add(data_t *address, data_t val) {
24
+ return atomicAdd(address, val);
25
+ }
26
+
27
+ template<>
28
+ __device__ inline double atomic_add(double *address, double val) {
29
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
30
+ unsigned long long int old = *address_as_ull, assumed;
31
+ do {
32
+ assumed = old;
33
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
34
+ } while (assumed != old);
35
+ return __longlong_as_double(old);
36
+ }
37
+
38
+ #define ACCUM(x,y) atomic_add(&(x),(y))
39
+ #endif // #if __CUDA_ARCH__ >= 600
40
+
41
+ #else
42
+ // CPU versions
43
+
44
+ #define HOST_DEVICE
45
+ #define INLINE_HOST_DEVICE inline
46
+ #define FLOOR(x) std::floor(x)
47
+ #define ACCUM(x,y) (x) += (y)
48
+
49
+ #endif // #ifdef __CUDACC__
preprocess/humanparsing/modules/src/utils/cuda.cuh ADDED
@@ -0,0 +1,71 @@
 
 
 
 
1
+ #pragma once
2
+
3
+ /*
4
+ * General settings and functions
5
+ */
6
+ const int WARP_SIZE = 32;
7
+ const int MAX_BLOCK_SIZE = 1024;
8
+
9
+ static int getNumThreads(int nElem) {
10
+ int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE};
11
+ for (int i = 0; i < 6; ++i) {
12
+ if (nElem <= threadSizes[i]) {
13
+ return threadSizes[i];
14
+ }
15
+ }
16
+ return MAX_BLOCK_SIZE;
17
+ }
18
+
19
+ /*
20
+ * Reduction utilities
21
+ */
22
+ template <typename T>
23
+ __device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize,
24
+ unsigned int mask = 0xffffffff) {
25
+ #if CUDART_VERSION >= 9000
26
+ return __shfl_xor_sync(mask, value, laneMask, width);
27
+ #else
28
+ return __shfl_xor(value, laneMask, width);
29
+ #endif
30
+ }
31
+
32
+ __device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); }
33
+
34
+ template<typename T>
35
+ struct Pair {
36
+ T v1, v2;
37
+ __device__ Pair() {}
38
+ __device__ Pair(T _v1, T _v2) : v1(_v1), v2(_v2) {}
39
+ __device__ Pair(T v) : v1(v), v2(v) {}
40
+ __device__ Pair(int v) : v1(v), v2(v) {}
41
+ __device__ Pair &operator+=(const Pair<T> &a) {
42
+ v1 += a.v1;
43
+ v2 += a.v2;
44
+ return *this;
45
+ }
46
+ };
47
+
48
+ template<typename T>
49
+ static __device__ __forceinline__ T warpSum(T val) {
50
+ #if __CUDA_ARCH__ >= 300
51
+ for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
52
+ val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
53
+ }
54
+ #else
55
+ __shared__ T values[MAX_BLOCK_SIZE];
56
+ values[threadIdx.x] = val;
57
+ __threadfence_block();
58
+ const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
59
+ for (int i = 1; i < WARP_SIZE; i++) {
60
+ val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
61
+ }
62
+ #endif
63
+ return val;
64
+ }
65
+
66
+ template<typename T>
67
+ static __device__ __forceinline__ Pair<T> warpSum(Pair<T> value) {
68
+ value.v1 = warpSum(value.v1);
69
+ value.v2 = warpSum(value.v2);
70
+ return value;
71
+ }
preprocess/humanparsing/networks/AugmentCE2P.py ADDED
@@ -0,0 +1,388 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : AugmentCE2P.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import pdb
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from torch.nn import functional as F
20
+ # Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn
21
+ # By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer
22
+ from modules import InPlaceABNSync
23
+ import numpy as np
24
+
25
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
26
+
27
+ affine_par = True
28
+
29
+ pretrained_settings = {
30
+ 'resnet101': {
31
+ 'imagenet': {
32
+ 'input_space': 'BGR',
33
+ 'input_size': [3, 224, 224],
34
+ 'input_range': [0, 1],
35
+ 'mean': [0.406, 0.456, 0.485],
36
+ 'std': [0.225, 0.224, 0.229],
37
+ 'num_classes': 1000
38
+ }
39
+ },
40
+ }
41
+
42
+
43
+ def conv3x3(in_planes, out_planes, stride=1):
44
+ "3x3 convolution with padding"
45
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
46
+ padding=1, bias=False)
47
+
48
+
49
+ class Bottleneck(nn.Module):
50
+ expansion = 4
51
+
52
+ def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
53
+ super(Bottleneck, self).__init__()
54
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
55
+ self.bn1 = BatchNorm2d(planes)
56
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
57
+ padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False)
58
+ self.bn2 = BatchNorm2d(planes)
59
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
60
+ self.bn3 = BatchNorm2d(planes * 4)
61
+ self.relu = nn.ReLU(inplace=False)
62
+ self.relu_inplace = nn.ReLU(inplace=True)
63
+ self.downsample = downsample
64
+ self.dilation = dilation
65
+ self.stride = stride
66
+
67
+ def forward(self, x):
68
+ residual = x
69
+
70
+ out = self.conv1(x)
71
+ out = self.bn1(out)
72
+ out = self.relu(out)
73
+
74
+ out = self.conv2(out)
75
+ out = self.bn2(out)
76
+ out = self.relu(out)
77
+
78
+ out = self.conv3(out)
79
+ out = self.bn3(out)
80
+
81
+ if self.downsample is not None:
82
+ residual = self.downsample(x)
83
+
84
+ out = out + residual
85
+ out = self.relu_inplace(out)
86
+
87
+ return out
88
+
89
+
90
+ class CostomAdaptiveAvgPool2D(nn.Module):
91
+
92
+ def __init__(self, output_size):
93
+
94
+ super(CostomAdaptiveAvgPool2D, self).__init__()
95
+
96
+ self.output_size = output_size
97
+
98
+ def forward(self, x):
99
+
100
+ H_in, W_in = x.shape[-2:]
101
+ H_out, W_out = self.output_size
102
+
103
+ out_i = []
104
+ for i in range(H_out):
105
+ out_j = []
106
+ for j in range(W_out):
107
+ hs = int(np.floor(i * H_in / H_out))
108
+ he = int(np.ceil((i + 1) * H_in / H_out))
109
+
110
+ ws = int(np.floor(j * W_in / W_out))
111
+ we = int(np.ceil((j + 1) * W_in / W_out))
112
+
113
+ # print(hs, he, ws, we)
114
+ kernel_size = [he - hs, we - ws]
115
+
116
+ out = F.avg_pool2d(x[:, :, hs:he, ws:we], kernel_size)
117
+ out_j.append(out)
118
+
119
+ out_j = torch.concat(out_j, -1)
120
+ out_i.append(out_j)
121
+
122
+ out_i = torch.concat(out_i, -2)
123
+ return out_i
124
+
125
+
126
+ class PSPModule(nn.Module):
127
+ """
128
+ Reference:
129
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
130
+ """
131
+
132
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
133
+ super(PSPModule, self).__init__()
134
+
135
+ self.stages = []
136
+ tmp = []
137
+ for size in sizes:
138
+ if size == 3 or size == 6:
139
+ tmp.append(self._make_stage_custom(features, out_features, size))
140
+ else:
141
+ tmp.append(self._make_stage(features, out_features, size))
142
+ self.stages = nn.ModuleList(tmp)
143
+ # self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
144
+ self.bottleneck = nn.Sequential(
145
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
146
+ bias=False),
147
+ InPlaceABNSync(out_features),
148
+ )
149
+
150
+ def _make_stage(self, features, out_features, size):
151
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
152
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
153
+ bn = InPlaceABNSync(out_features)
154
+ return nn.Sequential(prior, conv, bn)
155
+
156
+ def _make_stage_custom(self, features, out_features, size):
157
+ prior = CostomAdaptiveAvgPool2D(output_size=(size, size))
158
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
159
+ bn = InPlaceABNSync(out_features)
160
+ return nn.Sequential(prior, conv, bn)
161
+
162
+ def forward(self, feats):
163
+ h, w = feats.size(2), feats.size(3)
164
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
165
+ self.stages] + [feats]
166
+ bottle = self.bottleneck(torch.cat(priors, 1))
167
+ return bottle
168
+
169
+
170
+ class ASPPModule(nn.Module):
171
+ """
172
+ Reference:
173
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
174
+ """
175
+
176
+ def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)):
177
+ super(ASPPModule, self).__init__()
178
+
179
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
180
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
181
+ bias=False),
182
+ InPlaceABNSync(inner_features))
183
+ self.conv2 = nn.Sequential(
184
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
185
+ InPlaceABNSync(inner_features))
186
+ self.conv3 = nn.Sequential(
187
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
188
+ InPlaceABNSync(inner_features))
189
+ self.conv4 = nn.Sequential(
190
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
191
+ InPlaceABNSync(inner_features))
192
+ self.conv5 = nn.Sequential(
193
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
194
+ InPlaceABNSync(inner_features))
195
+
196
+ self.bottleneck = nn.Sequential(
197
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
198
+ InPlaceABNSync(out_features),
199
+ nn.Dropout2d(0.1)
200
+ )
201
+
202
+ def forward(self, x):
203
+ _, _, h, w = x.size()
204
+
205
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
206
+
207
+ feat2 = self.conv2(x)
208
+ feat3 = self.conv3(x)
209
+ feat4 = self.conv4(x)
210
+ feat5 = self.conv5(x)
211
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
212
+
213
+ bottle = self.bottleneck(out)
214
+ return bottle
215
+
216
+
217
+ class Edge_Module(nn.Module):
218
+ """
219
+ Edge Learning Branch
220
+ """
221
+
222
+ def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2):
223
+ super(Edge_Module, self).__init__()
224
+
225
+ self.conv1 = nn.Sequential(
226
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
227
+ InPlaceABNSync(mid_fea)
228
+ )
229
+ self.conv2 = nn.Sequential(
230
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
231
+ InPlaceABNSync(mid_fea)
232
+ )
233
+ self.conv3 = nn.Sequential(
234
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
235
+ InPlaceABNSync(mid_fea)
236
+ )
237
+ self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
238
+ self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
239
+
240
+ def forward(self, x1, x2, x3):
241
+ _, _, h, w = x1.size()
242
+
243
+ edge1_fea = self.conv1(x1)
244
+ edge1 = self.conv4(edge1_fea)
245
+ edge2_fea = self.conv2(x2)
246
+ edge2 = self.conv4(edge2_fea)
247
+ edge3_fea = self.conv3(x3)
248
+ edge3 = self.conv4(edge3_fea)
249
+
250
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True)
251
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True)
252
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True)
253
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True)
254
+
255
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
256
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
257
+ edge = self.conv5(edge)
258
+
259
+ return edge, edge_fea
260
+
261
+
262
+ class Decoder_Module(nn.Module):
263
+ """
264
+ Parsing Branch Decoder Module.
265
+ """
266
+
267
+ def __init__(self, num_classes):
268
+ super(Decoder_Module, self).__init__()
269
+ self.conv1 = nn.Sequential(
270
+ nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False),
271
+ InPlaceABNSync(256)
272
+ )
273
+ self.conv2 = nn.Sequential(
274
+ nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
275
+ InPlaceABNSync(48)
276
+ )
277
+ self.conv3 = nn.Sequential(
278
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
279
+ InPlaceABNSync(256),
280
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
281
+ InPlaceABNSync(256)
282
+ )
283
+
284
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
285
+
286
+ def forward(self, xt, xl):
287
+ _, _, h, w = xl.size()
288
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
289
+ xl = self.conv2(xl)
290
+ x = torch.cat([xt, xl], dim=1)
291
+ x = self.conv3(x)
292
+ seg = self.conv4(x)
293
+ return seg, x
294
+
295
+
296
+ class ResNet(nn.Module):
297
+ def __init__(self, block, layers, num_classes):
298
+ self.inplanes = 128
299
+ super(ResNet, self).__init__()
300
+ self.conv1 = conv3x3(3, 64, stride=2)
301
+ self.bn1 = BatchNorm2d(64)
302
+ self.relu1 = nn.ReLU(inplace=False)
303
+ self.conv2 = conv3x3(64, 64)
304
+ self.bn2 = BatchNorm2d(64)
305
+ self.relu2 = nn.ReLU(inplace=False)
306
+ self.conv3 = conv3x3(64, 128)
307
+ self.bn3 = BatchNorm2d(128)
308
+ self.relu3 = nn.ReLU(inplace=False)
309
+
310
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
311
+
312
+ self.layer1 = self._make_layer(block, 64, layers[0])
313
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
314
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
315
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1))
316
+
317
+ self.context_encoding = PSPModule(2048, 512)
318
+
319
+ self.edge = Edge_Module()
320
+ self.decoder = Decoder_Module(num_classes)
321
+
322
+ self.fushion = nn.Sequential(
323
+ nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False),
324
+ InPlaceABNSync(256),
325
+ nn.Dropout2d(0.1),
326
+ nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
327
+ )
328
+
329
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
330
+ downsample = None
331
+ if stride != 1 or self.inplanes != planes * block.expansion:
332
+ downsample = nn.Sequential(
333
+ nn.Conv2d(self.inplanes, planes * block.expansion,
334
+ kernel_size=1, stride=stride, bias=False),
335
+ BatchNorm2d(planes * block.expansion, affine=affine_par))
336
+
337
+ layers = []
338
+ generate_multi_grid = lambda index, grids: grids[index % len(grids)] if isinstance(grids, tuple) else 1
339
+ layers.append(block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample,
340
+ multi_grid=generate_multi_grid(0, multi_grid)))
341
+ self.inplanes = planes * block.expansion
342
+ for i in range(1, blocks):
343
+ layers.append(
344
+ block(self.inplanes, planes, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
345
+
346
+ return nn.Sequential(*layers)
347
+
348
+ def forward(self, x):
349
+ x = self.relu1(self.bn1(self.conv1(x)))
350
+ x = self.relu2(self.bn2(self.conv2(x)))
351
+ x = self.relu3(self.bn3(self.conv3(x)))
352
+ x = self.maxpool(x)
353
+ x2 = self.layer1(x)
354
+ x3 = self.layer2(x2)
355
+ x4 = self.layer3(x3)
356
+ x5 = self.layer4(x4)
357
+ x = self.context_encoding(x5)
358
+ parsing_result, parsing_fea = self.decoder(x, x2)
359
+ # Edge Branch
360
+ edge_result, edge_fea = self.edge(x2, x3, x4)
361
+ # Fusion Branch
362
+ x = torch.cat([parsing_fea, edge_fea], dim=1)
363
+ fusion_result = self.fushion(x)
364
+ return [[parsing_result, fusion_result], edge_result]
365
+
366
+
367
+ def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
368
+ model.input_space = settings['input_space']
369
+ model.input_size = settings['input_size']
370
+ model.input_range = settings['input_range']
371
+ model.mean = settings['mean']
372
+ model.std = settings['std']
373
+
374
+ if pretrained is not None:
375
+ saved_state_dict = torch.load(pretrained)
376
+ new_params = model.state_dict().copy()
377
+ for i in saved_state_dict:
378
+ i_parts = i.split('.')
379
+ if not i_parts[0] == 'fc':
380
+ new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
381
+ model.load_state_dict(new_params)
382
+
383
+
384
+ def resnet101(num_classes=20, pretrained='./models/resnet101-imagenet.pth'):
385
+ model = ResNet(Bottleneck, [3, 4, 23, 3], num_classes)
386
+ settings = pretrained_settings['resnet101']['imagenet']
387
+ initialize_pretrained_model(model, settings, pretrained)
388
+ return model
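
A quick way to exercise the network defined in this file, assuming the InPlaceABN extension from `modules/` has already been built and a GPU is available; the input resolution below is illustrative, and `pretrained=None` simply skips the ImageNet initialization:

```python
import torch
from networks.AugmentCE2P import resnet101  # assumes preprocess/humanparsing is on sys.path

model = resnet101(num_classes=20, pretrained=None).cuda().eval()

with torch.no_grad():
    x = torch.randn(1, 3, 473, 473, device="cuda")
    (parsing, fusion), edge = model(x)  # [[parsing_result, fusion_result], edge_result]

# parsing/fusion are per-class logits at 1/4 of the input resolution,
# edge is the 2-channel map from the edge branch.
print(parsing.shape, fusion.shape, edge.shape)
```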
preprocess/humanparsing/networks/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
1
+ from __future__ import absolute_import
2
+ from networks.AugmentCE2P import resnet101
3
+
4
+ __factory = {
5
+ 'resnet101': resnet101,
6
+ }
7
+
8
+
9
+ def init_model(name, *args, **kwargs):
10
+ if name not in __factory.keys():
11
+ raise KeyError("Unknown model arch: {}".format(name))
12
+ return __factory[name](*args, **kwargs)
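
`init_model` is a thin lookup over the registry above; extra positional and keyword arguments are forwarded to the selected factory, so the call below is equivalent to invoking `resnet101` directly (argument values are illustrative):

```python
from networks import init_model

model = init_model('resnet101', num_classes=20, pretrained=None)
```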
preprocess/humanparsing/networks/backbone/mobilenetv2.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : mobilenetv2.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import math
16
+ import functools
17
+
18
+ from modules import InPlaceABN, InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['mobilenetv2']
23
+
24
+
25
+ def conv_bn(inp, oup, stride):
26
+ return nn.Sequential(
27
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
28
+ BatchNorm2d(oup),
29
+ nn.ReLU6(inplace=True)
30
+ )
31
+
32
+
33
+ def conv_1x1_bn(inp, oup):
34
+ return nn.Sequential(
35
+ nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
36
+ BatchNorm2d(oup),
37
+ nn.ReLU6(inplace=True)
38
+ )
39
+
40
+
41
+ class InvertedResidual(nn.Module):
42
+ def __init__(self, inp, oup, stride, expand_ratio):
43
+ super(InvertedResidual, self).__init__()
44
+ self.stride = stride
45
+ assert stride in [1, 2]
46
+
47
+ hidden_dim = round(inp * expand_ratio)
48
+ self.use_res_connect = self.stride == 1 and inp == oup
49
+
50
+ if expand_ratio == 1:
51
+ self.conv = nn.Sequential(
52
+ # dw
53
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
54
+ BatchNorm2d(hidden_dim),
55
+ nn.ReLU6(inplace=True),
56
+ # pw-linear
57
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
58
+ BatchNorm2d(oup),
59
+ )
60
+ else:
61
+ self.conv = nn.Sequential(
62
+ # pw
63
+ nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
64
+ BatchNorm2d(hidden_dim),
65
+ nn.ReLU6(inplace=True),
66
+ # dw
67
+ nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
68
+ BatchNorm2d(hidden_dim),
69
+ nn.ReLU6(inplace=True),
70
+ # pw-linear
71
+ nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
72
+ BatchNorm2d(oup),
73
+ )
74
+
75
+ def forward(self, x):
76
+ if self.use_res_connect:
77
+ return x + self.conv(x)
78
+ else:
79
+ return self.conv(x)
80
+
81
+
82
+ class MobileNetV2(nn.Module):
83
+ def __init__(self, n_class=1000, input_size=224, width_mult=1.):
84
+ super(MobileNetV2, self).__init__()
85
+ block = InvertedResidual
86
+ input_channel = 32
87
+ last_channel = 1280
88
+ interverted_residual_setting = [
89
+ # t, c, n, s
90
+ [1, 16, 1, 1],
91
+ [6, 24, 2, 2], # layer 2
92
+ [6, 32, 3, 2], # layer 3
93
+ [6, 64, 4, 2],
94
+ [6, 96, 3, 1], # layer 4
95
+ [6, 160, 3, 2],
96
+ [6, 320, 1, 1], # layer 5
97
+ ]
98
+
99
+ # building first layer
100
+ assert input_size % 32 == 0
101
+ input_channel = int(input_channel * width_mult)
102
+ self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
103
+ self.features = [conv_bn(3, input_channel, 2)]
104
+ # building inverted residual blocks
105
+ for t, c, n, s in interverted_residual_setting:
106
+ output_channel = int(c * width_mult)
107
+ for i in range(n):
108
+ if i == 0:
109
+ self.features.append(block(input_channel, output_channel, s, expand_ratio=t))
110
+ else:
111
+ self.features.append(block(input_channel, output_channel, 1, expand_ratio=t))
112
+ input_channel = output_channel
113
+ # building last several layers
114
+ self.features.append(conv_1x1_bn(input_channel, self.last_channel))
115
+ # make it nn.Sequential
116
+ self.features = nn.Sequential(*self.features)
117
+
118
+ # building classifier
119
+ self.classifier = nn.Sequential(
120
+ nn.Dropout(0.2),
121
+ nn.Linear(self.last_channel, n_class),
122
+ )
123
+
124
+ self._initialize_weights()
125
+
126
+ def forward(self, x):
127
+ x = self.features(x)
128
+ x = x.mean(3).mean(2)
129
+ x = self.classifier(x)
130
+ return x
131
+
132
+ def _initialize_weights(self):
133
+ for m in self.modules():
134
+ if isinstance(m, nn.Conv2d):
135
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
136
+ m.weight.data.normal_(0, math.sqrt(2. / n))
137
+ if m.bias is not None:
138
+ m.bias.data.zero_()
139
+ elif isinstance(m, BatchNorm2d):
140
+ m.weight.data.fill_(1)
141
+ m.bias.data.zero_()
142
+ elif isinstance(m, nn.Linear):
143
+ n = m.weight.size(1)
144
+ m.weight.data.normal_(0, 0.01)
145
+ m.bias.data.zero_()
146
+
147
+
148
+ def mobilenetv2(pretrained=False, **kwargs):
149
+ """Constructs a MobileNet_V2 model.
150
+ Args:
151
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
152
+ """
153
+ model = MobileNetV2(n_class=1000, **kwargs)
154
+ if pretrained:
155
+ model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False)
156
+ return model
preprocess/humanparsing/networks/backbone/resnet.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnet.py
8
+ @Time : 8/4/19 3:35 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+ import torch.nn as nn
16
+ import math
17
+ from torch.utils.model_zoo import load_url
18
+
19
+ from modules import InPlaceABNSync
20
+
21
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
22
+
23
+ __all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101'] # resnet101 is coming soon!
24
+
25
+ model_urls = {
26
+ 'resnet18': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet18-imagenet.pth',
27
+ 'resnet50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet50-imagenet.pth',
28
+ 'resnet101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnet101-imagenet.pth'
29
+ }
30
+
31
+
32
+ def conv3x3(in_planes, out_planes, stride=1):
33
+ "3x3 convolution with padding"
34
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
35
+ padding=1, bias=False)
36
+
37
+
38
+ class BasicBlock(nn.Module):
39
+ expansion = 1
40
+
41
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
42
+ super(BasicBlock, self).__init__()
43
+ self.conv1 = conv3x3(inplanes, planes, stride)
44
+ self.bn1 = BatchNorm2d(planes)
45
+ self.relu = nn.ReLU(inplace=True)
46
+ self.conv2 = conv3x3(planes, planes)
47
+ self.bn2 = BatchNorm2d(planes)
48
+ self.downsample = downsample
49
+ self.stride = stride
50
+
51
+ def forward(self, x):
52
+ residual = x
53
+
54
+ out = self.conv1(x)
55
+ out = self.bn1(out)
56
+ out = self.relu(out)
57
+
58
+ out = self.conv2(out)
59
+ out = self.bn2(out)
60
+
61
+ if self.downsample is not None:
62
+ residual = self.downsample(x)
63
+
64
+ out += residual
65
+ out = self.relu(out)
66
+
67
+ return out
68
+
69
+
70
+ class Bottleneck(nn.Module):
71
+ expansion = 4
72
+
73
+ def __init__(self, inplanes, planes, stride=1, downsample=None):
74
+ super(Bottleneck, self).__init__()
75
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
76
+ self.bn1 = BatchNorm2d(planes)
77
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
78
+ padding=1, bias=False)
79
+ self.bn2 = BatchNorm2d(planes)
80
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
81
+ self.bn3 = BatchNorm2d(planes * 4)
82
+ self.relu = nn.ReLU(inplace=True)
83
+ self.downsample = downsample
84
+ self.stride = stride
85
+
86
+ def forward(self, x):
87
+ residual = x
88
+
89
+ out = self.conv1(x)
90
+ out = self.bn1(out)
91
+ out = self.relu(out)
92
+
93
+ out = self.conv2(out)
94
+ out = self.bn2(out)
95
+ out = self.relu(out)
96
+
97
+ out = self.conv3(out)
98
+ out = self.bn3(out)
99
+
100
+ if self.downsample is not None:
101
+ residual = self.downsample(x)
102
+
103
+ out += residual
104
+ out = self.relu(out)
105
+
106
+ return out
107
+
108
+
109
+ class ResNet(nn.Module):
110
+
111
+ def __init__(self, block, layers, num_classes=1000):
112
+ self.inplanes = 128
113
+ super(ResNet, self).__init__()
114
+ self.conv1 = conv3x3(3, 64, stride=2)
115
+ self.bn1 = BatchNorm2d(64)
116
+ self.relu1 = nn.ReLU(inplace=True)
117
+ self.conv2 = conv3x3(64, 64)
118
+ self.bn2 = BatchNorm2d(64)
119
+ self.relu2 = nn.ReLU(inplace=True)
120
+ self.conv3 = conv3x3(64, 128)
121
+ self.bn3 = BatchNorm2d(128)
122
+ self.relu3 = nn.ReLU(inplace=True)
123
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
124
+
125
+ self.layer1 = self._make_layer(block, 64, layers[0])
126
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
127
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
128
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
129
+ self.avgpool = nn.AvgPool2d(7, stride=1)
130
+ self.fc = nn.Linear(512 * block.expansion, num_classes)
131
+
132
+ for m in self.modules():
133
+ if isinstance(m, nn.Conv2d):
134
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
135
+ m.weight.data.normal_(0, math.sqrt(2. / n))
136
+ elif isinstance(m, BatchNorm2d):
137
+ m.weight.data.fill_(1)
138
+ m.bias.data.zero_()
139
+
140
+ def _make_layer(self, block, planes, blocks, stride=1):
141
+ downsample = None
142
+ if stride != 1 or self.inplanes != planes * block.expansion:
143
+ downsample = nn.Sequential(
144
+ nn.Conv2d(self.inplanes, planes * block.expansion,
145
+ kernel_size=1, stride=stride, bias=False),
146
+ BatchNorm2d(planes * block.expansion),
147
+ )
148
+
149
+ layers = []
150
+ layers.append(block(self.inplanes, planes, stride, downsample))
151
+ self.inplanes = planes * block.expansion
152
+ for i in range(1, blocks):
153
+ layers.append(block(self.inplanes, planes))
154
+
155
+ return nn.Sequential(*layers)
156
+
157
+ def forward(self, x):
158
+ x = self.relu1(self.bn1(self.conv1(x)))
159
+ x = self.relu2(self.bn2(self.conv2(x)))
160
+ x = self.relu3(self.bn3(self.conv3(x)))
161
+ x = self.maxpool(x)
162
+
163
+ x = self.layer1(x)
164
+ x = self.layer2(x)
165
+ x = self.layer3(x)
166
+ x = self.layer4(x)
167
+
168
+ x = self.avgpool(x)
169
+ x = x.view(x.size(0), -1)
170
+ x = self.fc(x)
171
+
172
+ return x
173
+
174
+
175
+ def resnet18(pretrained=False, **kwargs):
176
+ """Constructs a ResNet-18 model.
177
+ Args:
178
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
179
+ """
180
+ model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
181
+ if pretrained:
182
+ model.load_state_dict(load_url(model_urls['resnet18']))
183
+ return model
184
+
185
+
186
+ def resnet50(pretrained=False, **kwargs):
187
+ """Constructs a ResNet-50 model.
188
+ Args:
189
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
190
+ """
191
+ model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
192
+ if pretrained:
193
+ model.load_state_dict(load_url(model_urls['resnet50']), strict=False)
194
+ return model
195
+
196
+
197
+ def resnet101(pretrained=False, **kwargs):
198
+ """Constructs a ResNet-101 model.
199
+ Args:
200
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
201
+ """
202
+ model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
203
+ if pretrained:
204
+ model.load_state_dict(load_url(model_urls['resnet101']), strict=False)
205
+ return model
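Unlike torchvision's ResNet, the variant above replaces the single 7x7 stem with three stacked 3x3 convolutions. A minimal sketch of that stem, using plain `nn.BatchNorm2d` in place of `InPlaceABNSync` purely so it runs without the compiled extension:

```python
import torch
import torch.nn as nn

# Deep stem as in the ResNet above: conv3x3(s2) -> conv3x3 -> conv3x3 -> maxpool(s2).
# nn.BatchNorm2d stands in for InPlaceABNSync here (illustration only).
stem = nn.Sequential(
    nn.Conv2d(3, 64, 3, stride=2, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
    nn.Conv2d(64, 64, 3, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
    nn.Conv2d(64, 128, 3, padding=1, bias=False), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

x = torch.randn(1, 3, 512, 512)
print(stem(x).shape)  # torch.Size([1, 128, 128, 128]) -- 4x downsampled before layer1
```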
preprocess/humanparsing/networks/backbone/resnext.py ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : resnext.py.py
8
+ @Time : 8/11/19 8:58 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import functools
14
+ import torch.nn as nn
15
+ import math
16
+ from torch.utils.model_zoo import load_url
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
21
+
22
+ __all__ = ['ResNeXt', 'resnext101'] # support resnext 101
23
+
24
+ model_urls = {
25
+ 'resnext50': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext50-imagenet.pth',
26
+ 'resnext101': 'http://sceneparsing.csail.mit.edu/model/pretrained_resnet/resnext101-imagenet.pth'
27
+ }
28
+
29
+
30
+ def conv3x3(in_planes, out_planes, stride=1):
31
+ "3x3 convolution with padding"
32
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
33
+ padding=1, bias=False)
34
+
35
+
36
+ class GroupBottleneck(nn.Module):
37
+ expansion = 2
38
+
39
+ def __init__(self, inplanes, planes, stride=1, groups=1, downsample=None):
40
+ super(GroupBottleneck, self).__init__()
41
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
42
+ self.bn1 = BatchNorm2d(planes)
43
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
44
+ padding=1, groups=groups, bias=False)
45
+ self.bn2 = BatchNorm2d(planes)
46
+ self.conv3 = nn.Conv2d(planes, planes * 2, kernel_size=1, bias=False)
47
+ self.bn3 = BatchNorm2d(planes * 2)
48
+ self.relu = nn.ReLU(inplace=True)
49
+ self.downsample = downsample
50
+ self.stride = stride
51
+
52
+ def forward(self, x):
53
+ residual = x
54
+
55
+ out = self.conv1(x)
56
+ out = self.bn1(out)
57
+ out = self.relu(out)
58
+
59
+ out = self.conv2(out)
60
+ out = self.bn2(out)
61
+ out = self.relu(out)
62
+
63
+ out = self.conv3(out)
64
+ out = self.bn3(out)
65
+
66
+ if self.downsample is not None:
67
+ residual = self.downsample(x)
68
+
69
+ out += residual
70
+ out = self.relu(out)
71
+
72
+ return out
73
+
74
+
75
+ class ResNeXt(nn.Module):
76
+
77
+ def __init__(self, block, layers, groups=32, num_classes=1000):
78
+ self.inplanes = 128
79
+ super(ResNeXt, self).__init__()
80
+ self.conv1 = conv3x3(3, 64, stride=2)
81
+ self.bn1 = BatchNorm2d(64)
82
+ self.relu1 = nn.ReLU(inplace=True)
83
+ self.conv2 = conv3x3(64, 64)
84
+ self.bn2 = BatchNorm2d(64)
85
+ self.relu2 = nn.ReLU(inplace=True)
86
+ self.conv3 = conv3x3(64, 128)
87
+ self.bn3 = BatchNorm2d(128)
88
+ self.relu3 = nn.ReLU(inplace=True)
89
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
90
+
91
+ self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
92
+ self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
93
+ self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
94
+ self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
95
+ self.avgpool = nn.AvgPool2d(7, stride=1)
96
+ self.fc = nn.Linear(1024 * block.expansion, num_classes)
97
+
98
+ for m in self.modules():
99
+ if isinstance(m, nn.Conv2d):
100
+ n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
101
+ m.weight.data.normal_(0, math.sqrt(2. / n))
102
+ elif isinstance(m, BatchNorm2d):
103
+ m.weight.data.fill_(1)
104
+ m.bias.data.zero_()
105
+
106
+ def _make_layer(self, block, planes, blocks, stride=1, groups=1):
107
+ downsample = None
108
+ if stride != 1 or self.inplanes != planes * block.expansion:
109
+ downsample = nn.Sequential(
110
+ nn.Conv2d(self.inplanes, planes * block.expansion,
111
+ kernel_size=1, stride=stride, bias=False),
112
+ BatchNorm2d(planes * block.expansion),
113
+ )
114
+
115
+ layers = []
116
+ layers.append(block(self.inplanes, planes, stride, groups, downsample))
117
+ self.inplanes = planes * block.expansion
118
+ for i in range(1, blocks):
119
+ layers.append(block(self.inplanes, planes, groups=groups))
120
+
121
+ return nn.Sequential(*layers)
122
+
123
+ def forward(self, x):
124
+ x = self.relu1(self.bn1(self.conv1(x)))
125
+ x = self.relu2(self.bn2(self.conv2(x)))
126
+ x = self.relu3(self.bn3(self.conv3(x)))
127
+ x = self.maxpool(x)
128
+
129
+ x = self.layer1(x)
130
+ x = self.layer2(x)
131
+ x = self.layer3(x)
132
+ x = self.layer4(x)
133
+
134
+ x = self.avgpool(x)
135
+ x = x.view(x.size(0), -1)
136
+ x = self.fc(x)
137
+
138
+ return x
139
+
140
+
141
+ def resnext101(pretrained=False, **kwargs):
142
+ """Constructs a ResNet-101 model.
143
+ Args:
144
+ pretrained (bool): If True, returns a model pre-trained on Places
145
+ """
146
+ model = ResNeXt(GroupBottleneck, [3, 4, 23, 3], **kwargs)
147
+ if pretrained:
148
+ model.load_state_dict(load_url(model_urls['resnext101']), strict=False)
149
+ return model
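The only structural difference from the ResNet above is the `groups=32` argument in `GroupBottleneck`'s 3x3 convolution. A small, self-contained check of what that buys in parameters (the 256-channel size is illustrative, not taken from the network):

```python
import torch.nn as nn

# A grouped 3x3 convolution splits the 256 channels into 32 independent groups,
# so its weight tensor is 32x smaller than the dense equivalent.
dense = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
grouped = nn.Conv2d(256, 256, kernel_size=3, padding=1, groups=32, bias=False)

print(sum(p.numel() for p in dense.parameters()))    # 589824
print(sum(p.numel() for p in grouped.parameters()))  # 18432
```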
preprocess/humanparsing/networks/context_encoding/aspp.py ADDED
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : aspp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class ASPPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
25
+ """
26
+ def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
27
+ super(ASPPModule, self).__init__()
28
+
29
+ self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)),
30
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1,
31
+ bias=False),
32
+ InPlaceABNSync(inner_features))
33
+ self.conv2 = nn.Sequential(
34
+ nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
35
+ InPlaceABNSync(inner_features))
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
38
+ InPlaceABNSync(inner_features))
39
+ self.conv4 = nn.Sequential(
40
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
41
+ InPlaceABNSync(inner_features))
42
+ self.conv5 = nn.Sequential(
43
+ nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
44
+ InPlaceABNSync(inner_features))
45
+
46
+ self.bottleneck = nn.Sequential(
47
+ nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
48
+ InPlaceABNSync(out_features),
49
+ nn.Dropout2d(0.1)
50
+ )
51
+
52
+ def forward(self, x):
53
+ _, _, h, w = x.size()
54
+
55
+ feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True)
56
+
57
+ feat2 = self.conv2(x)
58
+ feat3 = self.conv3(x)
59
+ feat4 = self.conv4(x)
60
+ feat5 = self.conv5(x)
61
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
62
+
63
+ bottle = self.bottleneck(out)
64
+ return bottle
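Each dilated branch above uses `padding=dilation`, which is what keeps all five branch outputs the same spatial size so they can be concatenated. A tiny check with plain `nn.Conv2d` (sizes are arbitrary):

```python
import torch
import torch.nn as nn

# For a 3x3 kernel, the dilated receptive field spans 2*d + 1 pixels,
# so padding by d exactly preserves the input resolution.
x = torch.randn(1, 64, 32, 32)
for d in (12, 24, 36):
    conv = nn.Conv2d(64, 64, kernel_size=3, padding=d, dilation=d, bias=False)
    print(d, tuple(conv(x).shape))  # all (1, 64, 32, 32)
```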
preprocess/humanparsing/networks/context_encoding/ocnet.py ADDED
@@ -0,0 +1,226 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : ocnet.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import functools
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.autograd import Variable
19
+ from torch.nn import functional as F
20
+
21
+ from modules import InPlaceABNSync
22
+ BatchNorm2d = functools.partial(InPlaceABNSync, activation='none')
23
+
24
+
25
+ class _SelfAttentionBlock(nn.Module):
26
+ '''
27
+ The basic implementation for self-attention block/non-local block
28
+ Input:
29
+ N X C X H X W
30
+ Parameters:
31
+ in_channels : the dimension of the input feature map
32
+ key_channels : the dimension after the key/query transform
33
+ value_channels : the dimension after the value transform
34
+ scale : choose the scale to downsample the input feature maps (save memory cost)
35
+ Return:
36
+ N X C X H X W
37
+ position-aware context features (without concatenation or addition with the input)
38
+ '''
39
+
40
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
41
+ super(_SelfAttentionBlock, self).__init__()
42
+ self.scale = scale
43
+ self.in_channels = in_channels
44
+ self.out_channels = out_channels
45
+ self.key_channels = key_channels
46
+ self.value_channels = value_channels
47
+ if out_channels == None:
48
+ self.out_channels = in_channels
49
+ self.pool = nn.MaxPool2d(kernel_size=(scale, scale))
50
+ self.f_key = nn.Sequential(
51
+ nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels,
52
+ kernel_size=1, stride=1, padding=0),
53
+ InPlaceABNSync(self.key_channels),
54
+ )
55
+ self.f_query = self.f_key
56
+ self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels,
57
+ kernel_size=1, stride=1, padding=0)
58
+ self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels,
59
+ kernel_size=1, stride=1, padding=0)
60
+ nn.init.constant_(self.W.weight, 0)
61
+ nn.init.constant_(self.W.bias, 0)
62
+
63
+ def forward(self, x):
64
+ batch_size, h, w = x.size(0), x.size(2), x.size(3)
65
+ if self.scale > 1:
66
+ x = self.pool(x)
67
+
68
+ value = self.f_value(x).view(batch_size, self.value_channels, -1)
69
+ value = value.permute(0, 2, 1)
70
+ query = self.f_query(x).view(batch_size, self.key_channels, -1)
71
+ query = query.permute(0, 2, 1)
72
+ key = self.f_key(x).view(batch_size, self.key_channels, -1)
73
+
74
+ sim_map = torch.matmul(query, key)
75
+ sim_map = (self.key_channels ** -.5) * sim_map
76
+ sim_map = F.softmax(sim_map, dim=-1)
77
+
78
+ context = torch.matmul(sim_map, value)
79
+ context = context.permute(0, 2, 1).contiguous()
80
+ context = context.view(batch_size, self.value_channels, *x.size()[2:])
81
+ context = self.W(context)
82
+ if self.scale > 1:
83
+ context = F.upsample(input=context, size=(h, w), mode='bilinear', align_corners=True)
84
+ return context
85
+
86
+
87
+ class SelfAttentionBlock2D(_SelfAttentionBlock):
88
+ def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1):
89
+ super(SelfAttentionBlock2D, self).__init__(in_channels,
90
+ key_channels,
91
+ value_channels,
92
+ out_channels,
93
+ scale)
94
+
95
+
96
+ class BaseOC_Module(nn.Module):
97
+ """
98
+ Implementation of the BaseOC module
99
+ Parameters:
100
+ in_features / out_features: the channels of the input / output feature maps.
101
+ dropout: we choose 0.05 as the default value.
102
+ size: you can apply multiple sizes. Here we only use one size.
103
+ Return:
104
+ features fused with Object context information.
105
+ """
106
+
107
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
108
+ super(BaseOC_Module, self).__init__()
109
+ self.stages = []
110
+ self.stages = nn.ModuleList(
111
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
112
+ self.conv_bn_dropout = nn.Sequential(
113
+ nn.Conv2d(2 * in_channels, out_channels, kernel_size=1, padding=0),
114
+ InPlaceABNSync(out_channels),
115
+ nn.Dropout2d(dropout)
116
+ )
117
+
118
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
119
+ return SelfAttentionBlock2D(in_channels,
120
+ key_channels,
121
+ value_channels,
122
+ output_channels,
123
+ size)
124
+
125
+ def forward(self, feats):
126
+ priors = [stage(feats) for stage in self.stages]
127
+ context = priors[0]
128
+ for i in range(1, len(priors)):
129
+ context += priors[i]
130
+ output = self.conv_bn_dropout(torch.cat([context, feats], 1))
131
+ return output
132
+
133
+
134
+ class BaseOC_Context_Module(nn.Module):
135
+ """
136
+ Output only the context features.
137
+ Parameters:
138
+ in_features / out_features: the channels of the input / output feature maps.
139
+ dropout: specify the dropout ratio
140
+ fusion: We provide two different fusion method, "concat" or "add"
141
+ size: we find that directly learning the attention weights on even 1/8 feature maps is hard.
142
+ Return:
143
+ features after "concat" or "add"
144
+ """
145
+
146
+ def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=([1])):
147
+ super(BaseOC_Context_Module, self).__init__()
148
+ self.stages = []
149
+ self.stages = nn.ModuleList(
150
+ [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
151
+ self.conv_bn_dropout = nn.Sequential(
152
+ nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
153
+ InPlaceABNSync(out_channels),
154
+ )
155
+
156
+ def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
157
+ return SelfAttentionBlock2D(in_channels,
158
+ key_channels,
159
+ value_channels,
160
+ output_channels,
161
+ size)
162
+
163
+ def forward(self, feats):
164
+ priors = [stage(feats) for stage in self.stages]
165
+ context = priors[0]
166
+ for i in range(1, len(priors)):
167
+ context += priors[i]
168
+ output = self.conv_bn_dropout(context)
169
+ return output
170
+
171
+
172
+ class ASP_OC_Module(nn.Module):
173
+ def __init__(self, features, out_features=256, dilations=(12, 24, 36)):
174
+ super(ASP_OC_Module, self).__init__()
175
+ self.context = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=3, padding=1, dilation=1, bias=True),
176
+ InPlaceABNSync(out_features),
177
+ BaseOC_Context_Module(in_channels=out_features, out_channels=out_features,
178
+ key_channels=out_features // 2, value_channels=out_features,
179
+ dropout=0, sizes=([2])))
180
+ self.conv2 = nn.Sequential(nn.Conv2d(features, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
181
+ InPlaceABNSync(out_features))
182
+ self.conv3 = nn.Sequential(
183
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False),
184
+ InPlaceABNSync(out_features))
185
+ self.conv4 = nn.Sequential(
186
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False),
187
+ InPlaceABNSync(out_features))
188
+ self.conv5 = nn.Sequential(
189
+ nn.Conv2d(features, out_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False),
190
+ InPlaceABNSync(out_features))
191
+
192
+ self.conv_bn_dropout = nn.Sequential(
193
+ nn.Conv2d(out_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
194
+ InPlaceABNSync(out_features),
195
+ nn.Dropout2d(0.1)
196
+ )
197
+
198
+ def _cat_each(self, feat1, feat2, feat3, feat4, feat5):
199
+ assert (len(feat1) == len(feat2))
200
+ z = []
201
+ for i in range(len(feat1)):
202
+ z.append(torch.cat((feat1[i], feat2[i], feat3[i], feat4[i], feat5[i]), 1))
203
+ return z
204
+
205
+ def forward(self, x):
206
+ if isinstance(x, Variable):
207
+ _, _, h, w = x.size()
208
+ elif isinstance(x, tuple) or isinstance(x, list):
209
+ _, _, h, w = x[0].size()
210
+ else:
211
+ raise RuntimeError('unknown input type')
212
+
213
+ feat1 = self.context(x)
214
+ feat2 = self.conv2(x)
215
+ feat3 = self.conv3(x)
216
+ feat4 = self.conv4(x)
217
+ feat5 = self.conv5(x)
218
+
219
+ if isinstance(x, Variable):
220
+ out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1)
221
+ elif isinstance(x, tuple) or isinstance(x, list):
222
+ out = self._cat_each(feat1, feat2, feat3, feat4, feat5)
223
+ else:
224
+ raise RuntimeError('unknown input type')
225
+ output = self.conv_bn_dropout(out)
226
+ return output
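For readers new to the OC block: the cost of `_SelfAttentionBlock` is dominated by an (H·W)×(H·W) similarity matrix, which is why the block can max-pool its input first (`scale > 1`). A shape-only sketch with made-up sizes:

```python
import torch

# Shapes only: query/key/value projections flattened over the spatial grid.
B, key_ch, value_ch, H, W = 1, 128, 256, 24, 16
query = torch.randn(B, H * W, key_ch)
key = torch.randn(B, key_ch, H * W)
value = torch.randn(B, H * W, value_ch)

sim = torch.softmax((key_ch ** -0.5) * torch.matmul(query, key), dim=-1)  # (B, HW, HW)
context = torch.matmul(sim, value)                                        # (B, HW, value_ch)
print(sim.shape, context.shape)  # torch.Size([1, 384, 384]) torch.Size([1, 384, 256])
```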
preprocess/humanparsing/networks/context_encoding/psp.py ADDED
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : psp.py
8
+ @Time : 8/4/19 3:36 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from torch.nn import functional as F
17
+
18
+ from modules import InPlaceABNSync
19
+
20
+
21
+ class PSPModule(nn.Module):
22
+ """
23
+ Reference:
24
+ Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
25
+ """
26
+ def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
27
+ super(PSPModule, self).__init__()
28
+
29
+ self.stages = []
30
+ self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes])
31
+ self.bottleneck = nn.Sequential(
32
+ nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1,
33
+ bias=False),
34
+ InPlaceABNSync(out_features),
35
+ )
36
+
37
+ def _make_stage(self, features, out_features, size):
38
+ prior = nn.AdaptiveAvgPool2d(output_size=(size, size))
39
+ conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False)
40
+ bn = InPlaceABNSync(out_features)
41
+ return nn.Sequential(prior, conv, bn)
42
+
43
+ def forward(self, feats):
44
+ h, w = feats.size(2), feats.size(3)
45
+ priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in
46
+ self.stages] + [feats]
47
+ bottle = self.bottleneck(torch.cat(priors, 1))
48
+ return bottle
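One PSP stage, reduced to functional ops so the pooling-then-upsampling idea is visible in isolation (channel and grid sizes are illustrative; the module above also projects each pooled map through a 1x1 conv before upsampling):

```python
import torch
import torch.nn.functional as F

feats = torch.randn(1, 512, 32, 24)
priors = [
    F.interpolate(F.adaptive_avg_pool2d(feats, size), size=feats.shape[2:],
                  mode='bilinear', align_corners=True)
    for size in (1, 2, 3, 6)
]
print([tuple(p.shape) for p in priors])  # each (1, 512, 32, 24), ready to concat with feats
```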
preprocess/humanparsing/parsing_api.py ADDED
@@ -0,0 +1,191 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+ PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
5
+ sys.path.insert(0, str(PROJECT_ROOT))
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+ import torchvision.transforms as transforms
11
+ from datasets.simple_extractor_dataset import SimpleFolderDataset
12
+ from PIL import Image
13
+ from utils.transforms import transform_logits
14
+
15
+
16
+ def get_palette(num_cls):
17
+ """ Returns the color map for visualizing the segmentation mask.
18
+ Args:
19
+ num_cls: Number of classes
20
+ Returns:
21
+ The color map
22
+ """
23
+ n = num_cls
24
+ palette = [0] * (n * 3)
25
+ for j in range(0, n):
26
+ lab = j
27
+ palette[j * 3 + 0] = 0
28
+ palette[j * 3 + 1] = 0
29
+ palette[j * 3 + 2] = 0
30
+ i = 0
31
+ while lab:
32
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
33
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
34
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
35
+ i += 1
36
+ lab >>= 3
37
+ return palette
38
+
39
+
40
+ def delete_irregular(logits_result):
41
+ parsing_result = np.argmax(logits_result, axis=2)
42
+ upper_cloth = np.where(parsing_result == 4, 255, 0)
43
+ contours, hierarchy = cv2.findContours(upper_cloth.astype(np.uint8),
44
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
45
+ area = []
46
+ for i in range(len(contours)):
47
+ a = cv2.contourArea(contours[i], True)
48
+ area.append(abs(a))
49
+ if len(area) != 0:
50
+ top = area.index(max(area))
51
+ M = cv2.moments(contours[top])
52
+ cY = int(M["m01"] / M["m00"])
53
+
54
+ dresses = np.where(parsing_result == 7, 255, 0)
55
+ contours_dress, hierarchy_dress = cv2.findContours(dresses.astype(np.uint8),
56
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
57
+ area_dress = []
58
+ for j in range(len(contours_dress)):
59
+ a_d = cv2.contourArea(contours_dress[j], True)
60
+ area_dress.append(abs(a_d))
61
+ if len(area_dress) != 0:
62
+ top_dress = area_dress.index(max(area_dress))
63
+ M_dress = cv2.moments(contours_dress[top_dress])
64
+ cY_dress = int(M_dress["m01"] / M_dress["m00"])
65
+ wear_type = "dresses"
66
+ if len(area) != 0:
67
+ if len(area_dress) != 0 and cY_dress > cY:
68
+ irregular_list = np.array([4, 5, 6])
69
+ logits_result[:, :, irregular_list] = -1
70
+ else:
71
+ irregular_list = np.array([5, 6, 7, 8, 9, 10, 12, 13])
72
+ logits_result[:cY, :, irregular_list] = -1
73
+ wear_type = "cloth_pant"
74
+ parsing_result = np.argmax(logits_result, axis=2)
75
+ # pad border
76
+ parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
77
+ return parsing_result, wear_type
78
+
79
+
80
+ def hole_fill(img):
81
+ img_copy = img.copy()
82
+ mask = np.zeros((img.shape[0] + 2, img.shape[1] + 2), dtype=np.uint8)
83
+ cv2.floodFill(img, mask, (0, 0), 255)
84
+ img_inverse = cv2.bitwise_not(img)
85
+ dst = cv2.bitwise_or(img_copy, img_inverse)
86
+ return dst
87
+
88
+
89
+ def refine_mask(mask):
90
+ contours, hierarchy = cv2.findContours(mask.astype(np.uint8),
91
+ cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
92
+ area = []
93
+ for j in range(len(contours)):
94
+ a_d = cv2.contourArea(contours[j], True)
95
+ area.append(abs(a_d))
96
+ refine_mask = np.zeros_like(mask).astype(np.uint8)
97
+ if len(area) != 0:
98
+ i = area.index(max(area))
99
+ cv2.drawContours(refine_mask, contours, i, color=255, thickness=-1)
100
+ # keep large area in skin case
101
+ for j in range(len(area)):
102
+ if j != i and area[i] > 2000:
103
+ cv2.drawContours(refine_mask, contours, j, color=255, thickness=-1)
104
+ return refine_mask
105
+
106
+
107
+ def refine_hole(parsing_result_filled, parsing_result, arm_mask):
108
+ filled_hole = cv2.bitwise_and(np.where(parsing_result_filled == 4, 255, 0),
109
+ np.where(parsing_result != 4, 255, 0)) - arm_mask * 255
110
+ contours, hierarchy = cv2.findContours(filled_hole, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_TC89_L1)
111
+ refine_hole_mask = np.zeros_like(parsing_result).astype(np.uint8)
112
+ for i in range(len(contours)):
113
+ a = cv2.contourArea(contours[i], True)
114
+ # keep hole > 2000 pixels
115
+ if abs(a) > 2000:
116
+ cv2.drawContours(refine_hole_mask, contours, i, color=255, thickness=-1)
117
+ return refine_hole_mask + arm_mask
118
+
119
+
120
+ def onnx_inference(session, lip_session, input_dir):
121
+ transform = transforms.Compose([
122
+ transforms.ToTensor(),
123
+ transforms.Normalize(mean=[0.406, 0.456, 0.485], std=[0.225, 0.224, 0.229])
124
+ ])
125
+ dataset = SimpleFolderDataset(root=input_dir, input_size=[512, 512], transform=transform)
126
+ # dataloader = DataLoader(dataset)
127
+ with torch.no_grad():
128
+ # for _, batch in enumerate(tqdm(dataloader, disable=True)):
129
+ image, meta = dataset[0]
130
+ image = image.unsqueeze(0)
131
+
132
+ # image, meta = batch
133
+ c = meta['center']
134
+ h = meta['height']
135
+ w = meta['width']
136
+ s = meta['scale']
137
+ output = session.run(None, {"input.1": image.numpy().astype(np.float32)})
138
+ upsample = torch.nn.Upsample(size=[512, 512], mode='bilinear', align_corners=True)
139
+ upsample_output = upsample(torch.from_numpy(output[1][0]).unsqueeze(0))
140
+ upsample_output = upsample_output.squeeze()
141
+ upsample_output = upsample_output.permute(1, 2, 0) # CHW -> HWC
142
+ logits_result = transform_logits(upsample_output.data.cpu().numpy(), c, s, w, h, input_size=[512, 512])
143
+ parsing_result = np.argmax(logits_result, axis=2)
144
+ parsing_result = np.pad(parsing_result, pad_width=1, mode='constant', constant_values=0)
145
+ # try holefilling the clothes part
146
+ arm_mask = (parsing_result == 14).astype(np.float32) \
147
+ + (parsing_result == 15).astype(np.float32)
148
+ upper_cloth_mask = (parsing_result == 4).astype(np.float32) + arm_mask
149
+ img = np.where(upper_cloth_mask, 255, 0)
150
+ dst = hole_fill(img.astype(np.uint8))
151
+ parsing_result_filled = dst / 255 * 4
152
+ parsing_result_woarm = np.where(parsing_result_filled == 4, parsing_result_filled, parsing_result)
153
+ # add back arm and refined hole between arm and cloth
154
+ refine_hole_mask = refine_hole(parsing_result_filled.astype(np.uint8), parsing_result.astype(np.uint8),
155
+ arm_mask.astype(np.uint8))
156
+ parsing_result = np.where(refine_hole_mask, parsing_result, parsing_result_woarm)
157
+ # remove padding
158
+ parsing_result = parsing_result[1:-1, 1:-1]
159
+
160
+ dataset_lip = SimpleFolderDataset(root=input_dir, input_size=[473, 473], transform=transform)
161
+ # dataloader_lip = DataLoader(dataset_lip)
162
+ with torch.no_grad():
163
+ # for _, batch in enumerate(tqdm(dataloader_lip, disable=True)):
164
+
165
+ image, meta = dataset_lip[0]
166
+ image = image.unsqueeze(0)
167
+
168
+ # image, meta = batch
169
+ c = meta['center']
170
+ s = meta['scale']
171
+ w = meta['width']
172
+ h = meta['height']
173
+
174
+ output_lip = lip_session.run(None, {"input.1": image.numpy().astype(np.float32)})
175
+ upsample = torch.nn.Upsample(size=[473, 473], mode='bilinear', align_corners=True)
176
+ upsample_output_lip = upsample(torch.from_numpy(output_lip[1][0]).unsqueeze(0))
177
+ upsample_output_lip = upsample_output_lip.squeeze()
178
+ upsample_output_lip = upsample_output_lip.permute(1, 2, 0) # CHW -> HWC
179
+ logits_result_lip = transform_logits(upsample_output_lip.data.cpu().numpy(), c, s, w, h,
180
+ input_size=[473, 473])
181
+ parsing_result_lip = np.argmax(logits_result_lip, axis=2)
182
+ # add neck parsing result
183
+ neck_mask = np.logical_and(np.logical_not((parsing_result_lip == 13).astype(np.float32)),
184
+ (parsing_result == 11).astype(np.float32))
185
+ parsing_result = np.where(neck_mask, 18, parsing_result)
186
+ palette = get_palette(19)
187
+ output_img = Image.fromarray(np.asarray(parsing_result, dtype=np.uint8))
188
+ output_img.putpalette(palette)
189
+ face_mask = torch.from_numpy((parsing_result == 11).astype(np.float32))
190
+
191
+ return output_img, face_mask
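A hedged sketch of consuming `onnx_inference`'s outputs: the first return value is a palettised PIL image whose pixel values are class indices (the code above treats 4 as upper clothes, 11 as face, and writes 18 for the neck region), so per-class masks are a single comparison. The helper name below is hypothetical.

```python
import numpy as np

def class_mask(parsing_img, class_id):
    """Return an HxW float32 mask for one parsing label from the palettised output."""
    labels = np.array(parsing_img)  # mode-P PIL image -> HxW index array
    return (labels == class_id).astype(np.float32)

# e.g. after: output_img, face_mask = onnx_inference(session, lip_session, input_dir)
# upper_cloth = class_mask(output_img, 4)
# neck        = class_mask(output_img, 18)
```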
preprocess/humanparsing/run_parsing.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ import pdb
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ import onnxruntime as ort
7
+
8
+ PROJECT_ROOT = Path(__file__).absolute().parents[0].absolute()
9
+ sys.path.insert(0, str(PROJECT_ROOT))
10
+ import torch
11
+ from parsing_api import onnx_inference
12
+
13
+
14
+ class Parsing:
15
+ def __init__(self, gpu_id: int):
16
+ self.gpu_id = gpu_id
17
+ # torch.cuda.set_device(gpu_id)
18
+ session_options = ort.SessionOptions()
19
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
20
+ session_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
21
+ #### jho modified >>>>
22
+ providers = [
23
+ ('CUDAExecutionProvider', {
24
+ 'device_id': gpu_id,
25
+ }),
26
+ 'CPUExecutionProvider',
27
+ ]
28
+ self.session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_atr.onnx'),
29
+ sess_options=session_options, providers=providers)
30
+ self.lip_session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_lip.onnx'),
31
+ sess_options=session_options, providers=providers)
32
+ #### jho modified <<<<
33
+ # session_options.add_session_config_entry('gpu_id', str(gpu_id))
34
+ # self.session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_atr.onnx'),
35
+ # sess_options=session_options, providers=['CUDAExecutionProvider'])
36
+ # self.lip_session = ort.InferenceSession(os.path.join(Path(__file__).absolute().parents[2].absolute(), 'checkpoints/humanparsing/parsing_lip.onnx'),
37
+ # sess_options=session_options, providers=['CUDAExecutionProvider'])
38
+ print(f"parsing init done (gpu: {gpu_id})")
39
+
40
+ def __call__(self, input_image):
41
+ torch.cuda.set_device(self.gpu_id)
42
+ parsed_image, face_mask = onnx_inference(self.session, self.lip_session, input_image)
43
+ return parsed_image, face_mask
44
+
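A usage sketch for the wrapper above, with caveats: it assumes the two ONNX checkpoints exist under `checkpoints/humanparsing/`, that a CUDA device is available (`__call__` calls `torch.cuda.set_device`), and that the argument, despite its name, is the directory read by `SimpleFolderDataset` inside `onnx_inference`. The example path is hypothetical.

```python
# Hypothetical usage; requires the ONNX checkpoints and a CUDA-capable machine.
from run_parsing import Parsing

parsing = Parsing(gpu_id=0)
parsed_image, face_mask = parsing("./examples/person")  # folder containing one image
parsed_image.save("parsed.png")                         # palettised label map
```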
preprocess/humanparsing/utils/__init__.py ADDED
File without changes
preprocess/humanparsing/utils/consistency_loss.py ADDED
@@ -0,0 +1,33 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : kl_loss.py
8
+ @Time : 7/23/19 4:02 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from torch import nn
16
+ from datasets.target_generation import generate_edge_tensor
17
+
18
+
19
+ class ConsistencyLoss(nn.Module):
20
+ def __init__(self, ignore_index=255):
21
+ super(ConsistencyLoss, self).__init__()
22
+ self.ignore_index=ignore_index
23
+
24
+ def forward(self, parsing, edge, label):
25
+ parsing_pre = torch.argmax(parsing, dim=1)
26
+ parsing_pre[label==self.ignore_index]=self.ignore_index
27
+ generated_edge = generate_edge_tensor(parsing_pre)
28
+ edge_pre = torch.argmax(edge, dim=1)
29
+ v_generate_edge = generated_edge[label!=255]
30
+ v_edge_pre = edge_pre[label!=255]
31
+ v_edge_pre = v_edge_pre.type(torch.cuda.FloatTensor)
32
+ positive_union = (v_generate_edge==1)&(v_edge_pre==1) # only the positive values count
33
+ return F.smooth_l1_loss(v_generate_edge[positive_union].squeeze(0), v_edge_pre[positive_union].squeeze(0))
preprocess/humanparsing/utils/criterion.py ADDED
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : criterion.py
8
+ @Time : 8/30/19 8:59 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import torch.nn as nn
15
+ import torch
16
+ import numpy as np
17
+ from torch.nn import functional as F
18
+ from .lovasz_softmax import LovaszSoftmax
19
+ from .kl_loss import KLDivergenceLoss
20
+ from .consistency_loss import ConsistencyLoss
21
+
22
+ NUM_CLASSES = 20
23
+
24
+
25
+ class CriterionAll(nn.Module):
26
+ def __init__(self, use_class_weight=False, ignore_index=255, lambda_1=1, lambda_2=1, lambda_3=1,
27
+ num_classes=20):
28
+ super(CriterionAll, self).__init__()
29
+ self.ignore_index = ignore_index
30
+ self.use_class_weight = use_class_weight
31
+ self.criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)
32
+ self.lovasz = LovaszSoftmax(ignore_index=ignore_index)
33
+ self.kldiv = KLDivergenceLoss(ignore_index=ignore_index)
34
+ self.reg = ConsistencyLoss(ignore_index=ignore_index)
35
+ self.lamda_1 = lambda_1
36
+ self.lamda_2 = lambda_2
37
+ self.lamda_3 = lambda_3
38
+ self.num_classes = num_classes
39
+
40
+ def parsing_loss(self, preds, target, cycle_n=None):
41
+ """
42
+ Loss function definition.
43
+
44
+ Args:
45
+ preds: [[parsing result1, parsing result2],[edge result]]
46
+ target: [parsing label, edge label]
47
+ soft_preds: [[parsing result1, parsing result2],[edge result]]
48
+ Returns:
49
+ Calculated Loss.
50
+ """
51
+ h, w = target[0].size(1), target[0].size(2)
52
+
53
+ pos_num = torch.sum(target[1] == 1, dtype=torch.float)
54
+ neg_num = torch.sum(target[1] == 0, dtype=torch.float)
55
+
56
+ weight_pos = neg_num / (pos_num + neg_num)
57
+ weight_neg = pos_num / (pos_num + neg_num)
58
+ weights = torch.tensor([weight_neg, weight_pos]) # edge loss weight
59
+
60
+ loss = 0
61
+
62
+ # loss for segmentation
63
+ preds_parsing = preds[0]
64
+ for pred_parsing in preds_parsing:
65
+ scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
66
+ mode='bilinear', align_corners=True)
67
+
68
+ loss += 0.5 * self.lamda_1 * self.lovasz(scale_pred, target[0])
69
+ if target[2] is None:
70
+ loss += 0.5 * self.lamda_1 * self.criterion(scale_pred, target[0])
71
+ else:
72
+ soft_scale_pred = F.interpolate(input=target[2], size=(h, w),
73
+ mode='bilinear', align_corners=True)
74
+ soft_scale_pred = moving_average(soft_scale_pred, to_one_hot(target[0], num_cls=self.num_classes),
75
+ 1.0 / (cycle_n + 1.0))
76
+ loss += 0.5 * self.lamda_1 * self.kldiv(scale_pred, soft_scale_pred, target[0])
77
+
78
+ # loss for edge
79
+ preds_edge = preds[1]
80
+ for pred_edge in preds_edge:
81
+ scale_pred = F.interpolate(input=pred_edge, size=(h, w),
82
+ mode='bilinear', align_corners=True)
83
+ if target[3] is None:
84
+ loss += self.lamda_2 * F.cross_entropy(scale_pred, target[1],
85
+ weights.cuda(), ignore_index=self.ignore_index)
86
+ else:
87
+ soft_scale_edge = F.interpolate(input=target[3], size=(h, w),
88
+ mode='bilinear', align_corners=True)
89
+ soft_scale_edge = moving_average(soft_scale_edge, to_one_hot(target[1], num_cls=2),
90
+ 1.0 / (cycle_n + 1.0))
91
+ loss += self.lamda_2 * self.kldiv(scale_pred, soft_scale_edge, target[0])
92
+
93
+ # consistency regularization
94
+ preds_parsing = preds[0]
95
+ preds_edge = preds[1]
96
+ for pred_parsing in preds_parsing:
97
+ scale_pred = F.interpolate(input=pred_parsing, size=(h, w),
98
+ mode='bilinear', align_corners=True)
99
+ scale_edge = F.interpolate(input=preds_edge[0], size=(h, w),
100
+ mode='bilinear', align_corners=True)
101
+ loss += self.lamda_3 * self.reg(scale_pred, scale_edge, target[0])
102
+
103
+ return loss
104
+
105
+ def forward(self, preds, target, cycle_n=None):
106
+ loss = self.parsing_loss(preds, target, cycle_n)
107
+ return loss
108
+
109
+ def _generate_weights(self, masks, num_classes):
110
+ """
111
+ masks: torch.Tensor with shape [B, H, W]
112
+ """
113
+ masks_label = masks.data.cpu().numpy().astype(np.int64)
114
+ pixel_nums = []
115
+ tot_pixels = 0
116
+ for i in range(num_classes):
117
+ pixel_num_of_cls_i = np.sum(masks_label == i).astype(np.float)
118
+ pixel_nums.append(pixel_num_of_cls_i)
119
+ tot_pixels += pixel_num_of_cls_i
120
+ weights = []
121
+ for i in range(num_classes):
122
+ weights.append(
123
+ (tot_pixels - pixel_nums[i]) / tot_pixels / (num_classes - 1)
124
+ )
125
+ weights = np.array(weights, dtype=np.float)
126
+ # weights = torch.from_numpy(weights).float().to(masks.device)
127
+ return weights
128
+
129
+
130
+ def moving_average(target1, target2, alpha=1.0):
131
+ target = 0
132
+ target += (1.0 - alpha) * target1
133
+ target += target2 * alpha
134
+ return target
135
+
136
+
137
+ def to_one_hot(tensor, num_cls, dim=1, ignore_index=255):
138
+ b, h, w = tensor.shape
139
+ tensor[tensor == ignore_index] = 0
140
+ onehot_tensor = torch.zeros(b, num_cls, h, w).cuda()
141
+ onehot_tensor.scatter_(dim, tensor.unsqueeze(dim), 1)
142
+ return onehot_tensor
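The self-correction update buried in `parsing_loss` is easier to see on a toy tensor: the soft target is a running average of the previous soft prediction and the one-hot ground truth, weighted by `alpha = 1 / (cycle_n + 1)` exactly as in `moving_average` above. Numbers below are made up.

```python
import torch

cycle_n = 2
alpha = 1.0 / (cycle_n + 1.0)

soft_pred = torch.tensor([0.7, 0.2, 0.1])    # previous cycle's soft label (illustrative)
one_hot_gt = torch.tensor([1.0, 0.0, 0.0])   # ground truth as one-hot

new_soft = (1.0 - alpha) * soft_pred + alpha * one_hot_gt   # == moving_average(soft_pred, one_hot_gt, alpha)
print(new_soft)  # tensor([0.8000, 0.1333, 0.0667])
```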
preprocess/humanparsing/utils/encoding.py ADDED
@@ -0,0 +1,187 @@
1
+ ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2
+ ## Created by: Hang Zhang
3
+ ## ECE Department, Rutgers University
4
+ ## Email: zhang.hang@rutgers.edu
5
+ ## Copyright (c) 2017
6
+ ##
7
+ ## This source code is licensed under the MIT-style license found in the
8
+ ## LICENSE file in the root directory of this source tree
9
+ ##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10
+
11
+ """Encoding Data Parallel"""
12
+ import threading
13
+ import torch
14
+ from torch.autograd import Variable, Function
15
+ import torch.cuda.comm as comm
16
+ from torch.nn.parallel.data_parallel import DataParallel
17
+ from torch.nn.parallel.parallel_apply import get_a_var
18
+ from torch.nn.parallel._functions import Broadcast
19
+
20
+ torch_ver = torch.__version__[:3]
21
+
22
+ __all__ = ['allreduce', 'DataParallelModel', 'DataParallelCriterion', 'patch_replication_callback']
23
+
24
+ def allreduce(*inputs):
25
+ """Cross GPU all reduce autograd operation for calculate mean and
26
+ variance in SyncBN.
27
+ """
28
+ return AllReduce.apply(*inputs)
29
+
30
+ class AllReduce(Function):
31
+ @staticmethod
32
+ def forward(ctx, num_inputs, *inputs):
33
+ ctx.num_inputs = num_inputs
34
+ ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
35
+ inputs = [inputs[i:i + num_inputs]
36
+ for i in range(0, len(inputs), num_inputs)]
37
+ # sort before reduce sum
38
+ inputs = sorted(inputs, key=lambda i: i[0].get_device())
39
+ results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
40
+ outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
41
+ return tuple([t for tensors in outputs for t in tensors])
42
+
43
+ @staticmethod
44
+ def backward(ctx, *inputs):
45
+ inputs = [i.data for i in inputs]
46
+ inputs = [inputs[i:i + ctx.num_inputs]
47
+ for i in range(0, len(inputs), ctx.num_inputs)]
48
+ results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
49
+ outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
50
+ return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
51
+
52
+ class Reduce(Function):
53
+ @staticmethod
54
+ def forward(ctx, *inputs):
55
+ ctx.target_gpus = [inputs[i].get_device() for i in range(len(inputs))]
56
+ inputs = sorted(inputs, key=lambda i: i.get_device())
57
+ return comm.reduce_add(inputs)
58
+
59
+ @staticmethod
60
+ def backward(ctx, gradOutput):
61
+ return Broadcast.apply(ctx.target_gpus, gradOutput)
62
+
63
+
64
+ class DataParallelModel(DataParallel):
65
+ """Implements data parallelism at the module level.
66
+
67
+ This container parallelizes the application of the given module by
68
+ splitting the input across the specified devices by chunking in the
69
+ batch dimension.
70
+ In the forward pass, the module is replicated on each device,
71
+ and each replica handles a portion of the input. During the backwards pass, gradients from each replica are summed into the original module.
72
+ Note that the outputs are not gathered, please use compatible
73
+ :class:`encoding.parallel.DataParallelCriterion`.
74
+
75
+ The batch size should be larger than the number of GPUs used. It should
76
+ also be an integer multiple of the number of GPUs so that each chunk is
77
+ the same size (so that each GPU processes the same number of samples).
78
+
79
+ Args:
80
+ module: module to be parallelized
81
+ device_ids: CUDA devices (default: all devices)
82
+
83
+ Reference:
84
+ Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
85
+ Amit Agrawal. “Context Encoding for Semantic Segmentation.
86
+ *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
87
+
88
+ Example::
89
+
90
+ >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
91
+ >>> y = net(x)
92
+ """
93
+ def gather(self, outputs, output_device):
94
+ return outputs
95
+
96
+ def replicate(self, module, device_ids):
97
+ modules = super(DataParallelModel, self).replicate(module, device_ids)
98
+ return modules
99
+
100
+
101
+ class DataParallelCriterion(DataParallel):
102
+ """
103
+ Calculate loss in multiple-GPUs, which balance the memory usage for
104
+ Semantic Segmentation.
105
+
106
+ The targets are split across the specified devices by chunking in
107
+ the batch dimension. Please use together with :class:`encoding.parallel.DataParallelModel`.
108
+
109
+ Reference:
110
+ Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
111
+ Amit Agrawal. “Context Encoding for Semantic Segmentation.
112
+ *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
113
+
114
+ Example::
115
+
116
+ >>> net = encoding.nn.DataParallelModel(model, device_ids=[0, 1, 2])
117
+ >>> criterion = encoding.nn.DataParallelCriterion(criterion, device_ids=[0, 1, 2])
118
+ >>> y = net(x)
119
+ >>> loss = criterion(y, target)
120
+ """
121
+ def forward(self, inputs, *targets, **kwargs):
122
+ # input should already be scattered
123
+ # scattering the targets instead
124
+ if not self.device_ids:
125
+ return self.module(inputs, *targets, **kwargs)
126
+ targets, kwargs = self.scatter(targets, kwargs, self.device_ids)
127
+ if len(self.device_ids) == 1:
128
+ return self.module(inputs, *targets[0], **kwargs[0])
129
+ replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
130
+ outputs = _criterion_parallel_apply(replicas, inputs, targets, kwargs)
131
+ return Reduce.apply(*outputs) / len(outputs)
132
+
133
+
134
+ def _criterion_parallel_apply(modules, inputs, targets, kwargs_tup=None, devices=None):
135
+ assert len(modules) == len(inputs)
136
+ assert len(targets) == len(inputs)
137
+ if kwargs_tup:
138
+ assert len(modules) == len(kwargs_tup)
139
+ else:
140
+ kwargs_tup = ({},) * len(modules)
141
+ if devices is not None:
142
+ assert len(modules) == len(devices)
143
+ else:
144
+ devices = [None] * len(modules)
145
+
146
+ lock = threading.Lock()
147
+ results = {}
148
+ if torch_ver != "0.3":
149
+ grad_enabled = torch.is_grad_enabled()
150
+
151
+ def _worker(i, module, input, target, kwargs, device=None):
152
+ if torch_ver != "0.3":
153
+ torch.set_grad_enabled(grad_enabled)
154
+ if device is None:
155
+ device = get_a_var(input).get_device()
156
+ try:
157
+ if not isinstance(input, tuple):
158
+ input = (input,)
159
+ with torch.cuda.device(device):
160
+ output = module(*(input + target), **kwargs)
161
+ with lock:
162
+ results[i] = output
163
+ except Exception as e:
164
+ with lock:
165
+ results[i] = e
166
+
167
+ if len(modules) > 1:
168
+ threads = [threading.Thread(target=_worker,
169
+ args=(i, module, input, target,
170
+ kwargs, device),)
171
+ for i, (module, input, target, kwargs, device) in
172
+ enumerate(zip(modules, inputs, targets, kwargs_tup, devices))]
173
+
174
+ for thread in threads:
175
+ thread.start()
176
+ for thread in threads:
177
+ thread.join()
178
+ else:
179
+ _worker(0, modules[0], inputs[0], targets[0], kwargs_tup[0], devices[0])
180
+
181
+ outputs = []
182
+ for i in range(len(inputs)):
183
+ output = results[i]
184
+ if isinstance(output, Exception):
185
+ raise output
186
+ outputs.append(output)
187
+ return outputs
preprocess/humanparsing/utils/kl_loss.py ADDED
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : kl_loss.py
8
+ @Time : 7/23/19 4:02 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+
16
+
17
+ def flatten_probas(input, target, labels, ignore=255):
18
+ """
19
+ Flattens predictions in the batch.
20
+ """
21
+ B, C, H, W = input.size()
22
+ input = input.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
23
+ target = target.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
24
+ labels = labels.view(-1)
25
+ if ignore is None:
26
+ return input, target
27
+ valid = (labels != ignore)
28
+ vinput = input[valid.nonzero().squeeze()]
29
+ vtarget = target[valid.nonzero().squeeze()]
30
+ return vinput, vtarget
31
+
32
+
33
+ class KLDivergenceLoss(nn.Module):
34
+ def __init__(self, ignore_index=255, T=1):
35
+ super(KLDivergenceLoss, self).__init__()
36
+ self.ignore_index=ignore_index
37
+ self.T = T
38
+
39
+ def forward(self, input, target, label):
40
+ log_input_prob = F.log_softmax(input / self.T, dim=1)
41
+ target_porb = F.softmax(target / self.T, dim=1)
42
+ loss = F.kl_div(*flatten_probas(log_input_prob, target_porb, label, ignore=self.ignore_index))
43
+ return self.T*self.T*loss # balanced
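The temperature trick in `KLDivergenceLoss`, shown on a single made-up logit vector: log-softmax the student and softmax the teacher at temperature `T`, take the KL divergence, and scale by `T*T` so gradients keep roughly the same magnitude as at `T = 1`.

```python
import torch
import torch.nn.functional as F

T = 2.0
student = torch.tensor([[2.0, 0.5, -1.0]])   # raw logits, illustrative
teacher = torch.tensor([[1.5, 1.0, -0.5]])

loss = T * T * F.kl_div(F.log_softmax(student / T, dim=1),
                        F.softmax(teacher / T, dim=1),
                        reduction='batchmean')
print(loss)
```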
preprocess/humanparsing/utils/lovasz_softmax.py ADDED
@@ -0,0 +1,279 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : lovasz_softmax.py
8
+ @Time : 8/30/19 7:12 PM
9
+ @Desc : Lovasz-Softmax and Jaccard hinge loss in PyTorch
10
+ Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
11
+ @License : This source code is licensed under the license found in the
12
+ LICENSE file in the root directory of this source tree.
13
+ """
14
+
15
+ from __future__ import print_function, division
16
+
17
+ import torch
18
+ from torch.autograd import Variable
19
+ import torch.nn.functional as F
20
+ import numpy as np
21
+ from torch import nn
22
+
23
+ try:
24
+ from itertools import ifilterfalse
25
+ except ImportError: # py3k
26
+ from itertools import filterfalse as ifilterfalse
27
+
28
+
29
+ def lovasz_grad(gt_sorted):
30
+ """
31
+ Computes gradient of the Lovasz extension w.r.t sorted errors
32
+ See Alg. 1 in paper
33
+ """
34
+ p = len(gt_sorted)
35
+ gts = gt_sorted.sum()
36
+ intersection = gts - gt_sorted.float().cumsum(0)
37
+ union = gts + (1 - gt_sorted).float().cumsum(0)
38
+ jaccard = 1. - intersection / union
39
+ if p > 1: # cover 1-pixel case
40
+ jaccard[1:p] = jaccard[1:p] - jaccard[0:-1]
41
+ return jaccard
42
+
43
+
44
+ def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True):
45
+ """
46
+ IoU for foreground class
47
+ binary: 1 foreground, 0 background
48
+ """
49
+ if not per_image:
50
+ preds, labels = (preds,), (labels,)
51
+ ious = []
52
+ for pred, label in zip(preds, labels):
53
+ intersection = ((label == 1) & (pred == 1)).sum()
54
+ union = ((label == 1) | ((pred == 1) & (label != ignore))).sum()
55
+ if not union:
56
+ iou = EMPTY
57
+ else:
58
+ iou = float(intersection) / float(union)
59
+ ious.append(iou)
60
+ iou = mean(ious) # mean accross images if per_image
61
+ return 100 * iou
62
+
63
+
64
+ def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False):
65
+ """
66
+ Array of IoU for each (non ignored) class
67
+ """
68
+ if not per_image:
69
+ preds, labels = (preds,), (labels,)
70
+ ious = []
71
+ for pred, label in zip(preds, labels):
72
+ iou = []
73
+ for i in range(C):
74
+ if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes)
75
+ intersection = ((label == i) & (pred == i)).sum()
76
+ union = ((label == i) | ((pred == i) & (label != ignore))).sum()
77
+ if not union:
78
+ iou.append(EMPTY)
79
+ else:
80
+ iou.append(float(intersection) / float(union))
81
+ ious.append(iou)
82
+ ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image
83
+ return 100 * np.array(ious)
84
+
85
+
86
+ # --------------------------- BINARY LOSSES ---------------------------
87
+
88
+
89
+ def lovasz_hinge(logits, labels, per_image=True, ignore=None):
90
+ """
91
+ Binary Lovasz hinge loss
92
+ logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
93
+ labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
94
+ per_image: compute the loss per image instead of per batch
95
+ ignore: void class id
96
+ """
97
+ if per_image:
98
+ loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore))
99
+ for log, lab in zip(logits, labels))
100
+ else:
101
+ loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore))
102
+ return loss
103
+
104
+
105
+ def lovasz_hinge_flat(logits, labels):
106
+ """
107
+ Binary Lovasz hinge loss
108
+ logits: [P] Variable, logits at each prediction (between -\infty and +\infty)
109
+ labels: [P] Tensor, binary ground truth labels (0 or 1)
110
+ ignore: label to ignore
111
+ """
112
+ if len(labels) == 0:
113
+ # only void pixels, the gradients should be 0
114
+ return logits.sum() * 0.
115
+ signs = 2. * labels.float() - 1.
116
+ errors = (1. - logits * Variable(signs))
117
+ errors_sorted, perm = torch.sort(errors, dim=0, descending=True)
118
+ perm = perm.data
119
+ gt_sorted = labels[perm]
120
+ grad = lovasz_grad(gt_sorted)
121
+ loss = torch.dot(F.relu(errors_sorted), Variable(grad))
122
+ return loss
123
+
124
+
125
+ def flatten_binary_scores(scores, labels, ignore=None):
126
+ """
127
+ Flattens predictions in the batch (binary case)
128
+ Remove labels equal to 'ignore'
129
+ """
130
+ scores = scores.view(-1)
131
+ labels = labels.view(-1)
132
+ if ignore is None:
133
+ return scores, labels
134
+ valid = (labels != ignore)
135
+ vscores = scores[valid]
136
+ vlabels = labels[valid]
137
+ return vscores, vlabels
138
+
139
+
140
+ class StableBCELoss(torch.nn.modules.Module):
141
+ def __init__(self):
142
+ super(StableBCELoss, self).__init__()
143
+
144
+ def forward(self, input, target):
145
+ neg_abs = - input.abs()
146
+ loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log()
147
+ return loss.mean()
148
+
149
+
150
+ def binary_xloss(logits, labels, ignore=None):
151
+ """
152
+ Binary Cross entropy loss
153
+ logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty)
154
+ labels: [B, H, W] Tensor, binary ground truth masks (0 or 1)
155
+ ignore: void class id
156
+ """
157
+ logits, labels = flatten_binary_scores(logits, labels, ignore)
158
+ loss = StableBCELoss()(logits, Variable(labels.float()))
159
+ return loss
160
+
161
+
162
+ # --------------------------- MULTICLASS LOSSES ---------------------------
163
+
164
+
165
+ def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=255, weighted=None):
166
+ """
167
+ Multi-class Lovasz-Softmax loss
168
+ probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1).
169
+ A [B, H, W] input is treated as a single-channel (sigmoid) output.
170
+ labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1)
171
+ classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
172
+ per_image: compute the loss per image instead of per batch
173
+ ignore: void class labels
174
+ """
175
+ if per_image:
176
+ loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes, weighted=weighted)
177
+ for prob, lab in zip(probas, labels))
178
+ else:
179
+ loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes, weighted=weighted)
180
+ return loss
181
+
182
+
183
+ def lovasz_softmax_flat(probas, labels, classes='present', weighted=None):
184
+ """
185
+ Multi-class Lovasz-Softmax loss
186
+ probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
187
+ labels: [P] Tensor, ground truth labels (between 0 and C - 1)
188
+ classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average.
189
+ """
190
+ if probas.numel() == 0:
191
+ # only void pixels, the gradients should be 0
192
+ return probas * 0.
193
+ C = probas.size(1)
194
+ losses = []
195
+ class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes
196
+ for c in class_to_sum:
197
+ fg = (labels == c).float() # foreground for class c
198
+ if (classes == 'present' and fg.sum() == 0):
199
+ continue
200
+ if C == 1:
201
+ if len(classes) > 1:
202
+ raise ValueError('Sigmoid output possible only with 1 class')
203
+ class_pred = probas[:, 0]
204
+ else:
205
+ class_pred = probas[:, c]
206
+ errors = (Variable(fg) - class_pred).abs()
207
+ errors_sorted, perm = torch.sort(errors, 0, descending=True)
208
+ perm = perm.data
209
+ fg_sorted = fg[perm]
210
+ if weighted is not None:
211
+ losses.append(weighted[c]*torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
212
+ else:
213
+ losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted))))
214
+ return mean(losses)
215
+
216
+
217
+ def flatten_probas(probas, labels, ignore=None):
218
+ """
219
+ Flattens predictions in the batch
220
+ """
221
+ if probas.dim() == 3:
222
+ # assumes output of a sigmoid layer
223
+ B, H, W = probas.size()
224
+ probas = probas.view(B, 1, H, W)
225
+ B, C, H, W = probas.size()
226
+ probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
227
+ labels = labels.view(-1)
228
+ if ignore is None:
229
+ return probas, labels
230
+ valid = (labels != ignore)
231
+ vprobas = probas[valid.nonzero().squeeze()]
232
+ vlabels = labels[valid]
233
+ return vprobas, vlabels
234
+
235
+
236
+ def xloss(logits, labels, ignore=None):
237
+ """
238
+ Cross entropy loss
239
+ """
240
+ return F.cross_entropy(logits, Variable(labels), ignore_index=255)
241
+
242
+
243
+ # --------------------------- HELPER FUNCTIONS ---------------------------
244
+ def isnan(x):
245
+ return x != x
246
+
247
+
248
+ def mean(l, ignore_nan=False, empty=0):
249
+ """
250
+ nanmean compatible with generators.
251
+ """
252
+ l = iter(l)
253
+ if ignore_nan:
254
+ l = ifilterfalse(isnan, l)
255
+ try:
256
+ n = 1
257
+ acc = next(l)
258
+ except StopIteration:
259
+ if empty == 'raise':
260
+ raise ValueError('Empty mean')
261
+ return empty
262
+ for n, v in enumerate(l, 2):
263
+ acc += v
264
+ if n == 1:
265
+ return acc
266
+ return acc / n
267
+
268
+ # --------------------------- Class ---------------------------
269
+ class LovaszSoftmax(nn.Module):
270
+ def __init__(self, per_image=False, ignore_index=255, weighted=None):
271
+ super(LovaszSoftmax, self).__init__()
272
+ self.lovasz_softmax = lovasz_softmax
273
+ self.per_image = per_image
274
+ self.ignore_index = ignore_index
275
+ self.weighted = weighted
276
+
277
+ def forward(self, pred, label):
278
+ pred = F.softmax(pred, dim=1)
279
+ return self.lovasz_softmax(pred, label, per_image=self.per_image, ignore=self.ignore_index, weighted=self.weighted)
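A minimal usage sketch for the LovaszSoftmax module above; the batch size and the 20-class setting are assumptions for illustration:

    import torch

    criterion = LovaszSoftmax(per_image=False, ignore_index=255)
    logits = torch.randn(2, 20, 64, 48, requires_grad=True)   # [B, C, H, W] raw scores
    labels = torch.randint(0, 20, (2, 64, 48))                # [B, H, W] class indices
    loss = criterion(logits, labels)                          # softmax is applied inside forward()
    loss.backward()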
preprocess/humanparsing/utils/miou.py ADDED
@@ -0,0 +1,155 @@
1
+ import cv2
2
+ import os
3
+ import numpy as np
4
+
5
+ from collections import OrderedDict
6
+ from PIL import Image as PILImage
7
+ from utils.transforms import transform_parsing
8
+
9
+ LABELS = ['Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'Upper-clothes', 'Dress', 'Coat', \
10
+ 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', 'Right-arm', 'Left-leg',
11
+ 'Right-leg', 'Left-shoe', 'Right-shoe']
12
+
13
+
14
+ # LABELS = ['Background', 'Head', 'Torso', 'Upper Arms', 'Lower Arms', 'Upper Legs', 'Lower Legs']
15
+
16
+ def get_palette(num_cls):
17
+ """ Returns the color map for visualizing the segmentation mask.
18
+ Args:
19
+ num_cls: Number of classes
20
+ Returns:
21
+ The color map
22
+ """
23
+
24
+ n = num_cls
25
+ palette = [0] * (n * 3)
26
+ for j in range(0, n):
27
+ lab = j
28
+ palette[j * 3 + 0] = 0
29
+ palette[j * 3 + 1] = 0
30
+ palette[j * 3 + 2] = 0
31
+ i = 0
32
+ while lab:
33
+ palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
34
+ palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
35
+ palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
36
+ i += 1
37
+ lab >>= 3
38
+ return palette
39
+
40
+
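The palette above is typically attached to a parsing mask with Pillow; a short sketch (the 20-class mask and the output path are illustrative assumptions):

    import numpy as np
    from PIL import Image as PILImage

    parsing = np.random.randint(0, 20, (512, 384)).astype(np.uint8)  # [H, W] class ids
    vis = PILImage.fromarray(parsing)          # 'L' mode, one class id per pixel
    vis.putpalette(get_palette(20))            # attach the color map
    vis.save('parsing_vis.png')                # hypothetical output path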
41
+ def get_confusion_matrix(gt_label, pred_label, num_classes):
42
+ """
43
+ Calculate the confusion matrix from the given ground truth and predicted labels
44
+ :param gt_label: the ground truth label
45
+ :param pred_label: the pred label
46
+ :param num_classes: the number of classes
47
+ :return: the confusion matrix
48
+ """
49
+ index = (gt_label * num_classes + pred_label).astype('int32')
50
+ label_count = np.bincount(index)
51
+ confusion_matrix = np.zeros((num_classes, num_classes))
52
+
53
+ for i_label in range(num_classes):
54
+ for i_pred_label in range(num_classes):
55
+ cur_index = i_label * num_classes + i_pred_label
56
+ if cur_index < len(label_count):
57
+ confusion_matrix[i_label, i_pred_label] = label_count[cur_index]
58
+
59
+ return confusion_matrix
60
+
61
+
62
+ def compute_mean_ioU(preds, scales, centers, num_classes, datadir, input_size=[473, 473], dataset='val'):
63
+ val_file = os.path.join(datadir, dataset + '_id.txt')
64
+ val_id = [i_id.strip() for i_id in open(val_file)]
65
+
66
+ confusion_matrix = np.zeros((num_classes, num_classes))
67
+
68
+ for i, pred_out in enumerate(preds):
69
+ im_name = val_id[i]
70
+ gt_path = os.path.join(datadir, dataset + '_segmentations', im_name + '.png')
71
+ gt = np.array(PILImage.open(gt_path))
72
+ h, w = gt.shape
73
+ s = scales[i]
74
+ c = centers[i]
75
+ pred = transform_parsing(pred_out, c, s, w, h, input_size)
76
+
77
+ gt = np.asarray(gt, dtype=np.int32)
78
+ pred = np.asarray(pred, dtype=np.int32)
79
+
80
+ ignore_index = gt != 255
81
+
82
+ gt = gt[ignore_index]
83
+ pred = pred[ignore_index]
84
+
85
+ confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
86
+
87
+ pos = confusion_matrix.sum(1)
88
+ res = confusion_matrix.sum(0)
89
+ tp = np.diag(confusion_matrix)
90
+
91
+ pixel_accuracy = (tp.sum() / pos.sum()) * 100
92
+ mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
93
+ IoU_array = (tp / np.maximum(1.0, pos + res - tp))
94
+ IoU_array = IoU_array * 100
95
+ mean_IoU = IoU_array.mean()
96
+ print('Pixel accuracy: %f \n' % pixel_accuracy)
97
+ print('Mean accuracy: %f \n' % mean_accuracy)
98
+ print('Mean IU: %f \n' % mean_IoU)
99
+ name_value = []
100
+
101
+ for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
102
+ name_value.append((label, iou))
103
+
104
+ name_value.append(('Pixel accuracy', pixel_accuracy))
105
+ name_value.append(('Mean accuracy', mean_accuracy))
106
+ name_value.append(('Mean IU', mean_IoU))
107
+ name_value = OrderedDict(name_value)
108
+ return name_value
109
+
110
+
111
+ def compute_mean_ioU_file(preds_dir, num_classes, datadir, dataset='val'):
112
+ list_path = os.path.join(datadir, dataset + '_id.txt')
113
+ val_id = [i_id.strip() for i_id in open(list_path)]
114
+
115
+ confusion_matrix = np.zeros((num_classes, num_classes))
116
+
117
+ for i, im_name in enumerate(val_id):
118
+ gt_path = os.path.join(datadir, 'segmentations', im_name + '.png')
119
+ gt = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
120
+
121
+ pred_path = os.path.join(preds_dir, im_name + '.png')
122
+ pred = np.asarray(PILImage.open(pred_path))
123
+
124
+ gt = np.asarray(gt, dtype=np.int32)
125
+ pred = np.asarray(pred, dtype=np.int32)
126
+
127
+ ignore_index = gt != 255
128
+
129
+ gt = gt[ignore_index]
130
+ pred = pred[ignore_index]
131
+
132
+ confusion_matrix += get_confusion_matrix(gt, pred, num_classes)
133
+
134
+ pos = confusion_matrix.sum(1)
135
+ res = confusion_matrix.sum(0)
136
+ tp = np.diag(confusion_matrix)
137
+
138
+ pixel_accuracy = (tp.sum() / pos.sum()) * 100
139
+ mean_accuracy = ((tp / np.maximum(1.0, pos)).mean()) * 100
140
+ IoU_array = (tp / np.maximum(1.0, pos + res - tp))
141
+ IoU_array = IoU_array * 100
142
+ mean_IoU = IoU_array.mean()
143
+ print('Pixel accuracy: %f \n' % pixel_accuracy)
144
+ print('Mean accuracy: %f \n' % mean_accuracy)
145
+ print('Mean IU: %f \n' % mean_IoU)
146
+ name_value = []
147
+
148
+ for i, (label, iou) in enumerate(zip(LABELS, IoU_array)):
149
+ name_value.append((label, iou))
150
+
151
+ name_value.append(('Pixel accuracy', pixel_accuracy))
152
+ name_value.append(('Mean accuracy', mean_accuracy))
153
+ name_value.append(('Mean IU', mean_IoU))
154
+ name_value = OrderedDict(name_value)
155
+ return name_value
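A small self-contained sketch of how get_confusion_matrix() above turns into per-class IoU (the 3-class arrays are made up for illustration):

    import numpy as np

    gt   = np.array([0, 0, 1, 2, 2, 1], dtype=np.int32)
    pred = np.array([0, 1, 1, 2, 2, 2], dtype=np.int32)

    cm  = get_confusion_matrix(gt, pred, num_classes=3)
    pos = cm.sum(1)                                  # ground-truth pixels per class
    res = cm.sum(0)                                  # predicted pixels per class
    tp  = np.diag(cm)                                # correctly classified pixels per class
    print(tp / np.maximum(1.0, pos + res - tp))      # per-class IoU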
preprocess/humanparsing/utils/schp.py ADDED
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : schp.py
8
+ @Time : 4/8/19 2:11 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ import os
15
+ import torch
16
+ import modules
17
+
18
+ def moving_average(net1, net2, alpha=1):
19
+ for param1, param2 in zip(net1.parameters(), net2.parameters()):
20
+ param1.data *= (1.0 - alpha)
21
+ param1.data += param2.data * alpha
22
+
23
+
24
+ def _check_bn(module, flag):
25
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
26
+ flag[0] = True
27
+
28
+
29
+ def check_bn(model):
30
+ flag = [False]
31
+ model.apply(lambda module: _check_bn(module, flag))
32
+ return flag[0]
33
+
34
+
35
+ def reset_bn(module):
36
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
37
+ module.running_mean = torch.zeros_like(module.running_mean)
38
+ module.running_var = torch.ones_like(module.running_var)
39
+
40
+
41
+ def _get_momenta(module, momenta):
42
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
43
+ momenta[module] = module.momentum
44
+
45
+
46
+ def _set_momenta(module, momenta):
47
+ if issubclass(module.__class__, modules.bn.InPlaceABNSync):
48
+ module.momentum = momenta[module]
49
+
50
+
51
+ def bn_re_estimate(loader, model):
52
+ if not check_bn(model):
53
+ print('No batch norm layer detected')
54
+ return
55
+ model.train()
56
+ momenta = {}
57
+ model.apply(reset_bn)
58
+ model.apply(lambda module: _get_momenta(module, momenta))
59
+ n = 0
60
+ for i_iter, batch in enumerate(loader):
61
+ images, labels, _ = batch
62
+ b = images.data.size(0)
63
+ momentum = b / (n + b)
64
+ for module in momenta.keys():
65
+ module.momentum = momentum
66
+ model(images)
67
+ n += b
68
+ model.apply(lambda module: _set_momenta(module, momenta))
69
+
70
+
71
+ def save_schp_checkpoint(states, is_best_parsing, output_dir, filename='schp_checkpoint.pth.tar'):
72
+ save_path = os.path.join(output_dir, filename)
73
+ if os.path.exists(save_path):
74
+ os.remove(save_path)
75
+ torch.save(states, save_path)
76
+ if is_best_parsing and 'state_dict' in states:
77
+ best_save_path = os.path.join(output_dir, 'model_parsing_best.pth.tar')
78
+ if os.path.exists(best_save_path):
79
+ os.remove(best_save_path)
80
+ torch.save(states, best_save_path)
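A hedged sketch of moving_average() on two toy modules; the averaging schedule used here (alpha = 1 / (n_cycles + 1)) is an assumption for illustration, not taken from this repository:

    import torch.nn as nn

    net      = nn.Linear(4, 2)     # stand-in for the freshly trained model
    schp_net = nn.Linear(4, 2)     # stand-in for the running-average (SCHP) model
    n_cycles = 2
    moving_average(schp_net, net, alpha=1.0 / (n_cycles + 1))  # blend schp_net towards net
    # bn_re_estimate(train_loader, schp_net) would then refresh BN statistics,
    # assuming a train_loader yielding (images, labels, meta) batches.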
preprocess/humanparsing/utils/soft_dice_loss.py ADDED
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+
4
+ """
5
+ @Author : Peike Li
6
+ @Contact : peike.li@yahoo.com
7
+ @File : soft_dice_loss.py
8
+ @Time : 8/13/19 5:09 PM
9
+ @Desc :
10
+ @License : This source code is licensed under the license found in the
11
+ LICENSE file in the root directory of this source tree.
12
+ """
13
+
14
+ from __future__ import print_function, division
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from torch import nn
19
+
20
+ try:
21
+ from itertools import ifilterfalse
22
+ except ImportError: # py3k
23
+ from itertools import filterfalse as ifilterfalse
24
+
25
+
26
+ def tversky_loss(probas, labels, alpha=0.5, beta=0.5, epsilon=1e-6):
27
+ '''
28
+ Tversky loss function.
29
+ probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1)
30
+ labels: [P] Tensor, ground truth labels (between 0 and C - 1)
31
+
32
+ Same as soft dice loss when alpha=beta=0.5.
33
+ Same as Jaccord loss when alpha=beta=1.0.
34
+ See `Tversky loss function for image segmentation using 3D fully convolutional deep networks`
35
+ https://arxiv.org/pdf/1706.05721.pdf
36
+ '''
37
+ C = probas.size(1)
38
+ losses = []
39
+ for c in list(range(C)):
40
+ fg = (labels == c).float()
41
+ if fg.sum() == 0:
42
+ continue
43
+ class_pred = probas[:, c]
44
+ p0 = class_pred
45
+ p1 = 1 - class_pred
46
+ g0 = fg
47
+ g1 = 1 - fg
48
+ numerator = torch.sum(p0 * g0)
49
+ denominator = numerator + alpha * torch.sum(p0 * g1) + beta * torch.sum(p1 * g0)
50
+ losses.append(1 - ((numerator) / (denominator + epsilon)))
51
+ return mean(losses)
52
+
53
+
54
+ def flatten_probas(probas, labels, ignore=255):
55
+ """
56
+ Flattens predictions in the batch
57
+ """
58
+ B, C, H, W = probas.size()
59
+ probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C
60
+ labels = labels.view(-1)
61
+ if ignore is None:
62
+ return probas, labels
63
+ valid = (labels != ignore)
64
+ vprobas = probas[valid.nonzero().squeeze()]
65
+ vlabels = labels[valid]
66
+ return vprobas, vlabels
67
+
68
+
69
+ def isnan(x):
70
+ return x != x
71
+
72
+
73
+ def mean(l, ignore_nan=False, empty=0):
74
+ """
75
+ nanmean compatible with generators.
76
+ """
77
+ l = iter(l)
78
+ if ignore_nan:
79
+ l = ifilterfalse(isnan, l)
80
+ try:
81
+ n = 1
82
+ acc = next(l)
83
+ except StopIteration:
84
+ if empty == 'raise':
85
+ raise ValueError('Empty mean')
86
+ return empty
87
+ for n, v in enumerate(l, 2):
88
+ acc += v
89
+ if n == 1:
90
+ return acc
91
+ return acc / n
92
+
93
+
94
+ class SoftDiceLoss(nn.Module):
95
+ def __init__(self, ignore_index=255):
96
+ super(SoftDiceLoss, self).__init__()
97
+ self.ignore_index = ignore_index
98
+
99
+ def forward(self, pred, label):
100
+ pred = F.softmax(pred, dim=1)
101
+ return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=0.5, beta=0.5)
102
+
103
+
104
+ class SoftJaccordLoss(nn.Module):
105
+ def __init__(self, ignore_index=255):
106
+ super(SoftJaccordLoss, self).__init__()
107
+ self.ignore_index = ignore_index
108
+
109
+ def forward(self, pred, label):
110
+ pred = F.softmax(pred, dim=1)
111
+ return tversky_loss(*flatten_probas(pred, label, ignore=self.ignore_index), alpha=1.0, beta=1.0)
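A minimal usage sketch for SoftDiceLoss above; the shapes and class count are illustrative assumptions:

    import torch

    dice = SoftDiceLoss(ignore_index=255)
    logits = torch.randn(2, 20, 32, 24, requires_grad=True)   # [B, C, H, W] raw scores
    labels = torch.randint(0, 20, (2, 32, 24))                # [B, H, W] class indices
    loss = dice(logits, labels)    # softmax, flatten, then Tversky with alpha=beta=0.5
    loss.backward()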
preprocess/humanparsing/utils/transforms.py ADDED
@@ -0,0 +1,167 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Copyright (c) Microsoft
3
+ # Licensed under the MIT License.
4
+ # Written by Bin Xiao (Bin.Xiao@microsoft.com)
5
+ # ------------------------------------------------------------------------------
6
+
7
+ from __future__ import absolute_import
8
+ from __future__ import division
9
+ from __future__ import print_function
10
+
11
+ import numpy as np
12
+ import cv2
13
+ import torch
14
+
15
+ class BRG2Tensor_transform(object):
16
+ def __call__(self, pic):
17
+ img = torch.from_numpy(pic.transpose((2, 0, 1)))
18
+ if isinstance(img, torch.ByteTensor):
19
+ return img.float()
20
+ else:
21
+ return img
22
+
23
+ class BGR2RGB_transform(object):
24
+ def __call__(self, tensor):
25
+ return tensor[[2,1,0],:,:]
26
+
27
+ def flip_back(output_flipped, matched_parts):
28
+ '''
29
+ ouput_flipped: numpy.ndarray(batch_size, num_joints, height, width)
30
+ '''
31
+ assert output_flipped.ndim == 4,\
32
+ 'output_flipped should be [batch_size, num_joints, height, width]'
33
+
34
+ output_flipped = output_flipped[:, :, :, ::-1]
35
+
36
+ for pair in matched_parts:
37
+ tmp = output_flipped[:, pair[0], :, :].copy()
38
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
39
+ output_flipped[:, pair[1], :, :] = tmp
40
+
41
+ return output_flipped
42
+
43
+
44
+ def fliplr_joints(joints, joints_vis, width, matched_parts):
45
+ """
46
+ flip coords
47
+ """
48
+ # Flip horizontal
49
+ joints[:, 0] = width - joints[:, 0] - 1
50
+
51
+ # Change left-right parts
52
+ for pair in matched_parts:
53
+ joints[pair[0], :], joints[pair[1], :] = \
54
+ joints[pair[1], :], joints[pair[0], :].copy()
55
+ joints_vis[pair[0], :], joints_vis[pair[1], :] = \
56
+ joints_vis[pair[1], :], joints_vis[pair[0], :].copy()
57
+
58
+ return joints*joints_vis, joints_vis
59
+
60
+
61
+ def transform_preds(coords, center, scale, input_size):
62
+ target_coords = np.zeros(coords.shape)
63
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
64
+ for p in range(coords.shape[0]):
65
+ target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans)
66
+ return target_coords
67
+
68
+ def transform_parsing(pred, center, scale, width, height, input_size):
69
+
70
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
71
+ target_pred = cv2.warpAffine(
72
+ pred,
73
+ trans,
74
+ (int(width), int(height)),
75
+ flags=cv2.INTER_NEAREST,
76
+ borderMode=cv2.BORDER_CONSTANT,
77
+ borderValue=(0))
78
+
79
+ return target_pred
80
+
81
+ def transform_logits(logits, center, scale, width, height, input_size):
82
+
83
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
84
+ channel = logits.shape[2]
85
+ target_logits = []
86
+ for i in range(channel):
87
+ target_logit = cv2.warpAffine(
88
+ logits[:,:,i],
89
+ trans,
90
+ (int(width), int(height)),
91
+ flags=cv2.INTER_LINEAR,
92
+ borderMode=cv2.BORDER_CONSTANT,
93
+ borderValue=(0))
94
+ target_logits.append(target_logit)
95
+ target_logits = np.stack(target_logits,axis=2)
96
+
97
+ return target_logits
98
+
99
+
100
+ def get_affine_transform(center,
101
+ scale,
102
+ rot,
103
+ output_size,
104
+ shift=np.array([0, 0], dtype=np.float32),
105
+ inv=0):
106
+ if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
107
+ print(scale)
108
+ scale = np.array([scale, scale])
109
+
110
+ scale_tmp = scale
111
+
112
+ src_w = scale_tmp[0]
113
+ dst_w = output_size[1]
114
+ dst_h = output_size[0]
115
+
116
+ rot_rad = np.pi * rot / 180
117
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
118
+ dst_dir = np.array([0, (dst_w-1) * -0.5], np.float32)
119
+
120
+ src = np.zeros((3, 2), dtype=np.float32)
121
+ dst = np.zeros((3, 2), dtype=np.float32)
122
+ src[0, :] = center + scale_tmp * shift
123
+ src[1, :] = center + src_dir + scale_tmp * shift
124
+ dst[0, :] = [(dst_w-1) * 0.5, (dst_h-1) * 0.5]
125
+ dst[1, :] = np.array([(dst_w-1) * 0.5, (dst_h-1) * 0.5]) + dst_dir
126
+
127
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
128
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
129
+
130
+ if inv:
131
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
132
+ else:
133
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
134
+
135
+ return trans
136
+
137
+
138
+ def affine_transform(pt, t):
139
+ new_pt = np.array([pt[0], pt[1], 1.]).T
140
+ new_pt = np.dot(t, new_pt)
141
+ return new_pt[:2]
142
+
143
+
144
+ def get_3rd_point(a, b):
145
+ direct = a - b
146
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
147
+
148
+
149
+ def get_dir(src_point, rot_rad):
150
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
151
+
152
+ src_result = [0, 0]
153
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
154
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
155
+
156
+ return src_result
157
+
158
+
159
+ def crop(img, center, scale, output_size, rot=0):
160
+ trans = get_affine_transform(center, scale, rot, output_size)
161
+
162
+ dst_img = cv2.warpAffine(img,
163
+ trans,
164
+ (int(output_size[1]), int(output_size[0])),
165
+ flags=cv2.INTER_LINEAR)
166
+
167
+ return dst_img
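A short sketch of the affine helpers above: build the crop transform for a person box and warp an image to the parser's input size (all values below are illustrative assumptions):

    import numpy as np
    import cv2

    img    = np.zeros((1024, 768, 3), dtype=np.uint8)        # stand-in for a photo
    center = np.array([384.0, 512.0], dtype=np.float32)      # box centre (x, y)
    scale  = np.array([768.0, 1024.0], dtype=np.float32)     # box size used as the scale
    input_size = (473, 473)                                   # (h, w) fed to the network

    trans   = get_affine_transform(center, scale, 0, input_size)
    cropped = cv2.warpAffine(img, trans, (input_size[1], input_size[0]), flags=cv2.INTER_LINEAR)
    # transform_parsing()/transform_logits() apply the inverse mapping to bring
    # predictions back to the original resolution.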