diff --git a/Leffa/densepose/__init__.py b/Leffa/densepose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b50a3da91dd0d2a69502af9d5d62f2f4280d973f --- /dev/null +++ b/Leffa/densepose/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .data.datasets import builtin # just to register data +from .converters import builtin as builtin_converters # register converters +from .config import ( + add_densepose_config, + add_densepose_head_config, + add_hrnet_config, + add_dataset_category_config, + add_bootstrap_config, + load_bootstrap_config, +) +from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData +from .evaluation import DensePoseCOCOEvaluator +from .modeling.roi_heads import DensePoseROIHeads +from .modeling.test_time_augmentation import ( + DensePoseGeneralizedRCNNWithTTA, + DensePoseDatasetMapperTTA, +) +from .utils.transform import load_from_cfg +from .modeling.hrfpn import build_hrfpn_backbone diff --git a/Leffa/densepose/config.py b/Leffa/densepose/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2a06a09c80865ab987773511b2acc71e232b26ac --- /dev/null +++ b/Leffa/densepose/config.py @@ -0,0 +1,277 @@ +# -*- coding = utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
def add_evaluation_config(cfg: CN) -> None:
    """
    Add the `DENSEPOSE_EVALUATION` section with its default values to `cfg`.

    Args:
        cfg (CfgNode): config node that is extended in place
    """
    cfg.DENSEPOSE_EVALUATION = CN()
    evaluation = cfg.DENSEPOSE_EVALUATION
    # evaluator type, possible values:
    #  - "iou": evaluator for models that produce iou data
    #  - "cse": evaluator for models that produce cse data
    evaluation.TYPE = "iou"
    # storage for DensePose results, possible values:
    #  - "none": no explicit storage, all the results are stored in the
    #            dictionary with predictions, memory intensive;
    #            historically the default storage type
    #  - "ram":  RAM storage, uses per-process RAM storage, which is
    #            reduced to a single process storage on later stages,
    #            less memory intensive
    #  - "file": file storage, uses per-process file-based storage,
    #            the least memory intensive, but may create bottlenecks
    #            on file system accesses
    evaluation.STORAGE = "none"
    # minimum threshold for IOU values: the lower its value is,
    # the more matches are produced (and the higher the AP score)
    evaluation.MIN_IOU_THRESHOLD = 0.5
    # non-distributed inference is slower (at inference time) but can avoid RAM OOM
    evaluation.DISTRIBUTED_INFERENCE = True
    # evaluate mesh alignment based on vertex embeddings, only makes sense in CSE context
    evaluation.EVALUATE_MESH_ALIGNMENT = False
    # meshes to compute mesh alignment for
    evaluation.MESH_ALIGNMENT_MESH_NAMES = []
def get_bootstrap_dataset_config() -> CN:
    """
    Build a fresh config node holding the defaults for one bootstrap dataset entry.

    Return:
        CfgNode with the `DATASET`, `RATIO`, `IMAGE_LOADER`, `INFERENCE`,
        `DATA_SAMPLER` and `FILTER` entries populated with default values
    """
    dataset_cfg = CN()
    dataset_cfg.DATASET = ""
    # ratio used to mix data loaders
    dataset_cfg.RATIO = 0.1

    # image loader
    dataset_cfg.IMAGE_LOADER = CN(new_allowed=True)
    image_loader = dataset_cfg.IMAGE_LOADER
    image_loader.TYPE = ""
    image_loader.BATCH_SIZE = 4
    image_loader.NUM_WORKERS = 4
    image_loader.CATEGORIES = []
    image_loader.MAX_COUNT_PER_CATEGORY = 1_000_000
    image_loader.CATEGORY_TO_CLASS_MAPPING = CN(new_allowed=True)

    # inference
    dataset_cfg.INFERENCE = CN()
    inference = dataset_cfg.INFERENCE
    # batch size for model inputs
    inference.INPUT_BATCH_SIZE = 4
    # batch size to group model outputs
    inference.OUTPUT_BATCH_SIZE = 2

    # sampled data
    dataset_cfg.DATA_SAMPLER = CN(new_allowed=True)
    data_sampler = dataset_cfg.DATA_SAMPLER
    data_sampler.TYPE = ""
    data_sampler.USE_GROUND_TRUTH_CATEGORIES = False

    # filter
    dataset_cfg.FILTER = CN(new_allowed=True)
    dataset_cfg.FILTER.TYPE = ""
    return dataset_cfg
def add_densepose_head_cse_config(cfg: CN) -> None:
    """
    Add configuration options for Continuous Surface Embeddings (CSE).

    The options are stored under `cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE`; the
    `MODEL.ROI_DENSEPOSE_HEAD` node is read here, so it must already exist.

    Args:
        cfg (CfgNode): config node that is extended in place
    """
    head = cfg.MODEL.ROI_DENSEPOSE_HEAD
    head.CSE = CN()
    cse = head.CSE
    # Dimensionality D of the embedding space
    cse.EMBED_SIZE = 16
    # Embedder specifications for various mesh IDs
    cse.EMBEDDERS = CN(new_allowed=True)
    # normalization coefficient for embedding distances
    cse.EMBEDDING_DIST_GAUSS_SIGMA = 0.01
    # normalization coefficient for geodesic distances
    cse.GEODESIC_DIST_GAUSS_SIGMA = 0.01
    # embedding loss weight
    cse.EMBED_LOSS_WEIGHT = 0.6
    # embedding loss name, currently the following options are supported:
    #  - EmbeddingLoss: cross-entropy on vertex labels
    #  - SoftEmbeddingLoss: cross-entropy on vertex label combined with
    #    Gaussian penalty on distance between vertices
    cse.EMBED_LOSS_NAME = "EmbeddingLoss"
    # optimizer hyperparameters
    cse.FEATURES_LR_FACTOR = 1.0
    cse.EMBEDDING_LR_FACTOR = 1.0

    # Shape to shape cycle consistency loss parameters:
    cse.SHAPE_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
    shape_cycle = cse.SHAPE_TO_SHAPE_CYCLE_LOSS
    # shape to shape cycle consistency loss weight
    shape_cycle.WEIGHT = 0.025
    # norm type used for loss computation
    shape_cycle.NORM_P = 2
    # normalization term for embedding similarity matrices
    shape_cycle.TEMPERATURE = 0.05
    # maximum number of vertices to include into shape to shape cycle loss
    # if negative or zero, all vertices are considered
    # if positive, random subset of vertices of given size is considered
    shape_cycle.MAX_NUM_VERTICES = 4936

    # Pixel to shape cycle consistency loss parameters:
    cse.PIX_TO_SHAPE_CYCLE_LOSS = CN({"ENABLED": False})
    pixel_cycle = cse.PIX_TO_SHAPE_CYCLE_LOSS
    # pixel to shape cycle consistency loss weight
    pixel_cycle.WEIGHT = 0.0001
    # norm type used for loss computation
    pixel_cycle.NORM_P = 2
    # map images to all meshes and back (if false, use only gt meshes from the batch)
    pixel_cycle.USE_ALL_MESHES_NOT_GT_ONLY = False
    # Randomly select at most this number of pixels from every instance
    # if negative or zero, all vertices are considered
    pixel_cycle.NUM_PIXELS_TO_SAMPLE = 100
    # normalization factor for pixel to pixel distances (higher value = smoother distribution)
    pixel_cycle.PIXEL_SIGMA = 5.0
    pixel_cycle.TEMPERATURE_PIXEL_TO_VERTEX = 0.05
    pixel_cycle.TEMPERATURE_VERTEX_TO_PIXEL = 0.05
+ """ + _C = cfg + + _C.MODEL.DENSEPOSE_ON = True + + _C.MODEL.ROI_DENSEPOSE_HEAD = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8 + # Number of parts used for point labels + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3 + _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2" + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2 + # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) + _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7 + # Loss weights for annotation masks.(14 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0 + # Loss weights for surface parts. (24 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0 + # Loss weights for UV regression. 
+ _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01 + # Coarse segmentation is trained using instance segmentation task data + _C.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS = False + # For Decoder + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4 + # For DeepLab head + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN" + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0 + # Predictor class name, must be registered in DENSEPOSE_PREDICTOR_REGISTRY + # Some registered predictors: + # "DensePoseChartPredictor": predicts segmentation and UV coordinates for predefined charts + # "DensePoseChartWithConfidencePredictor": predicts segmentation, UV coordinates + # and associated confidences for predefined charts (default) + # "DensePoseEmbeddingWithConfidencePredictor": predicts segmentation, embeddings + # and associated confidences for CSE + _C.MODEL.ROI_DENSEPOSE_HEAD.PREDICTOR_NAME = "DensePoseChartWithConfidencePredictor" + # Loss class name, must be registered in DENSEPOSE_LOSS_REGISTRY + # Some registered losses: + # "DensePoseChartLoss": loss for chart-based models that estimate + # segmentation and UV coordinates + # "DensePoseChartWithConfidenceLoss": loss for chart-based models that estimate + # segmentation, UV coordinates and the corresponding confidences (default) + _C.MODEL.ROI_DENSEPOSE_HEAD.LOSS_NAME = "DensePoseChartWithConfidenceLoss" + # Confidences + # Enable learning UV confidences (variances) along with the actual values + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False}) + # UV confidence lower bound + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01 + # Enable learning segmentation confidences (variances) along with the actual values + 
def add_hrnet_config(cfg: CN) -> None:
    """
    Add config for HRNet backbone.

    Creates `cfg.MODEL.HRNET` with the stem width, the `STAGE2`-`STAGE4`
    sections and the `HRFPN` output channel count.

    Args:
        cfg (CfgNode): config node that is extended in place
    """
    # For HigherHRNet w32
    cfg.MODEL.HRNET = CN()
    hrnet = cfg.MODEL.HRNET
    hrnet.STEM_INPLANES = 64

    # (section name, NUM_MODULES, NUM_BRANCHES, NUM_BLOCKS, NUM_CHANNELS)
    stage_layouts = (
        ("STAGE2", 1, 2, [4, 4], [32, 64]),
        ("STAGE3", 4, 3, [4, 4, 4], [32, 64, 128]),
        ("STAGE4", 3, 4, [4, 4, 4, 4], [32, 64, 128, 256]),
    )
    for section, num_modules, num_branches, num_blocks, num_channels in stage_layouts:
        setattr(hrnet, section, CN())
        stage = getattr(hrnet, section)
        stage.NUM_MODULES = num_modules
        stage.NUM_BRANCHES = num_branches
        stage.BLOCK = "BASIC"
        stage.NUM_BLOCKS = num_blocks
        stage.NUM_CHANNELS = num_channels
        stage.FUSE_METHOD = "SUM"

    hrnet.HRFPN = CN()
    hrnet.HRFPN.OUT_CHANNELS = 256
class BaseConverter:
    """
    Base class shared by the concrete converters.

    A converter turns instances of various source types into one destination
    type. Every source type registers its own conversion callable, and a
    registration also applies to all descendants of the registered type.
    Subclasses are expected to provide the `registry` and `dst_type` class
    attributes used by the methods below.
    """

    @classmethod
    def register(cls, from_type: Type, converter: Any = None):
        """
        Register a converter for the given source type.

        Usable as a plain call (when `converter` is given) or as a decorator
        (when `converter` is None).

        Args:
            from_type (type): type to register the converter for;
                all instances of this type will use the same converter
            converter (callable): converter to be registered for the given
                type; if None, the returned callable is meant to decorate
                the converter
        Return:
            A decorator that registers its argument for `from_type` and
            returns it unchanged (returned in both usage modes)
        """
        if converter is not None:
            cls._do_register(from_type, converter)

        def decorator(converter: Any) -> Any:
            cls._do_register(from_type, converter)
            return converter

        return decorator

    @classmethod
    def _do_register(cls, from_type: Type, converter: Any):
        """Store `converter` in the class registry under `from_type`."""
        cls.registry[from_type] = converter  # pyre-ignore[16]

    @classmethod
    def _lookup_converter(cls, from_type: Type) -> Any:
        """
        Recursively search the registry for a converter for the given type.

        When the converter is found through a base class, it is also
        registered for `from_type` itself so that later lookups are direct.

        Args:
            from_type: type for which to find a converter
        Return:
            callable or None - registered converter or None
                if no suitable entry was found in the registry
        """
        known = cls.registry  # pyre-ignore[16]
        if from_type in known:
            return known[from_type]
        for parent in from_type.__bases__:
            inherited = cls._lookup_converter(parent)
            if inherited is None:
                continue
            # cache the inherited converter for the derived type
            cls._do_register(from_type, inherited)
            return inherited
        return None

    @classmethod
    def convert(cls, instance: Any, *args, **kwargs):
        """
        Convert an instance to the destination type with a registered converter.

        Base classes are searched recursively, so derived classes need no
        explicit registration.

        Args:
            instance: source instance to convert to the destination type
        Return:
            An instance of the destination type obtained from the source instance
            Raises KeyError, if no suitable converter found
        """
        source_type = type(instance)
        converter = cls._lookup_converter(source_type)
        if converter is not None:
            return converter(instance, *args, **kwargs)
        # no converter found: describe the destination in the error message
        destination = "itself" if cls.dst_type is None else cls.dst_type  # pyre-ignore[16]
        raise KeyError(f"Could not find converter from {source_type} to {destination}")
def densepose_chart_predictor_output_hflip(
    densepose_predictor_output: DensePoseChartPredictorOutput,
    transform_data: DensePoseTransformData,
) -> DensePoseChartPredictorOutput:
    """
    Change to take into account a Horizontal flip.

    Every tensor-valued dataclass field is flipped along dimension 3 and
    written back onto the input object, so the input is modified in place.
    The IUV and segmentation semantics are then remapped with
    `transform_data`, and a new instance of the input's type is built from
    the resulting fields.

    Args:
        densepose_predictor_output: chart predictor output to flip
        transform_data: symmetry data handed on to the semantic flip helpers
    Return:
        A new output of the same type as the input; an empty input
        (length 0) is returned unchanged
    """
    if len(densepose_predictor_output) <= 0:
        return densepose_predictor_output

    output_cls = type(densepose_predictor_output)

    # flip tensors
    for spec in fields(densepose_predictor_output):
        current = getattr(densepose_predictor_output, spec.name)
        if isinstance(current, torch.Tensor):
            setattr(densepose_predictor_output, spec.name, torch.flip(current, [3]))

    flipped = _flip_iuv_semantics_tensor(densepose_predictor_output, transform_data)
    flipped = _flip_segm_semantics_tensor(flipped, transform_data)

    return output_cls(**{spec.name: getattr(flipped, spec.name) for spec in fields(flipped)})
def resample_uv_tensors_to_bbox(
    u: torch.Tensor,
    v: torch.Tensor,
    labels: torch.Tensor,
    box_xywh_abs: IntTupleBox,
) -> torch.Tensor:
    """
    Resamples U and V coordinate estimates for the given bounding box

    Args:
        u (tensor [1, C, H, W] of float): U coordinates
        v (tensor [1, C, H, W] of float): V coordinates
        labels (tensor [H, W] of long): labels obtained by resampling segmentation
            outputs for the given bounding box
        box_xywh_abs (tuple of 4 int): bounding box that corresponds to predictor outputs
    Return:
       Resampled U and V coordinates - a tensor [2, H, W] of float
    """
    _, _, box_w, box_h = box_xywh_abs
    # the target size is clamped to at least one pixel in each dimension
    box_w = max(int(box_w), 1)
    box_h = max(int(box_h), 1)
    target_hw = (box_h, box_w)
    u_resized = F.interpolate(u, target_hw, mode="bilinear", align_corners=False)
    v_resized = F.interpolate(v, target_hw, mode="bilinear", align_corners=False)
    uv = torch.zeros([2, box_h, box_w], dtype=torch.float32, device=u.device)
    # channel 0 is skipped; each remaining channel fills the pixels labeled with its index
    for part_id in range(1, u_resized.size(1)):
        part_mask = labels == part_id
        uv[0][part_mask] = u_resized[0, part_id][part_mask]
        uv[1][part_mask] = v_resized[0, part_id][part_mask]
    return uv
def densepose_chart_predictor_output_to_result_with_confidences(
    predictor_output: DensePoseChartPredictorOutput, boxes: Boxes
) -> DensePoseChartResultWithConfidences:
    """
    Convert densepose chart predictor outputs to results

    Args:
        predictor_output (DensePoseChartPredictorOutput): DensePose predictor
            output with confidences to be converted to results, must contain only 1 output
        boxes (Boxes): bounding box that corresponds to the predictor output,
            must contain only 1 bounding box
    Return:
       DensePose chart-based result with confidences (DensePoseChartResultWithConfidences)
    """
    assert len(predictor_output) == 1 and len(boxes) == 1, (
        f"Predictor output to result conversion can operate only single outputs"
        f", got {len(predictor_output)} predictor outputs and {len(boxes)} boxes"
    )

    # convert a copy of the XYXY box tensor to XYWH and take the single box as ints
    xywh_boxes = BoxMode.convert(boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    int_box = make_int_box(xywh_boxes[0])

    part_labels = resample_fine_and_coarse_segm_to_bbox(predictor_output, int_box).squeeze(0)
    uv_coords = resample_uv_to_bbox(predictor_output, part_labels, int_box)
    confidence_kwargs = resample_confidences_to_bbox(predictor_output, part_labels, int_box)
    return DensePoseChartResultWithConfidences(
        labels=part_labels, uv=uv_coords, **confidence_kwargs
    )
def resample_coarse_segm_tensor_to_bbox(coarse_segm: torch.Tensor, box_xywh_abs: IntTupleBox):
    """
    Resample coarse segmentation tensor to the given
    bounding box and derive labels for each pixel of the bounding box

    Args:
        coarse_segm: float tensor of shape [1, K, Hout, Wout]
        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
            corner coordinates, width (W) and height (H)
    Return:
        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
    """
    _, _, box_w, box_h = box_xywh_abs
    # the target size is clamped to at least one pixel in each dimension
    box_w = max(int(box_w), 1)
    box_h = max(int(box_h), 1)
    resized_scores = F.interpolate(
        coarse_segm, (box_h, box_w), mode="bilinear", align_corners=False
    )
    # the label of a pixel is the index of its highest-scoring channel
    return resized_scores.argmax(dim=1)
def resample_fine_and_coarse_segm_to_bbox(predictor_output: Any, box_xywh_abs: IntTupleBox):
    """
    Resample fine and coarse segmentation outputs from a predictor to the given
    bounding box and derive labels for each pixel of the bounding box

    Reads the `fine_segm` and `coarse_segm` attributes of the predictor output
    and delegates to `resample_fine_and_coarse_segm_tensors_to_bbox`.

    Args:
        predictor_output: DensePose predictor output that contains segmentation
            results to be resampled
        box_xywh_abs (tuple of 4 int): bounding box given by its upper-left
            corner coordinates, width (W) and height (H)
    Return:
        Labels for each pixel of the bounding box, a long tensor of size [1, H, W]
    """
    fine_scores = predictor_output.fine_segm
    coarse_scores = predictor_output.coarse_segm
    return resample_fine_and_coarse_segm_tensors_to_bbox(
        fine_segm=fine_scores,
        coarse_segm=coarse_scores,
        box_xywh_abs=box_xywh_abs,
    )
def predictor_output_with_fine_and_coarse_segm_to_mask(
    predictor_output: Any, boxes: Boxes, image_size_hw: ImageSizeType
) -> BitMasks:
    """
    Build image-sized instance masks from coarse and fine segmentation scores.

    Assumes that predictor output has the following attributes:
     - coarse_segm (tensor of size [N, D, H, W]): coarse segmentation
         unnormalized scores for N instances; D is the number of coarse
         segmentation labels, H and W is the resolution of the estimate
     - fine_segm (tensor of size [N, C, H, W]): fine segmentation
         unnormalized scores for N instances; C is the number of fine
         segmentation labels, H and W is the resolution of the estimate

    Args:
        predictor_output: DensePose predictor output to be converted to mask
        boxes (Boxes): bounding boxes that correspond to the DensePose
            predictor outputs
        image_size_hw (tuple [int, int]): image height Himg and width Wimg
    Return:
        BitMasks that contain a bool tensor of size [N, Himg, Wimg] with
        a mask of the size of the image for each instance
    """
    img_h, img_w = image_size_hw
    # Work on a copy so the caller's boxes are left untouched by the conversion.
    xywh_boxes = BoxMode.convert(boxes.tensor.clone(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
    num_instances = len(xywh_boxes)
    masks = torch.zeros(
        (num_instances, img_h, img_w), dtype=torch.bool, device=boxes.tensor.device
    )
    for idx in range(num_instances):
        int_box = make_int_box(xywh_boxes[idx])
        instance_labels = resample_fine_and_coarse_segm_to_bbox(predictor_output[idx], int_box)
        left, top, width, height = int_box
        # Any non-zero label counts as foreground inside the box.
        masks[idx, top : top + height, left : left + width] = instance_labels > 0
    return BitMasks(masks)
class ToChartResultConverter(BaseConverter):
    """
    Converter from DensePose predictor outputs to `DensePoseChartResult`.
    Each DensePose predictor output type has to register its conversion strategy.
    """

    # Own registry, declared here rather than inherited from `BaseConverter`.
    registry = {}
    dst_type = DensePoseChartResult

    @classmethod
    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
    #  inconsistently.
    def convert(cls, predictor_outputs: Any, boxes: Boxes, *args, **kwargs) -> DensePoseChartResult:
        """
        Convert DensePose predictor outputs to a DensePose chart result by
        delegating to `BaseConverter.convert`, which picks a registered
        converter. Does recursive lookup for base classes, so there's no need
        for explicit registration for derived classes.

        Args:
            predictor_outputs: DensePose predictor output to be converted
            boxes (Boxes): bounding boxes that correspond to the DensePose
                predictor outputs
        Return:
            An instance of DensePoseChartResult. If no suitable converter
            was found, raises KeyError
        """
        converted = super().convert(predictor_outputs, boxes, *args, **kwargs)
        return converted
class ToMaskConverter(BaseConverter):
    """
    Converter from DensePose predictor outputs to masks in bit mask format
    (see `BitMasks`). Each DensePose predictor output type has to register
    its conversion strategy.
    """

    # Own registry, declared here rather than inherited from `BaseConverter`.
    registry = {}
    dst_type = BitMasks

    @classmethod
    # pyre-fixme[14]: `convert` overrides method defined in `BaseConverter`
    #  inconsistently.
    def convert(
        cls,
        densepose_predictor_outputs: Any,
        boxes: Boxes,
        image_size_hw: ImageSizeType,
        *args,
        **kwargs
    ) -> BitMasks:
        """
        Convert DensePose predictor outputs to BitMasks by delegating to
        `BaseConverter.convert`, which picks a registered converter. Does
        recursive lookup for base classes, so there's no need for explicit
        registration for derived classes.

        Args:
            densepose_predictor_outputs: DensePose predictor output to be
                converted to BitMasks
            boxes (Boxes): bounding boxes that correspond to the DensePose
                predictor outputs
            image_size_hw (tuple [int, int]): image height and width
        Return:
            An instance of `BitMasks`. If no suitable converter was found,
            raises KeyError
        """
        bit_masks = super().convert(
            densepose_predictor_outputs, boxes, image_size_hw, *args, **kwargs
        )
        return bit_masks
class Trainer(DefaultTrainer):
    """
    DensePose flavour of `DefaultTrainer`.

    Overrides evaluation so that the model's embedder (when it has one) is
    passed to the DensePose evaluator, builds an SGD optimizer with separate
    learning rates for the "features" and "embeddings" parameter groups, and
    can mix inference-based (bootstrap) loaders into the training loader.
    """

    @classmethod
    def extract_embedder_from_model(cls, model: nn.Module) -> Optional[Embedder]:
        """
        Return `model.roi_heads.embedder` if the model has one, else None.
        A `DistributedDataParallel` wrapper is unwrapped first.
        """
        if isinstance(model, nn.parallel.DistributedDataParallel):
            model = model.module
        if hasattr(model, "roi_heads") and hasattr(model.roi_heads, "embedder"):
            return model.roi_heads.embedder
        return None

    # TODO: the only reason to copy the base class code here is to pass the embedder from
    # the model to the evaluator; that should be refactored to avoid unnecessary copy-pasting
    @classmethod
    def test(
        cls,
        cfg: CfgNode,
        model: nn.Module,
        evaluators: Optional[Union[DatasetEvaluator, List[DatasetEvaluator]]] = None,
    ):
        """
        Evaluate the model on every dataset listed in ``cfg.DATASETS.TEST``.

        Args:
            cfg (CfgNode):
            model (nn.Module):
            evaluators (DatasetEvaluator, list[DatasetEvaluator] or None): if None, will call
                :meth:`build_evaluator`. Otherwise, must have the same length as
                ``cfg.DATASETS.TEST``.

        Returns:
            dict: a dict of result metrics; when exactly one dataset was
            processed, the metrics of that dataset are returned directly
        """
        logger = logging.getLogger(__name__)
        if isinstance(evaluators, DatasetEvaluator):
            evaluators = [evaluators]
        if evaluators is not None:
            assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
                len(cfg.DATASETS.TEST), len(evaluators)
            )

        results = OrderedDict()
        for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
            data_loader = cls.build_test_loader(cfg, dataset_name)
            # When evaluators are passed in as arguments,
            # implicitly assume that evaluators can be created before data_loader.
            if evaluators is not None:
                evaluator = evaluators[idx]
            else:
                try:
                    embedder = cls.extract_embedder_from_model(model)
                    evaluator = cls.build_evaluator(cfg, dataset_name, embedder=embedder)
                except NotImplementedError:
                    # `Logger.warn` is a deprecated alias; `warning` is the supported name.
                    logger.warning(
                        "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                        "or implement its `build_evaluator` method."
                    )
                    results[dataset_name] = {}
                    continue
            # With non-distributed inference only the main process runs the evaluation.
            if cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE or comm.is_main_process():
                results_i = inference_on_dataset(model, data_loader, evaluator)
            else:
                results_i = {}
            results[dataset_name] = results_i
            if comm.is_main_process():
                assert isinstance(
                    results_i, dict
                ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                    results_i
                )
                logger.info("Evaluation results for {} in csv format:".format(dataset_name))
                print_csv_format(results_i)

        if len(results) == 1:
            results = list(results.values())[0]
        return results

    @classmethod
    def build_evaluator(
        cls,
        cfg: CfgNode,
        dataset_name: str,
        output_folder: Optional[str] = None,
        embedder: Optional[Embedder] = None,
    ) -> DatasetEvaluators:
        """
        Build the evaluators for one dataset.

        A COCO-style detection evaluator is always created; a DensePose
        evaluator is added when ``cfg.MODEL.DENSEPOSE_ON`` is set.

        Args:
            cfg (CfgNode): configuration options
            dataset_name (str): name of the dataset to evaluate on
            output_folder (str or None): where evaluators store their output;
                defaults to ``<cfg.OUTPUT_DIR>/inference``
            embedder (Embedder or None): embedder passed on to the DensePose evaluator
        Returns:
            DatasetEvaluators wrapping the created evaluators
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluators = []
        distributed = cfg.DENSEPOSE_EVALUATION.DISTRIBUTED_INFERENCE
        # Note: we currently use COCO evaluator for both COCO and LVIS datasets
        # to have compatible metrics. LVIS bbox evaluator could also be used
        # with an adapter to properly handle filtered / mapped categories
        # evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        # if evaluator_type == "coco":
        #     evaluators.append(COCOEvaluator(dataset_name, output_dir=output_folder))
        # elif evaluator_type == "lvis":
        #     evaluators.append(LVISEvaluator(dataset_name, output_dir=output_folder))
        evaluators.append(
            Detectron2COCOEvaluatorAdapter(
                dataset_name, output_dir=output_folder, distributed=distributed
            )
        )
        if cfg.MODEL.DENSEPOSE_ON:
            storage = build_densepose_evaluator_storage(cfg, output_folder)
            evaluators.append(
                DensePoseCOCOEvaluator(
                    dataset_name,
                    distributed,
                    output_folder,
                    evaluator_type=cfg.DENSEPOSE_EVALUATION.TYPE,
                    min_iou_threshold=cfg.DENSEPOSE_EVALUATION.MIN_IOU_THRESHOLD,
                    storage=storage,
                    embedder=embedder,
                    should_evaluate_mesh_alignment=cfg.DENSEPOSE_EVALUATION.EVALUATE_MESH_ALIGNMENT,
                    mesh_alignment_mesh_names=cfg.DENSEPOSE_EVALUATION.MESH_ALIGNMENT_MESH_NAMES,
                )
            )
        return DatasetEvaluators(evaluators)

    @classmethod
    def build_optimizer(cls, cfg: CfgNode, model: nn.Module):
        """
        Build an SGD optimizer from the solver settings in `cfg`.

        The "features" and "embeddings" overrides scale the base learning
        rate by the CSE-specific factors from the config. The optimizer is
        passed through `maybe_add_gradient_clipping` before being returned.
        """
        params = get_default_optimizer_params(
            model,
            base_lr=cfg.SOLVER.BASE_LR,
            weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM,
            bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR,
            weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS,
            overrides={
                "features": {
                    "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.FEATURES_LR_FACTOR,
                },
                "embeddings": {
                    "lr": cfg.SOLVER.BASE_LR * cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDING_LR_FACTOR,
                },
            },
        )
        optimizer = torch.optim.SGD(
            params,
            cfg.SOLVER.BASE_LR,
            momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY,
        )
        # pyre-fixme[6]: For 2nd param expected `Type[Optimizer]` but got `SGD`.
        return maybe_add_gradient_clipping(cfg, optimizer)

    @classmethod
    def build_test_loader(cls, cfg: CfgNode, dataset_name):
        """Build the test loader for `dataset_name` with a non-training `DatasetMapper`."""
        return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False))

    @classmethod
    def build_train_loader(cls, cfg: CfgNode):
        """
        Build the training loader.

        Without inference-based loaders in the config this is the plain
        detection train loader. Otherwise a bootstrap model is built, loaded
        from ``cfg.BOOTSTRAP_MODEL.WEIGHTS`` and used to create
        inference-based loaders, which are combined with the plain loader
        (ratio 1.0 for the plain one) and wrapped in `SampleCountingLoader`.
        """
        data_loader = build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True))
        if not has_inference_based_loaders(cfg):
            return data_loader
        model = cls.build_model(cfg)
        model.to(cfg.BOOTSTRAP_MODEL.DEVICE)
        DetectionCheckpointer(model).resume_or_load(cfg.BOOTSTRAP_MODEL.WEIGHTS, resume=False)
        inference_based_loaders, ratios = build_inference_based_loaders(cfg, model)
        loaders = [data_loader] + inference_based_loaders
        ratios = [1.0] + ratios
        combined_data_loader = build_combined_loader(cfg, loaders, ratios)
        sample_counting_loader = SampleCountingLoader(combined_data_loader)
        return sample_counting_loader

    def build_writers(self):
        """Return the base class writers plus a `SampleCountMetricPrinter`."""
        writers = super().build_writers()
        writers.append(SampleCountMetricPrinter())
        return writers

    @classmethod
    def test_with_TTA(cls, cfg: CfgNode, model):
        """
        Evaluate `model` wrapped for test-time augmentation.

        Evaluators write to ``<cfg.OUTPUT_DIR>/inference_TTA`` and every key
        of the returned result dict gets a ``_TTA`` suffix.
        """
        logger = logging.getLogger("detectron2.trainer")
        # In the end of training, run an evaluation with TTA
        # Only support some R-CNN models.
        logger.info("Running inference with test-time augmentation ...")
        transform_data = load_from_cfg(cfg)
        model = DensePoseGeneralizedRCNNWithTTA(
            cfg, model, transform_data, DensePoseDatasetMapperTTA(cfg)
        )
        evaluators = [
            cls.build_evaluator(
                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
            )
            for name in cfg.DATASETS.TEST
        ]
        res = cls.test(cfg, model, evaluators)  # pyre-ignore[6]
        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
        return res
def build_densepose_embedder(cfg: CfgNode) -> Optional[nn.Module]:
    """
    Build embedder used to embed mesh vertices into an embedding space.
    Embedder contains sub-embedders, one for each mesh ID.

    Args:
        cfg (cfgNode): configuration options
    Return:
        Embedding module, or None when the CSE embedders option in the
        config is empty / falsy
    """
    embedders_option = cfg.MODEL.ROI_DENSEPOSE_HEAD.CSE.EMBEDDERS
    if not embedders_option:
        return None
    return Embedder(cfg)
def _rename_HRNet_weights(weights):
    """
    Prefix HRNet pretrained weights so they match the DensePose backbone.

    A checkpoint is treated as HRNet weights when its "model" entry has
    exactly 1956 keys, 1716 of which start with "stage"; these counts are
    common to all HRNet pretrained weights. In that case a new dict holding
    only a "model" entry is returned, with every key prefixed by
    "backbone.bottom_up.". Any other checkpoint is returned unchanged.
    """
    model_state = weights["model"]
    if len(model_state.keys()) != 1956:
        return weights
    stage_key_count = sum(1 for name in model_state.keys() if name.startswith("stage"))
    if stage_key_count != 1716:
        return weights
    renamed = OrderedDict(
        ("backbone.bottom_up." + str(name), model_state[name]) for name in model_state.keys()
    )
    return {"model": renamed}
Axis 0 represents the number of + images `N` in the input data; axes 1-3 are channels, + height, and width, which may vary between features + (e.g., if a feature pyramid is used). + proposals_with_targets (list[Instances]): length `N` list of + `Instances`. The i-th `Instances` contains instances + (proposals, GT) for the i-th input image, + Returns: + list[Tensor]: filtered features + list[Instances]: filtered proposals + """ + proposals_filtered = [] + # TODO: the commented out code was supposed to correctly deal with situations + # where no valid DensePose GT is available for certain images. The corresponding + # image features were sliced and proposals were filtered. This led to performance + # deterioration, both in terms of runtime and in terms of evaluation results. + # + # feature_mask = torch.ones( + # len(proposals_with_targets), + # dtype=torch.bool, + # device=features[0].device if len(features) > 0 else torch.device("cpu"), + # ) + for i, proposals_per_image in enumerate(proposals_with_targets): + if not proposals_per_image.has("gt_densepose") and ( + not proposals_per_image.has("gt_masks") or not self.keep_masks + ): + # feature_mask[i] = 0 + continue + gt_boxes = proposals_per_image.gt_boxes + est_boxes = proposals_per_image.proposal_boxes + # apply match threshold for densepose head + iou = matched_pairwise_iou(gt_boxes, est_boxes) + iou_select = iou > self.iou_threshold + proposals_per_image = proposals_per_image[iou_select] # pyre-ignore[6] + + N_gt_boxes = len(proposals_per_image.gt_boxes) + assert N_gt_boxes == len(proposals_per_image.proposal_boxes), ( + f"The number of GT boxes {N_gt_boxes} is different from the " + f"number of proposal boxes {len(proposals_per_image.proposal_boxes)}" + ) + # filter out any target without suitable annotation + if self.keep_masks: + gt_masks = ( + proposals_per_image.gt_masks + if hasattr(proposals_per_image, "gt_masks") + else [None] * N_gt_boxes + ) + else: + gt_masks = [None] * N_gt_boxes + gt_densepose = ( 
+ proposals_per_image.gt_densepose + if hasattr(proposals_per_image, "gt_densepose") + else [None] * N_gt_boxes + ) + assert len(gt_masks) == N_gt_boxes + assert len(gt_densepose) == N_gt_boxes + selected_indices = [ + i + for i, (dp_target, mask_target) in enumerate(zip(gt_densepose, gt_masks)) + if (dp_target is not None) or (mask_target is not None) + ] + # if not len(selected_indices): + # feature_mask[i] = 0 + # continue + if len(selected_indices) != N_gt_boxes: + proposals_per_image = proposals_per_image[selected_indices] # pyre-ignore[6] + assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes) + proposals_filtered.append(proposals_per_image) + # features_filtered = [feature[feature_mask] for feature in features] + # return features_filtered, proposals_filtered + return features, proposals_filtered diff --git a/Leffa/densepose/modeling/hrfpn.py b/Leffa/densepose/modeling/hrfpn.py new file mode 100644 index 0000000000000000000000000000000000000000..08ec420fa24e1e8f5074baf2e9ae737aff2ab12e --- /dev/null +++ b/Leffa/densepose/modeling/hrfpn.py @@ -0,0 +1,182 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +""" +MIT License +Copyright (c) 2019 Microsoft +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import BACKBONE_REGISTRY +from detectron2.modeling.backbone.backbone import Backbone + +from .hrnet import build_pose_hrnet_backbone + + +class HRFPN(Backbone): + """HRFPN (High Resolution Feature Pyramids) + Transforms outputs of HRNet backbone so they are suitable for the ROI_heads + arXiv: https://arxiv.org/abs/1904.04514 + Adapted from https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/hrfpn.py + Args: + bottom_up: (list) output of HRNet + in_features (list): names of the input features (output of HRNet) + in_channels (list): number of channels for each branch + out_channels (int): output channels of feature pyramids + n_out_features (int): number of output stages + pooling (str): pooling for generating feature pyramids (from {MAX, AVG}) + share_conv (bool): Have one conv per output, or share one with all the outputs + """ + + def __init__( + self, + bottom_up, + in_features, + n_out_features, + in_channels, + out_channels, + pooling="AVG", + share_conv=False, + ): + super(HRFPN, self).__init__() + assert isinstance(in_channels, list) + self.bottom_up = bottom_up + self.in_features = in_features + self.n_out_features = n_out_features + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.share_conv = share_conv + + if self.share_conv: + self.fpn_conv = nn.Conv2d( + in_channels=out_channels, out_channels=out_channels, kernel_size=3, padding=1 + ) + else: + self.fpn_conv = nn.ModuleList() + for _ in range(self.n_out_features): + self.fpn_conv.append( + nn.Conv2d( + 
in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + ) + ) + + # Custom change: Replaces a simple bilinear interpolation + self.interp_conv = nn.ModuleList() + for i in range(len(self.in_features)): + self.interp_conv.append( + nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels[i], + out_channels=in_channels[i], + kernel_size=4, + stride=2**i, + padding=0, + output_padding=0, + bias=False, + ), + nn.BatchNorm2d(in_channels[i], momentum=0.1), + nn.ReLU(inplace=True), + ) + ) + + # Custom change: Replaces a couple (reduction conv + pooling) by one conv + self.reduction_pooling_conv = nn.ModuleList() + for i in range(self.n_out_features): + self.reduction_pooling_conv.append( + nn.Sequential( + nn.Conv2d(sum(in_channels), out_channels, kernel_size=2**i, stride=2**i), + nn.BatchNorm2d(out_channels, momentum=0.1), + nn.ReLU(inplace=True), + ) + ) + + if pooling == "MAX": + self.pooling = F.max_pool2d + else: + self.pooling = F.avg_pool2d + + self._out_features = [] + self._out_feature_channels = {} + self._out_feature_strides = {} + + for i in range(self.n_out_features): + self._out_features.append("p%d" % (i + 1)) + self._out_feature_channels.update({self._out_features[-1]: self.out_channels}) + self._out_feature_strides.update({self._out_features[-1]: 2 ** (i + 2)}) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, inputs): + bottom_up_features = self.bottom_up(inputs) + assert len(bottom_up_features) == len(self.in_features) + inputs = [bottom_up_features[f] for f in self.in_features] + + outs = [] + for i in range(len(inputs)): + outs.append(self.interp_conv[i](inputs[i])) + shape_2 = min(o.shape[2] for o in outs) + shape_3 = min(o.shape[3] for o in outs) + out = torch.cat([o[:, :, :shape_2, :shape_3] for o in outs], dim=1) + outs 
@BACKBONE_REGISTRY.register()
def build_hrfpn_backbone(cfg, input_shape: ShapeSpec) -> HRFPN:
    """
    Assemble an HRFPN backbone whose bottom-up network is a pose HRNet.

    Args:
        cfg: configuration node; `MODEL.HRNET.STAGE4` (`NUM_CHANNELS`,
            `NUM_BRANCHES`), `MODEL.HRNET.HRFPN.OUT_CHANNELS` and
            `MODEL.ROI_HEADS.IN_FEATURES` are read here
        input_shape (ShapeSpec): forwarded unchanged to
            `build_pose_hrnet_backbone`

    Return:
        HRFPN: configured with "AVG" pooling and `share_conv=False`; the number
            of output features equals `len(cfg.MODEL.ROI_HEADS.IN_FEATURES)`
    """
    stage4_cfg = cfg.MODEL.HRNET.STAGE4
    branch_channels = stage4_cfg.NUM_CHANNELS
    # one input feature name per stage-4 branch: "p1", "p2", ...
    branch_names = ["p%d" % idx for idx in range(1, stage4_cfg.NUM_BRANCHES + 1)]
    num_levels = len(cfg.MODEL.ROI_HEADS.IN_FEATURES)
    level_channels = cfg.MODEL.HRNET.HRFPN.OUT_CHANNELS
    bottom_up = build_pose_hrnet_backbone(cfg, input_shape)
    return HRFPN(
        bottom_up,
        branch_names,
        num_levels,
        branch_channels,
        level_channels,
        pooling="AVG",
        share_conv=False,
    )
class BasicBlock(nn.Module):
    """
    Residual block: two 3x3 convolutions, each followed by batch norm.

    The block input (or `downsample(input)` when a `downsample` module was
    given) is added to the output of the second batch norm, then ReLU is applied.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # `stride` is applied by the first convolution only
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # skip connection, optionally passed through `downsample`
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels):
    """
    Verify that each per-branch sequence has exactly `num_branches` entries.

    `num_blocks`, `num_channels` and `num_inchannels` are checked in that
    order; the first mismatch is logged and raised as `ValueError`.
    `blocks` is accepted but not inspected.
    """
    checks = (
        ("NUM_BRANCHES({}) <> NUM_BLOCKS({})", num_blocks),
        ("NUM_BRANCHES({}) <> NUM_CHANNELS({})", num_channels),
        ("NUM_BRANCHES({}) <> NUM_INCHANNELS({})", num_inchannels),
    )
    for template, per_branch in checks:
        if num_branches == len(per_branch):
            continue
        error_msg = template.format(num_branches, len(per_branch))
        logger.error(error_msg)
        raise ValueError(error_msg)
def forward(self, x):
    """
    Run every branch on its own input, then fuse the branch outputs.

    Args:
        x: sequence indexed by branch, `x[i]` is the input of branch `i`;
            it is only read, never modified
    Return:
        list of tensors; a single-element list when the module has one
        branch, otherwise one fused tensor per entry of `self.fuse_layers`
    """
    if self.num_branches == 1:
        return [self.branches[0](x[0])]

    # Collect branch outputs in a new list rather than assigning into `x`,
    # so the caller's sequence stays intact (and may be a tuple).
    branch_outs = [self.branches[i](x[i]) for i in range(self.num_branches)]

    x_fuse = []

    for i in range(len(self.fuse_layers)):
        # branch 0 contributes as is to output 0 and through a fuse layer otherwise
        y = branch_outs[0] if i == 0 else self.fuse_layers[i][0](branch_outs[0])
        for j in range(1, self.num_branches):
            if i == j:
                y = y + branch_outs[j]
            else:
                # crop the fused map to the size of `y` along dims 2 and 3
                z = self.fuse_layers[i][j](branch_outs[j])[:, :, : y.shape[2], : y.shape[3]]
                y = y + z
        x_fuse.append(self.relu(y))

    return x_fuse
def _get_deconv_cfg(self, deconv_kernel):
    """
    Look up the padding settings that go with a deconvolution kernel size.

    Args:
        deconv_kernel (int): kernel size, one of 4, 3 or 2
    Return:
        tuple `(deconv_kernel, padding, output_padding)`
    Raises:
        ValueError: if `deconv_kernel` is not one of the supported sizes
    """
    if deconv_kernel == 4:
        padding = 1
        output_padding = 0
    elif deconv_kernel == 3:
        padding = 1
        output_padding = 1
    elif deconv_kernel == 2:
        padding = 0
        output_padding = 0
    else:
        # without this branch `padding` would be unbound at the return below
        raise ValueError(
            "Unsupported deconv kernel size {!r}, expected 2, 3 or 4".format(deconv_kernel)
        )

    return deconv_kernel, padding, output_padding
def forward(self, x):
    """
    Apply the stem and stages 2-4, returning the last stage's outputs by name.

    For each stage, branch `i` receives the transition layer's output when
    `transition[i]` is not None; otherwise it receives the previous feature
    unchanged (the stem output for stage 2, `y_list[i]` for stages 3 and 4).
    Transition layers of stages 3 and 4 are always fed the last element of the
    previous stage's output.

    Return:
        dict mapping each name in `self._out_features` to the matching
        stage-4 output
    """
    # stem: conv-bn-relu twice, then layer1
    stem = (self.conv1, self.bn1, self.relu, self.conv2, self.bn2, self.relu, self.layer1)
    for layer in stem:
        x = layer(x)

    branch_inputs = [
        self.transition1[i](x) if self.transition1[i] is not None else x
        for i in range(self.stage2_cfg.NUM_BRANCHES)
    ]
    y_list = self.stage2(branch_inputs)

    later_stages = (
        (self.stage3_cfg, self.transition2, self.stage3),
        (self.stage4_cfg, self.transition3, self.stage4),
    )
    for stage_cfg, transition, stage in later_stages:
        branch_inputs = [
            transition[i](y_list[-1]) if transition[i] is not None else y_list[i]
            for i in range(stage_cfg.NUM_BRANCHES)
        ]
        y_list = stage(branch_inputs)

    assert len(self._out_features) == len(y_list)
    return dict(zip(self._out_features, y_list))  # final_outputs
def densepose_inference(densepose_predictor_output: Any, detections: List[Instances]) -> None:
    """
    Splits DensePose predictor outputs into chunks, each chunk corresponds to
    detections on one image. Predictor output chunks are stored in `pred_densepose`
    attribute of the corresponding `Instances` object.

    Args:
        densepose_predictor_output: a dataclass instance (can be of different types,
            depending on predictor used for inference). Each field can be `None`
            (if the corresponding output was not inferred) or a tensor of size
            [N, ...], where N = N_1 + N_2 + .. + N_k is a total number of
            detections on all images, N_1 is the number of detections on image 1,
            N_2 is the number of detections on image 2, etc.
            If the whole output is `None`, no `pred_densepose` attribute is added.
        detections: a list of objects of type `Instance`, k-th object corresponds
            to detections on k-th image.
    """
    if densepose_predictor_output is None:
        # don't add `pred_densepose` attribute
        return

    PredictorOutput = type(densepose_predictor_output)
    # we assume here that `densepose_predictor_output` is a dataclass object;
    # field values are the same for every image, so they are read only once
    field_values = [
        (field.name, getattr(densepose_predictor_output, field.name))
        for field in fields(densepose_predictor_output)
    ]

    k = 0
    for detection_i in detections:
        n_i = len(detection_i)
        # slice tensors, leave others as is
        output_i_dict = {
            name: value[k : k + n_i] if isinstance(value, torch.Tensor) else value
            for name, value in field_values
        }
        detection_i.pred_densepose = PredictorOutput(**output_i_dict)
        k += n_i
and its affiliates. + +from .chart import DensePoseChartLoss +from .chart_with_confidences import DensePoseChartWithConfidenceLoss +from .cse import DensePoseCseLoss +from .registry import DENSEPOSE_LOSS_REGISTRY + + +__all__ = [ + "DensePoseChartLoss", + "DensePoseChartWithConfidenceLoss", + "DensePoseCseLoss", + "DENSEPOSE_LOSS_REGISTRY", +] diff --git a/Leffa/densepose/modeling/losses/chart.py b/Leffa/densepose/modeling/losses/chart.py new file mode 100644 index 0000000000000000000000000000000000000000..02cdae8db3a41fc197be7fcc792c7119c7a21726 --- /dev/null +++ b/Leffa/densepose/modeling/losses/chart.py @@ -0,0 +1,291 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +from typing import Any, List +import torch +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.structures import Instances + +from .mask_or_segm import MaskOrSegmentationLoss +from .registry import DENSEPOSE_LOSS_REGISTRY +from .utils import ( + BilinearInterpolationHelper, + ChartBasedAnnotationsAccumulator, + LossDict, + extract_packed_annotations_from_matches, +) + + +@DENSEPOSE_LOSS_REGISTRY.register() +class DensePoseChartLoss: + """ + DensePose loss for chart-based training. A mesh is split into charts, + each chart is given a label (I) and parametrized by 2 coordinates referred to + as U and V. Ground truth consists of a number of points annotated with + I, U and V values and coarse segmentation S defined for all pixels of the + object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`), + semantic segmentation annotations can be used as ground truth inputs as well. 
def __call__(
    self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
) -> LossDict:
    """
    Compute the chart-based DensePose losses for a batch

    Args:
        proposals_with_gt (list of Instances): detections with associated ground truth data
        densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
            with estimated values; assumed to have the following attributes:
            * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
            * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
            * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            where N is the number of detections, C is the number of fine segmentation
            labels, S is the estimate size ( = width = height) and D is the number of
            coarse segmentation channels.

    Return:
        dict: str -> tensor: the entries produced by `produce_densepose_losses_uv`
        and `produce_densepose_losses_segm`, or the entries of
        `produce_fake_densepose_losses` when there are no proposals, no packed
        annotations, or no valid point with a fine segmentation label > 0
    """
    # densepose outputs are computed for all images and all bounding boxes;
    # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
    # the outputs will have size(0) == 3+1+2+1 == 7

    if len(proposals_with_gt) == 0:
        return self.produce_fake_densepose_losses(densepose_predictor_outputs)

    packed_annotations = extract_packed_annotations_from_matches(
        proposals_with_gt, ChartBasedAnnotationsAccumulator()
    )

    # NOTE: we need to keep the same computation graph on all the GPUs to
    # perform reduction properly. Hence even if we have no data on one
    # of the GPUs, we still need to generate the computation graph.
    # Add fake (zero) loss in the form Tensor.sum() * 0
    if packed_annotations is None:
        return self.produce_fake_densepose_losses(densepose_predictor_outputs)

    height, width = densepose_predictor_outputs.u.shape[2:]
    interpolator = BilinearInterpolationHelper.from_matches(
        packed_annotations,
        (height, width),
    )

    # keep only valid points whose fine segmentation label is > 0
    j_valid = interpolator.j_valid  # pyre-ignore[16]
    j_valid_fg = j_valid * (packed_annotations.fine_segm_labels_gt > 0)
    # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`.
    has_fg_points = torch.any(j_valid_fg)
    if not has_fg_points:
        return self.produce_fake_densepose_losses(densepose_predictor_outputs)

    # both loss producers take the same positional arguments
    loss_args = (
        proposals_with_gt,
        densepose_predictor_outputs,
        packed_annotations,
        interpolator,
        j_valid_fg,  # pyre-ignore[6]
    )
    losses = dict(self.produce_densepose_losses_uv(*loss_args))
    losses.update(self.produce_densepose_losses_segm(*loss_args))
    return losses
def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
    """
    Fake losses for fine / coarse segmentation. These are used when
    no suitable ground truth data was found in a batch. The loss has a value 0
    and is primarily used to construct the computation graph, so that
    `DistributedDataParallel` has similar graphs on all GPUs and can perform
    reduction properly.

    Args:
        densepose_predictor_outputs: DensePose predictor outputs, an object
            of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
    Return:
        dict: str -> tensor: dict of losses with the following entries:
         * `loss_densepose_I`: has value 0
         * `loss_densepose_S`: the value returned by `self.segm_loss.fake_value`
             for the predictor outputs; this entry is always present
    """
    return {
        "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0,
        "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
    }
def produce_densepose_losses_segm(
    self,
    proposals_with_gt: List[Instances],
    densepose_predictor_outputs: Any,
    packed_annotations: Any,
    interpolator: BilinearInterpolationHelper,
    j_valid_fg: torch.Tensor,
) -> LossDict:
    """
    Segmentation losses: cross-entropy between fine segmentation scores
    sampled at the valid annotated points and their ground truth labels,
    plus the coarse segmentation loss computed by `self.segm_loss`.

    Args:
        proposals_with_gt (list of Instances): detections with associated ground truth data
        densepose_predictor_outputs: DensePose predictor outputs, an object
            of a dataclass that is assumed to have the following attributes:
            * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
            * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
        packed_annotations: packed ground truth; `fine_segm_labels_gt` is read here
        interpolator: supplies `j_valid`, the four `w_*` weights and `extract_at_points`
        j_valid_fg: not used by this method
    Return:
        dict: str -> tensor: dict of losses with the following entries:
         * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
             segmentation estimates given ground truth labels, times `self.w_part`
         * `loss_densepose_S`: value of `self.segm_loss`, times `self.w_segm`
    """
    valid_points = interpolator.j_valid  # pyre-ignore[16]
    fine_segm_gt = packed_annotations.fine_segm_labels_gt[valid_points]

    # each interpolation weight gets a trailing axis via `[:, None]`
    weight_names = ("w_ylo_xlo", "w_ylo_xhi", "w_yhi_xlo", "w_yhi_xhi")
    corner_weights = {
        name: getattr(interpolator, name)[:, None] for name in weight_names  # pyre-ignore[16]
    }
    sampled_scores = interpolator.extract_at_points(
        densepose_predictor_outputs.fine_segm,
        slice_fine_segm=slice(None),
        **corner_weights,
    )
    fine_segm_est = sampled_scores[interpolator.j_valid, :]

    loss_fine = F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part
    loss_coarse = (
        self.segm_loss(proposals_with_gt, densepose_predictor_outputs, packed_annotations)
        * self.w_segm
    )
    return {"loss_densepose_I": loss_fine, "loss_densepose_S": loss_coarse}
@dataclass
class PackedCseAnnotations:
    """
    Annotations of a batch packed into single tensors, as produced by
    `CseAnnotationsAccumulator.pack`: the per-instance values collected by
    the accumulator are concatenated along dimension 0.
    """

    # concatenated `x` values of the GT densepose data
    x_gt: torch.Tensor
    # concatenated `y` values of the GT densepose data
    y_gt: torch.Tensor
    # concatenated `segm` tensors; None unless every accumulated instance had one
    coarse_segm_gt: Optional[torch.Tensor]
    # mesh id of each point's instance, repeated to match `vertex_ids_gt`
    vertex_mesh_ids_gt: torch.Tensor
    # concatenated `vertex_ids` values of the GT densepose data
    vertex_ids_gt: torch.Tensor
    # GT boxes, one row of 4 values per accumulated instance
    bbox_xywh_gt: torch.Tensor
    # estimated (proposal) boxes, one row of 4 values per accumulated instance
    bbox_xywh_est: torch.Tensor
    # per point: index of its box counted among boxes that have densepose data
    point_bbox_with_dp_indices: torch.Tensor
    # per point: index of its box counted among all boxes
    point_bbox_indices: torch.Tensor
    # per accumulated instance: index of its box counted among all boxes
    bbox_indices: torch.Tensor
+ # no densepose GT for the detections, just increase the bbox index + self.nxt_bbox_index += n_matches + return + for box_xywh_est, box_xywh_gt, dp_gt in zip( + boxes_xywh_est, boxes_xywh_gt, instances_one_image.gt_densepose + ): + if (dp_gt is not None) and (len(dp_gt.x) > 0): + # pyre-fixme[6]: For 1st argument expected `Tensor` but got `float`. + # pyre-fixme[6]: For 2nd argument expected `Tensor` but got `float`. + self._do_accumulate(box_xywh_gt, box_xywh_est, dp_gt) + self.nxt_bbox_index += 1 + + def _do_accumulate(self, box_xywh_gt: torch.Tensor, box_xywh_est: torch.Tensor, dp_gt: Any): + """ + Accumulate instances data for one image, given that the data is not empty + + Args: + box_xywh_gt (tensor): GT bounding box + box_xywh_est (tensor): estimated bounding box + dp_gt: GT densepose data with the following attributes: + - x: normalized X coordinates + - y: normalized Y coordinates + - segm: tensor of size [S, S] with coarse segmentation + - + """ + self.x_gt.append(dp_gt.x) + self.y_gt.append(dp_gt.y) + if hasattr(dp_gt, "segm"): + self.s_gt.append(dp_gt.segm.unsqueeze(0)) + self.vertex_ids_gt.append(dp_gt.vertex_ids) + self.vertex_mesh_ids_gt.append(torch.full_like(dp_gt.vertex_ids, dp_gt.mesh_id)) + self.bbox_xywh_gt.append(box_xywh_gt.view(-1, 4)) + self.bbox_xywh_est.append(box_xywh_est.view(-1, 4)) + self.point_bbox_with_dp_indices.append( + torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_with_dp_index) + ) + self.point_bbox_indices.append(torch.full_like(dp_gt.vertex_ids, self.nxt_bbox_index)) + self.bbox_indices.append(self.nxt_bbox_index) + self.nxt_bbox_with_dp_index += 1 + + def pack(self) -> Optional[PackedCseAnnotations]: + """ + Pack data into tensors + """ + if not len(self.x_gt): + # TODO: + # returning proper empty annotations would require + # creating empty tensors of appropriate shape and + # type on an appropriate device; + # we return None so far to indicate empty annotations + return None + return PackedCseAnnotations( + 
class MaskOrSegmentationLoss:
    """
    Cross-entropy loss for raw unnormalized coarse segmentation scores.

    The ground truth comes either from mask annotations or from coarse
    segmentation annotations; the choice is made by the config value
    MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS.
    """

    def __init__(self, cfg: CfgNode):
        """
        Build the underlying loss objects from configuration options

        Args:
            cfg (CfgNode): configuration options
        """
        densepose_head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        self.segm_trained_by_masks = densepose_head_cfg.COARSE_SEGM_TRAINED_BY_MASKS
        # the mask loss is only needed (and only created) in mask-supervised mode
        if self.segm_trained_by_masks:
            self.mask_loss = MaskLoss()
        self.segm_loss = SegmentationLoss(cfg)

    def __call__(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
    ) -> torch.Tensor:
        """
        Compute the segmentation loss using whichever ground truth source
        was selected in the configuration.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
                with estimated values; assumed to have the following attributes:
                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
            packed_annotations: packed annotations for efficient loss computation;
                not used when training by masks
        Return:
            tensor: loss value as cross-entropy for raw unnormalized scores
                given ground truth labels
        """
        if not self.segm_trained_by_masks:
            return self.segm_loss(
                proposals_with_gt, densepose_predictor_outputs, packed_annotations
            )
        return self.mask_loss(proposals_with_gt, densepose_predictor_outputs)

    def fake_value(self, densepose_predictor_outputs: Any) -> torch.Tensor:
        """
        Zero-valued loss for batches without suitable ground truth data.
        It still depends on `coarse_segm`, so the computation graph is built
        and `DistributedDataParallel` sees similar graphs on all GPUs and can
        perform reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have `coarse_segm`
                attribute
        Return:
            Zero value loss with proper computation graph
        """
        coarse_segm = densepose_predictor_outputs.coarse_segm
        return coarse_segm.sum() * 0
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseChartPredictor(nn.Module):
    """
    Last layers of a chart-based DensePose model: takes DensePose head outputs
    and produces 4 tensors which represent DensePose results for predefined
    body parts (patches / charts):
     * coarse segmentation, a tensor of shape [N, K, Hout, Wout]
     * fine segmentation, a tensor of shape [N, C, Hout, Wout]
     * U coordinates, a tensor of shape [N, C, Hout, Wout]
     * V coordinates, a tensor of shape [N, C, Hout, Wout]
    where
     - N is the number of instances
     - K is the number of coarse segmentation channels (
         2 = foreground / background,
         15 = one of 14 body parts / background)
     - C is the number of fine segmentation channels (
         24 fine body parts / background)
     - Hout and Wout are height and width of predictions
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Create the four deconvolution branches from configuration options

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): input tensor size along the channel dimension
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        n_segm_chan = head_cfg.NUM_COARSE_SEGM_CHANNELS
        n_patch_chan = head_cfg.NUM_PATCHES + 1
        kernel_size = head_cfg.DECONV_KERNEL
        deconv_padding = int(kernel_size / 2 - 1)

        def make_deconv(n_out_chan):
            # all branches share input size, kernel, stride and padding
            return ConvTranspose2d(
                input_channels, n_out_chan, kernel_size, stride=2, padding=deconv_padding
            )

        # creation order is kept: coarse segmentation, fine segmentation, U, V
        self.ann_index_lowres = make_deconv(n_segm_chan)
        self.index_uv_lowres = make_deconv(n_patch_chan)
        self.u_lowres = make_deconv(n_patch_chan)
        self.v_lowres = make_deconv(n_patch_chan)
        self.scale_factor = head_cfg.UP_SCALE
        initialize_module_params(self)

    def interp2d(self, tensor_nchw: torch.Tensor):
        """
        Upscale a tensor with bilinear interpolation

        Args:
            tensor_nchw (tensor): tensor of shape (N, C, H, W)
        Return:
            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
                by applying the scale factor to H and W
        """
        upscaled = interpolate(
            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        return upscaled

    def forward(self, head_outputs: torch.Tensor):
        """
        Run every branch on the head outputs and upscale the results

        Args:
            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
        Return:
           An instance of DensePoseChartPredictorOutput
        """
        coarse_segm = self.interp2d(self.ann_index_lowres(head_outputs))
        fine_segm = self.interp2d(self.index_uv_lowres(head_outputs))
        u = self.interp2d(self.u_lowres(head_outputs))
        v = self.interp2d(self.v_lowres(head_outputs))
        return DensePoseChartPredictorOutput(
            coarse_segm=coarse_segm, fine_segm=fine_segm, u=u, v=v
        )
class DensePoseChartConfidencePredictorMixin:
    """
    Predictor contains the last layers of a DensePose model that take DensePose head
    outputs as an input and produce model outputs. Confidence predictor mixin is used
    to generate confidences for segmentation and UV tensors estimated by some
    base predictor. Several assumptions need to hold for the base predictor:
    1) the `forward` method must return SIUV tuple as the first result (
        S = coarse segmentation, I = fine segmentation, U and V are intrinsic
        chart coordinates)
    2) `interp2d` method must be defined to perform bilinear interpolation;
        the same method is typically used for SIUV and confidences
    Confidence predictor mixin provides confidence estimates, as described in:
        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
            from Noisy Labels, NeurIPS 2019
        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize confidence predictor using configuration options.

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        # we rely on base predictor to call nn.Module.__init__
        super().__init__(cfg, input_channels)  # pyre-ignore[19]
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        self._initialize_confidence_estimation_layers(cfg, input_channels)
        self._registry = {}
        initialize_module_params(self)  # pyre-ignore[6]

    def _unknown_uv_confidence_type_error(self) -> ValueError:
        """
        Build the error raised for a UV confidence type this mixin cannot handle.

        The message reports `uv_confidence.type`, i.e. the very value the
        callers branch on. It previously read `confidence_model_type` from the
        config object, an attribute nothing else in this class uses
        (NOTE(review): presumably it does not exist, which would turn the
        intended ValueError into an AttributeError; confirm against
        DensePoseConfidenceModelConfig).

        Return:
            ValueError instance, to be raised by the caller
        """
        return ValueError(
            f"Unknown confidence model type: {self.confidence_model_cfg.uv_confidence.type}"
        )

    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
        """
        Initialize confidence estimation layers based on configuration options

        Args:
            cfg (CfgNode): configuration options
            dim_in (int): number of input channels
        Raises:
            ValueError: if UV confidence is enabled with an unsupported type
        """
        dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        # same padding rule for every deconvolution created below
        padding = int(kernel_size / 2 - 1)
        uv_confidence = self.confidence_model_cfg.uv_confidence
        if uv_confidence.enabled:
            if uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                self.sigma_2_lowres = ConvTranspose2d(  # pyre-ignore[16]
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=padding
                )
            elif uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
                self.sigma_2_lowres = ConvTranspose2d(
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=padding
                )
                self.kappa_u_lowres = ConvTranspose2d(  # pyre-ignore[16]
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=padding
                )
                self.kappa_v_lowres = ConvTranspose2d(  # pyre-ignore[16]
                    dim_in, dim_out_patches, kernel_size, stride=2, padding=padding
                )
            else:
                raise self._unknown_uv_confidence_type_error()
        if self.confidence_model_cfg.segm_confidence.enabled:
            # single-channel confidence maps for fine and coarse segmentation
            self.fine_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
                dim_in, 1, kernel_size, stride=2, padding=padding
            )
            self.coarse_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
                dim_in, 1, kernel_size, stride=2, padding=padding
            )

    def forward(self, head_outputs: torch.Tensor):
        """
        Perform forward operation on head outputs used as inputs for the predictor.
        Calls forward method from the base predictor and uses its outputs to compute
        confidences.

        Args:
            head_outputs (Tensor): head outputs used as predictor inputs
        Return:
            An instance of outputs with confidences,
            see `decorate_predictor_output_class_with_confidences`
        Raises:
            ValueError: if UV confidence is enabled with an unsupported type
        """
        # assuming base class returns SIUV estimates in its first result
        base_predictor_outputs = super().forward(head_outputs)  # pyre-ignore[16]

        # create output instance by extending base predictor outputs:
        output = self._create_output_instance(base_predictor_outputs)

        uv_confidence = self.confidence_model_cfg.uv_confidence
        if uv_confidence.enabled:
            # assuming base class defines interp2d method for bilinear interpolation
            if uv_confidence.type == DensePoseUVConfidenceType.IID_ISO:
                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))  # pyre-ignore[16]
            elif uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO:
                output.sigma_2 = self.interp2d(self.sigma_2_lowres(head_outputs))
                output.kappa_u = self.interp2d(self.kappa_u_lowres(head_outputs))  # pyre-ignore[16]
                output.kappa_v = self.interp2d(self.kappa_v_lowres(head_outputs))  # pyre-ignore[16]
            else:
                raise self._unknown_uv_confidence_type_error()

        segm_confidence = self.confidence_model_cfg.segm_confidence
        if segm_confidence.enabled:
            # base predictor outputs are assumed to have `fine_segm` and `coarse_segm` attributes
            # base predictor is assumed to define `interp2d` method for bilinear interpolation
            # softplus keeps the confidence positive; epsilon is added on top of it
            output.fine_segm_confidence = (
                F.softplus(
                    self.interp2d(self.fine_segm_confidence_lowres(head_outputs))  # pyre-ignore[16]
                )
                + segm_confidence.epsilon
            )
            # the single confidence channel is repeated to scale every segmentation channel
            output.fine_segm = base_predictor_outputs.fine_segm * torch.repeat_interleave(
                output.fine_segm_confidence, base_predictor_outputs.fine_segm.shape[1], dim=1
            )
            output.coarse_segm_confidence = (
                F.softplus(
                    self.interp2d(
                        self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
                    )
                )
                + segm_confidence.epsilon
            )
            output.coarse_segm = base_predictor_outputs.coarse_segm * torch.repeat_interleave(
                output.coarse_segm_confidence, base_predictor_outputs.coarse_segm.shape[1], dim=1
            )

        return output

    def _create_output_instance(self, base_predictor_outputs: Any):
        """
        Create an instance of predictor outputs by copying the outputs from the
        base predictor and initializing confidence

        Args:
            base_predictor_outputs: an instance of base predictor outputs
                (the outputs type is assumed to be a dataclass)
        Return:
           An instance of outputs with confidences
        """
        PredictorOutput = decorate_predictor_output_class_with_confidences(
            type(base_predictor_outputs)  # pyre-ignore[6]
        )
        # base_predictor_outputs is assumed to be a dataclass
        # reassign all the fields from base_predictor_outputs (no deep copy!), add new fields
        output = PredictorOutput(
            **base_predictor_outputs.__dict__,
            coarse_segm_confidence=None,
            fine_segm_confidence=None,
            sigma_1=None,
            sigma_2=None,
            kappa_u=None,
            kappa_v=None,
        )
        return output
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseEmbeddingPredictor(nn.Module):
    """
    Final layers of a DensePose model for continuous surface embeddings (CSE):
    DensePose head outputs go in, coarse segmentation and embedding come out.
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Create the coarse segmentation and embedding branches from
        configuration options

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): input tensor size along the channel dimension
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        n_segm_chan = head_cfg.NUM_COARSE_SEGM_CHANNELS
        embed_size = head_cfg.CSE.EMBED_SIZE
        kernel_size = head_cfg.DECONV_KERNEL
        # both deconvolutions share stride and padding
        deconv_kwargs = {"stride": 2, "padding": int(kernel_size / 2 - 1)}
        # coarse segmentation branch is created first, then the embedding branch
        self.coarse_segm_lowres = ConvTranspose2d(
            input_channels, n_segm_chan, kernel_size, **deconv_kwargs
        )
        self.embed_lowres = ConvTranspose2d(
            input_channels, embed_size, kernel_size, **deconv_kwargs
        )
        self.scale_factor = head_cfg.UP_SCALE
        initialize_module_params(self)

    def interp2d(self, tensor_nchw: torch.Tensor):
        """
        Upscale a tensor with bilinear interpolation

        Args:
            tensor_nchw (tensor): tensor of shape (N, C, H, W)
        Return:
            tensor of shape (N, C, Hout, Wout), where Hout and Wout are computed
                by applying the scale factor to H and W
        """
        upscaled = interpolate(
            tensor_nchw, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        return upscaled

    def forward(self, head_outputs):
        """
        Run both branches on the head outputs and upscale the results

        Args:
            head_outputs (tensor): DensePose head outputs, tensor of shape [N, D, H, W]
        Return:
            An instance of DensePoseEmbeddingPredictorOutput
        """
        return DensePoseEmbeddingPredictorOutput(
            embedding=self.interp2d(self.embed_lowres(head_outputs)),
            coarse_segm=self.interp2d(self.coarse_segm_lowres(head_outputs)),
        )
class DensePoseEmbeddingConfidencePredictorMixin:
    """
    Mixin that adds a coarse segmentation confidence estimate to a CSE
    predictor. It is meant to be listed before the base predictor among the
    bases of a class and relies on the base predictor for:
    1) `forward`, whose result must expose a `coarse_segm` attribute and
        whose fields can be copied into the extended output type
    2) `interp2d`, used to upscale the confidence map the same way the base
        predictor upscales its own outputs
    Confidence predictor mixin provides confidence estimates, as described in:
        N. Neverova et al., Correlated Uncertainty for Learning Dense Correspondences
            from Noisy Labels, NeurIPS 2019
        A. Sanakoyeu et al., Transferring Dense Pose to Proximal Animal Classes, CVPR 2020
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Set up the base predictor, then the confidence layers.

        Args:
            cfg (CfgNode): configuration options
            input_channels (int): number of input channels
        """
        # nn.Module.__init__ is expected to be called by the base predictor
        super().__init__(cfg, input_channels)  # pyre-ignore[19]
        self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg)
        self._initialize_confidence_estimation_layers(cfg, input_channels)
        self._registry = {}
        initialize_module_params(self)  # pyre-ignore[6]

    def _initialize_confidence_estimation_layers(self, cfg: CfgNode, dim_in: int):
        """
        Create the confidence layer if segmentation confidence is enabled

        Args:
            cfg (CfgNode): configuration options
            dim_in (int): number of input channels
        """
        kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL
        if not self.confidence_model_cfg.segm_confidence.enabled:
            return
        # one output channel: a single confidence value per location
        self.coarse_segm_confidence_lowres = ConvTranspose2d(  # pyre-ignore[16]
            dim_in, 1, kernel_size, stride=2, padding=int(kernel_size / 2 - 1)
        )

    def forward(self, head_outputs: torch.Tensor):
        """
        Run the base predictor, then extend its outputs with confidences.

        Args:
            head_outputs (Tensor): head outputs used as predictor inputs
        Return:
            An instance of outputs with confidences,
            see `decorate_cse_predictor_output_class_with_confidences`
        """
        base_predictor_outputs = super().forward(head_outputs)  # pyre-ignore[16]
        # same fields as the base outputs plus (yet unset) confidence fields
        output = self._create_output_instance(base_predictor_outputs)

        segm_confidence_cfg = self.confidence_model_cfg.segm_confidence
        if not segm_confidence_cfg.enabled:
            return output

        confidence_lowres = self.coarse_segm_confidence_lowres(head_outputs)  # pyre-ignore[16]
        # softplus keeps the confidence positive; epsilon is added on top of it
        confidence = (
            F.softplus(self.interp2d(confidence_lowres))  # pyre-ignore[16]
            + segm_confidence_cfg.epsilon
        )
        output.coarse_segm_confidence = confidence
        # the single confidence channel is repeated to scale every segmentation channel
        base_coarse_segm = base_predictor_outputs.coarse_segm
        output.coarse_segm = base_coarse_segm * torch.repeat_interleave(
            confidence, base_coarse_segm.shape[1], dim=1
        )
        return output

    def _create_output_instance(self, base_predictor_outputs: Any):
        """
        Wrap base predictor outputs into an output type that additionally
        has a `coarse_segm_confidence` field (initialized to None)

        Args:
            base_predictor_outputs: an instance of base predictor outputs
                (the outputs type is assumed to be a dataclass)
        Return:
           An instance of outputs with confidences
        """
        output_cls = decorate_cse_predictor_output_class_with_confidences(
            type(base_predictor_outputs)  # pyre-ignore[6]
        )
        # fields are reassigned from the base outputs, not deep-copied
        return output_cls(
            **base_predictor_outputs.__dict__,
            coarse_segm_confidence=None,
        )
@DENSEPOSE_PREDICTOR_REGISTRY.register()
class DensePoseEmbeddingWithConfidencePredictor(
    DensePoseEmbeddingConfidencePredictorMixin, DensePoseEmbeddingPredictor
):
    """
    CSE predictor with confidence estimation: all behavior is inherited, with
    the confidence mixin placed ahead of the CSE predictor in the MRO.
    """
and its affiliates. + +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.layers import Conv2d + +from .registry import ROI_DENSEPOSE_HEAD_REGISTRY + + +@ROI_DENSEPOSE_HEAD_REGISTRY.register() +class DensePoseDeepLabHead(nn.Module): + """ + DensePose head using DeepLabV3 model from + "Rethinking Atrous Convolution for Semantic Image Segmentation" + . + """ + + def __init__(self, cfg: CfgNode, input_channels: int): + super(DensePoseDeepLabHead, self).__init__() + # fmt: off + hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM + kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL + norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM + self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS + self.use_nonlocal = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON + # fmt: on + pad_size = kernel_size // 2 + n_channels = input_channels + + self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels) # 6, 12, 56 + self.add_module("ASPP", self.ASPP) + + if self.use_nonlocal: + self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True) + self.add_module("NLBlock", self.NLBlock) + # weight_init.c2_msra_fill(self.ASPP) + + for i in range(self.n_stacked_convs): + norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None + layer = Conv2d( + n_channels, + hidden_dim, + kernel_size, + stride=1, + padding=pad_size, + bias=not norm, + norm=norm_module, + ) + weight_init.c2_msra_fill(layer) + n_channels = hidden_dim + layer_name = self._get_layer_name(i) + self.add_module(layer_name, layer) + self.n_out_channels = hidden_dim + # initialize_module_params(self) + + def forward(self, features): + x0 = features + x = self.ASPP(x0) + if self.use_nonlocal: + x = self.NLBlock(x) + output = x + for i in range(self.n_stacked_convs): + layer_name = self._get_layer_name(i) + x = getattr(self, layer_name)(x) + x = F.relu(x) + output = x + 
# Copied from
# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py
# See https://arxiv.org/pdf/1706.05587.pdf for details
class ASPPConv(nn.Sequential):
    """
    One atrous branch: 3x3 dilated convolution (padding equal to the dilation,
    no bias), GroupNorm with 32 groups, ReLU.
    """

    def __init__(self, in_channels, out_channels, dilation):
        super().__init__(
            nn.Conv2d(
                in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False
            ),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(),
        )


class ASPPPooling(nn.Sequential):
    """
    Image-level branch: global average pooling, 1x1 convolution, GroupNorm,
    ReLU; the result is resized back to the input's spatial size.
    """

    def __init__(self, in_channels, out_channels):
        layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(),
        ]
        super().__init__(*layers)

    def forward(self, x):
        height_width = x.shape[-2:]
        pooled = super().forward(x)
        return F.interpolate(pooled, size=height_width, mode="bilinear", align_corners=False)


class ASPP(nn.Module):
    """
    Five parallel branches (1x1 convolution, three atrous convolutions,
    image-level pooling) whose outputs are concatenated along the channel
    dimension and projected back to `out_channels` by a 1x1 convolution
    followed by ReLU.
    """

    def __init__(self, in_channels, atrous_rates, out_channels):
        super().__init__()
        # exactly three rates are expected; anything else fails on unpacking
        rate1, rate2, rate3 = tuple(atrous_rates)

        # branch order fixes both the module names and the channel layout
        branches = [
            nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.GroupNorm(32, out_channels),
                nn.ReLU(),
            )
        ]
        branches.extend(
            ASPPConv(in_channels, out_channels, rate) for rate in (rate1, rate2, rate3)
        )
        branches.append(ASPPPooling(in_channels, out_channels))
        self.convs = nn.ModuleList(branches)

        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
            nn.ReLU(),
        )

    def forward(self, x):
        branch_outputs = [branch(x) for branch in self.convs]
        return self.project(torch.cat(branch_outputs, dim=1))
# copied from
# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py
# See https://arxiv.org/abs/1711.07971 for details
class _NonLocalBlockND(nn.Module):
    """
    Non-local block (embedded Gaussian version) for 1D, 2D or 3D inputs.

    `theta` and `phi` produce the pairwise attention weights, `g` the values
    that are aggregated, `W` maps the result back to `in_channels`; the input
    is added to the result. `W` (or its normalization layer) starts with zero
    weight and bias, so a freshly built block returns its input.
    """

    def __init__(
        self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True
    ):
        super().__init__()

        assert dimension in [1, 2, 3]

        self.dimension = dimension
        self.sub_sample = sub_sample

        self.in_channels = in_channels
        self.inter_channels = inter_channels

        # default: half of the input channels, but at least one
        if self.inter_channels is None:
            self.inter_channels = in_channels // 2
            if self.inter_channels == 0:
                self.inter_channels = 1

        # convolution type, pooling type and pooling kernel per dimensionality
        layer_types = {
            3: (nn.Conv3d, nn.MaxPool3d, (1, 2, 2)),
            2: (nn.Conv2d, nn.MaxPool2d, (2, 2)),
            1: (nn.Conv1d, nn.MaxPool1d, 2),
        }
        conv_nd, pool_nd, pool_kernel = layer_types[dimension]
        max_pool_layer = pool_nd(kernel_size=pool_kernel)
        # GroupNorm is used for every dimensionality
        bn = nn.GroupNorm

        def pointwise_conv(n_in, n_out):
            return conv_nd(
                in_channels=n_in, out_channels=n_out, kernel_size=1, stride=1, padding=0
            )

        # creation order (g, W, theta, phi) is kept
        self.g = pointwise_conv(self.in_channels, self.inter_channels)

        if bn_layer:
            self.W = nn.Sequential(
                pointwise_conv(self.inter_channels, self.in_channels),
                bn(32, self.in_channels),
            )
            zero_initialized = self.W[1]
        else:
            self.W = pointwise_conv(self.inter_channels, self.in_channels)
            zero_initialized = self.W
        nn.init.constant_(zero_initialized.weight, 0)
        nn.init.constant_(zero_initialized.bias, 0)

        self.theta = pointwise_conv(self.in_channels, self.inter_channels)
        self.phi = pointwise_conv(self.in_channels, self.inter_channels)

        if sub_sample:
            # the same pooling layer instance follows both g and phi
            self.g = nn.Sequential(self.g, max_pool_layer)
            self.phi = nn.Sequential(self.phi, max_pool_layer)

    def forward(self, x):
        """
        :param x: (b, c, t, h, w)
        :return: tensor of the same size as x
        """
        batch_size = x.size(0)
        inter_channels = self.inter_channels

        # values and queries are laid out as (b, positions, inter_channels)
        g_x = self.g(x).view(batch_size, inter_channels, -1).permute(0, 2, 1)
        theta_x = self.theta(x).view(batch_size, inter_channels, -1).permute(0, 2, 1)
        # keys stay as (b, inter_channels, positions)
        phi_x = self.phi(x).view(batch_size, inter_channels, -1)

        attention = F.softmax(torch.matmul(theta_x, phi_x), dim=-1)

        aggregated = torch.matmul(attention, g_x).permute(0, 2, 1).contiguous()
        aggregated = aggregated.view(batch_size, inter_channels, *x.size()[2:])
        return self.W(aggregated) + x


class NONLocalBlock2D(_NonLocalBlockND):
    """
    Non-local block with `dimension` fixed to 2.
    """

    def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True):
        super().__init__(
            in_channels,
            inter_channels=inter_channels,
            dimension=2,
            sub_sample=sub_sample,
            bn_layer=bn_layer,
        )
class Decoder(nn.Module):
    """
    A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper
    (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from
    all levels of the FPN into single output.
    """

    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features):
        """
        Build one conv(+upsample) stack per entry of `in_features` and a shared
        1x1 predictor.

        Args:
            cfg: config node; the `MODEL.ROI_DENSEPOSE_HEAD.DECODER_*` options are read
            input_shape: mapping from feature name to its `ShapeSpec`
                (`stride` and `channels` are used)
            in_features: names of the features to decode, in the order in which
                they are later passed to `forward`
        """
        super().__init__()

        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        num_classes = head_cfg.DECODER_NUM_CLASSES
        conv_dims = head_cfg.DECODER_CONV_DIMS
        norm = head_cfg.DECODER_NORM
        self.in_features = in_features
        self.common_stride = head_cfg.DECODER_COMMON_STRIDE

        self.scale_heads = []
        for name in self.in_features:
            stride = input_shape[name].stride
            in_channels = input_shape[name].channels
            needs_upsample = stride != self.common_stride
            # one stage per factor of 2 between this level's stride and the
            # common stride, but always at least one stage
            num_stages = max(1, int(np.log2(stride) - np.log2(self.common_stride)))

            stages = []
            for stage in range(num_stages):
                conv = Conv2d(
                    in_channels if stage == 0 else conv_dims,
                    conv_dims,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=not norm,
                    norm=get_norm(norm, conv_dims),
                    activation=F.relu,
                )
                weight_init.c2_msra_fill(conv)
                stages.append(conv)
                if needs_upsample:
                    stages.append(
                        nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
                    )

            head = nn.Sequential(*stages)
            self.scale_heads.append(head)
            # `scale_heads` is a plain list, so register each head explicitly
            self.add_module(name, head)

        self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0)
        weight_init.c2_msra_fill(self.predictor)

    def forward(self, features: List[torch.Tensor]):
        """
        Sum the per-level head outputs and apply the predictor.

        Args:
            features: one tensor per entry of `in_features`, in the same order
        """
        merged = None
        for level, head in enumerate(self.scale_heads):
            scaled = head(features[level])
            merged = scaled if merged is None else merged + scaled
        return self.predictor(merged)
def _forward_densepose(self, features: Dict[str, torch.Tensor], instances: List[Instances]):
    """
    Forward logic of the densepose prediction branch.

    Args:
        features (dict[str, Tensor]): input data as a mapping from feature
            map name to tensor. Axis 0 represents the number of images `N` in
            the input data; axes 1-3 are channels, height, and width, which may
            vary between feature maps (e.g., if a feature pyramid is used).
        instances (list[Instances]): length `N` list of `Instances`. The i-th
            `Instances` contains instances for the i-th input image,
            In training, they can be the proposals.
            In inference, they can be the predicted boxes.

    Returns:
        In training, a dict of losses (empty when the head is disabled or when
        no proposals are left after filtering).
        In inference, update `instances` with new fields "densepose" and return it.
    """
    if not self.densepose_on:
        return {} if self.training else instances

    features_list = [features[f] for f in self.in_features]

    if self.training:
        proposals, _ = select_foreground_proposals(instances, self.num_classes)
        features_list, proposals = self.densepose_data_filter(features_list, proposals)
        if len(proposals) == 0:
            # Nothing to supervise in this batch. The caller merges the result
            # with `losses.update(...)`, so an empty dict (not None) must be
            # returned here.
            return {}

        proposal_boxes = [x.proposal_boxes for x in proposals]

        if self.use_decoder:
            features_list = [self.decoder(features_list)]

        features_dp = self.densepose_pooler(features_list, proposal_boxes)
        densepose_head_outputs = self.densepose_head(features_dp)
        densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
        densepose_loss_dict = self.densepose_losses(
            proposals, densepose_predictor_outputs, embedder=self.embedder
        )
        return densepose_loss_dict

    pred_boxes = [x.pred_boxes for x in instances]

    if self.use_decoder:
        features_list = [self.decoder(features_list)]

    features_dp = self.densepose_pooler(features_list, pred_boxes)
    if len(features_dp) > 0:
        densepose_head_outputs = self.densepose_head(features_dp)
        densepose_predictor_outputs = self.densepose_predictor(densepose_head_outputs)
    else:
        # no boxes were pooled; inference is still called with None outputs
        densepose_predictor_outputs = None

    densepose_inference(densepose_predictor_outputs, instances)
    return instances
@ROI_DENSEPOSE_HEAD_REGISTRY.register()
class DensePoseV1ConvXHead(nn.Module):
    """
    Fully convolutional DensePose head: a stack of same-padded convolutions,
    each followed by ReLU.
    """

    def __init__(self, cfg: CfgNode, input_channels: int):
        """
        Initialize DensePose fully convolutional head

        Args:
            cfg (CfgNode): configuration options; `CONV_HEAD_DIM`,
                `CONV_HEAD_KERNEL` and `NUM_STACKED_CONVS` of
                `MODEL.ROI_DENSEPOSE_HEAD` are read
            input_channels (int): number of input channels
        """
        super().__init__()
        head_cfg = cfg.MODEL.ROI_DENSEPOSE_HEAD
        hidden_dim = head_cfg.CONV_HEAD_DIM
        kernel_size = head_cfg.CONV_HEAD_KERNEL
        self.n_stacked_convs = head_cfg.NUM_STACKED_CONVS

        padding = kernel_size // 2
        in_ch = input_channels
        for idx in range(self.n_stacked_convs):
            conv = Conv2d(in_ch, hidden_dim, kernel_size, stride=1, padding=padding)
            self.add_module(self._get_layer_name(idx), conv)
            in_ch = hidden_dim
        # equals `input_channels` when no convolution is stacked
        self.n_out_channels = in_ch
        initialize_module_params(self)

    def forward(self, features: torch.Tensor):
        """
        Apply DensePose fully convolutional head to the input features

        Args:
            features (tensor): input features
        Result:
            A tensor of DensePose head outputs
        """
        output = features
        for idx in range(self.n_stacked_convs):
            conv = getattr(self, self._get_layer_name(idx))
            output = F.relu(conv(output))
        return output

    def _get_layer_name(self, i: int):
        """Return the attribute name of the i-th (0-based) convolution; names are 1-based."""
        return "body_conv_fcn{}".format(i + 1)
class DensePoseDatasetMapperTTA(DatasetMapperTTA):
    """
    Test-time-augmentation mapper that appends one rotated copy of the input
    per configured angle to the augmentations produced by `DatasetMapperTTA`.
    """

    def __init__(self, cfg):
        super().__init__(cfg=cfg)
        # rotation angles read from cfg.TEST.AUG.ROTATION_ANGLES
        self.angles = cfg.TEST.AUG.ROTATION_ANGLES

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): dataset dict whose "image" entry is a CHW tensor

        Returns:
            list[dict]: the parent's augmented dicts followed by one dict per angle
        """
        augmented = super().__call__(dataset_dict=dataset_dict)
        hwc_image = dataset_dict["image"].permute(1, 2, 0).numpy()
        for angle in self.angles:
            rotation = RandomRotation(angle=angle, expand=True)
            rotated_hwc, rotation_tfms = apply_transform_gens([rotation], np.copy(hwc_image))
            rotated_chw = np.ascontiguousarray(rotated_hwc.transpose(2, 0, 1))

            entry = copy.deepcopy(dataset_dict)
            # In DatasetMapperTTA, there is a pre_tfm transform (resize or no-op) that is
            # added at the beginning of each TransformList. That's '.transforms[0]'.
            pre_tfm = augmented[-1]["transforms"].transforms[0]
            entry["transforms"] = TransformList([pre_tfm] + rotation_tfms.transforms)
            entry["image"] = torch.from_numpy(rotated_chw)
            augmented.append(entry)
        return augmented
+ batch_size (int): batch the augmented images into this batch size for inference. + """ + self._transform_data = transform_data.to(model.device) + super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size) + + # the implementation follows closely the one from detectron2/modeling + def _inference_one_image(self, input): + """ + Args: + input (dict): one dataset dict with "image" field being a CHW tensor + + Returns: + dict: one output dict + """ + orig_shape = (input["height"], input["width"]) + # For some reason, resize with uint8 slightly increases box AP but decreases densepose AP + input["image"] = input["image"].to(torch.uint8) + augmented_inputs, tfms = self._get_augmented_inputs(input) + # Detect boxes from all augmented versions + with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]): + # temporarily disable roi heads + all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms) + merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape) + + if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON: + # Use the detected boxes to obtain new fields + augmented_instances = self._rescale_detected_boxes( + augmented_inputs, merged_instances, tfms + ) + # run forward on the detected boxes + outputs = self._batch_inference(augmented_inputs, augmented_instances) + # Delete now useless variables to avoid being out of memory + del augmented_inputs, augmented_instances + # average the predictions + if self.cfg.MODEL.MASK_ON: + merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms) + if self.cfg.MODEL.DENSEPOSE_ON: + merged_instances.pred_densepose = self._reduce_pred_densepose(outputs, tfms) + # postprocess + merged_instances = detector_postprocess(merged_instances, *orig_shape) + return {"instances": merged_instances} + else: + return {"instances": merged_instances} + + def _get_augmented_boxes(self, augmented_inputs, tfms): + # Heavily based on 
detectron2/modeling/test_time_augmentation.py + # Only difference is that RotationTransform is excluded from bbox computation + # 1: forward with all augmented images + outputs = self._batch_inference(augmented_inputs) + # 2: union the results + all_boxes = [] + all_scores = [] + all_classes = [] + for output, tfm in zip(outputs, tfms): + # Need to inverse the transforms on boxes, to obtain results on original image + if not any(isinstance(t, RotationTransform) for t in tfm.transforms): + # Some transforms can't compute bbox correctly + pred_boxes = output.pred_boxes.tensor + original_pred_boxes = tfm.inverse().apply_box(pred_boxes.cpu().numpy()) + all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device)) + all_scores.extend(output.scores) + all_classes.extend(output.pred_classes) + all_boxes = torch.cat(all_boxes, dim=0) + return all_boxes, all_scores, all_classes + + def _reduce_pred_densepose(self, outputs, tfms): + # Should apply inverse transforms on densepose preds. + # We assume only rotation, resize & flip are used. pred_masks is a scale-invariant + # representation, so we handle the other ones specially + for idx, (output, tfm) in enumerate(zip(outputs, tfms)): + for t in tfm.transforms: + for attr in ["coarse_segm", "fine_segm", "u", "v"]: + setattr( + output.pred_densepose, + attr, + _inverse_rotation( + getattr(output.pred_densepose, attr), output.pred_boxes.tensor, t + ), + ) + if any(isinstance(t, HFlipTransform) for t in tfm.transforms): + output.pred_densepose = HFlipConverter.convert( + output.pred_densepose, self._transform_data + ) + self._incremental_avg_dp(outputs[0].pred_densepose, output.pred_densepose, idx) + return outputs[0].pred_densepose + + # incrementally computed average: u_(n + 1) = u_n + (x_(n+1) - u_n) / (n + 1). 
+ def _incremental_avg_dp(self, avg, new_el, idx): + for attr in ["coarse_segm", "fine_segm", "u", "v"]: + setattr(avg, attr, (getattr(avg, attr) * idx + getattr(new_el, attr)) / (idx + 1)) + if idx: + # Deletion of the > 0 index intermediary values to prevent GPU OOM + setattr(new_el, attr, None) + return avg + + +def _inverse_rotation(densepose_attrs, boxes, transform): + # resample outputs to image size and rotate back the densepose preds + # on the rotated images to the space of the original image + if len(boxes) == 0 or not isinstance(transform, RotationTransform): + return densepose_attrs + boxes = boxes.int().cpu().numpy() + wh_boxes = boxes[:, 2:] - boxes[:, :2] # bboxes in the rotated space + inv_boxes = rotate_box_inverse(transform, boxes).astype(int) # bboxes in original image + wh_diff = (inv_boxes[:, 2:] - inv_boxes[:, :2] - wh_boxes) // 2 # diff between new/old bboxes + rotation_matrix = torch.tensor([transform.rm_image]).to(device=densepose_attrs.device).float() + rotation_matrix[:, :, -1] = 0 + # To apply grid_sample for rotation, we need to have enough space to fit the original and + # rotated bboxes. l_bds and r_bds are the left/right bounds that will be used to + # crop the difference once the rotation is done + l_bds = np.maximum(0, -wh_diff) + for i in range(len(densepose_attrs)): + if min(wh_boxes[i]) <= 0: + continue + densepose_attr = densepose_attrs[[i]].clone() + # 1. Interpolate densepose attribute to size of the rotated bbox + densepose_attr = F.interpolate(densepose_attr, wh_boxes[i].tolist()[::-1], mode="bilinear") + # 2. Pad the interpolated attribute so it has room for the original + rotated bbox + densepose_attr = F.pad(densepose_attr, tuple(np.repeat(np.maximum(0, wh_diff[i]), 2))) + # 3. Compute rotation grid and transform + grid = F.affine_grid(rotation_matrix, size=densepose_attr.shape) + densepose_attr = F.grid_sample(densepose_attr, grid) + # 4. 
def rotate_box_inverse(rot_tfm, rotated_box):
    """
    rotated_box is a N * 4 array of [x0, y0, x1, y1] boxes
    When a bbox is rotated, it gets bigger, because we need to surround the tilted bbox
    So when a bbox is rotated then inverse-rotated, it is much bigger than the original
    This function aims to invert the rotation on the box, but also resize it to its original size

    Args:
        rot_tfm: rotation transform; `inverse().apply_box`, `abs_cos` and
            `abs_sin` are used
        rotated_box: N * 4 array of boxes in the rotated image

    Returns:
        N * 4 array of boxes in the original image, shrunk to their
        pre-rotation width/height

    Raises:
        AssertionError: if the rotation is (numerically) 45 degrees, for which
            the size equations below are singular
    """
    abs_cos, abs_sin = rot_tfm.abs_cos, rot_tfm.abs_sin
    denominator = 1 - 2 * abs_sin**2
    # The sine of a real 45 degree rotation is not exactly representable, so
    # an exact `!= 1` comparison lets it through and the division below then
    # produces astronomically large sizes. Compare with a tolerance, and raise
    # explicitly so the check is not stripped under `python -O`. The exception
    # type is kept for backward compatibility.
    if np.isclose(denominator, 0.0):
        raise AssertionError("45 degrees angle can't be inverted")

    # 1. Compute the inverse rotation of the rotated bboxes (bigger than the original)
    invrot_box = rot_tfm.inverse().apply_box(rotated_box)
    h, w = rotated_box[:, 3] - rotated_box[:, 1], rotated_box[:, 2] - rotated_box[:, 0]
    ih, iw = invrot_box[:, 3] - invrot_box[:, 1], invrot_box[:, 2] - invrot_box[:, 0]
    # 2. Inverse the corresponding computation in the rotation transform
    #    to get the original height/width of the rotated boxes
    orig_h = (h * abs_cos - w * abs_sin) / denominator
    orig_w = (w * abs_cos - h * abs_sin) / denominator
    # 3. Resize the inverse-rotated bboxes to their original size (shrink
    #    symmetrically around the box centre)
    invrot_box[:, 0] += (iw - orig_w) / 2
    invrot_box[:, 1] += (ih - orig_h) / 2
    invrot_box[:, 2] -= (iw - orig_w) / 2
    invrot_box[:, 3] -= (ih - orig_h) / 2

    return invrot_box
Only a limited set of specifiers is supported for now: + ::=[] + ::=[] + is a valid identifier + ::= "int" | "str" + ::= "=" + ::= "," + ::= ":" + ::= | + ::= + ::= "-" + is a string without spaces and special symbols + (e.g. , , , ) + """ + + _SPEC_DELIM = "," + _TYPE_DELIM = ":" + _RANGE_DELIM = "-" + _EQUAL = "=" + _ERROR_PREFIX = "Invalid field selector specifier" + + class _FieldEntryValuePredicate: + """ + Predicate that checks strict equality for the specified entry field + """ + + def __init__(self, name: str, typespec: Optional[str], value: str): + import builtins + + self.name = name + self.type = getattr(builtins, typespec) if typespec is not None else str + self.value = value + + def __call__(self, entry): + return entry[self.name] == self.type(self.value) + + class _FieldEntryRangePredicate: + """ + Predicate that checks whether an entry field falls into the specified range + """ + + def __init__(self, name: str, typespec: Optional[str], vmin: str, vmax: str): + import builtins + + self.name = name + self.type = getattr(builtins, typespec) if typespec is not None else str + self.vmin = vmin + self.vmax = vmax + + def __call__(self, entry): + return (entry[self.name] >= self.type(self.vmin)) and ( + entry[self.name] <= self.type(self.vmax) + ) + + def __init__(self, spec: str): + self._predicates = self._parse_specifier_into_predicates(spec) + + def __call__(self, entry: Dict[str, Any]): + for predicate in self._predicates: + if not predicate(entry): + return False + return True + + def _parse_specifier_into_predicates(self, spec: str): + predicates = [] + specs = spec.split(self._SPEC_DELIM) + for subspec in specs: + eq_idx = subspec.find(self._EQUAL) + if eq_idx > 0: + field_name_with_type = subspec[:eq_idx] + field_name, field_type = self._parse_field_name_type(field_name_with_type) + field_value_or_range = subspec[eq_idx + 1 :] + if self._is_range_spec(field_value_or_range): + vmin, vmax = self._get_range_spec(field_value_or_range) + predicate = 
FieldEntrySelector._FieldEntryRangePredicate( + field_name, field_type, vmin, vmax + ) + else: + predicate = FieldEntrySelector._FieldEntryValuePredicate( + field_name, field_type, field_value_or_range + ) + predicates.append(predicate) + elif eq_idx == 0: + self._parse_error(f'"{subspec}", field name is empty!') + else: + self._parse_error(f'"{subspec}", should have format ' "=!") + return predicates + + def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]: + type_delim_idx = field_name_with_type.find(self._TYPE_DELIM) + if type_delim_idx > 0: + field_name = field_name_with_type[:type_delim_idx] + field_type = field_name_with_type[type_delim_idx + 1 :] + elif type_delim_idx == 0: + self._parse_error(f'"{field_name_with_type}", field name is empty!') + else: + field_name = field_name_with_type + field_type = None + # pyre-fixme[61]: `field_name` may not be initialized here. + # pyre-fixme[61]: `field_type` may not be initialized here. + return field_name, field_type + + def _is_range_spec(self, field_value_or_range): + delim_idx = field_value_or_range.find(self._RANGE_DELIM) + return delim_idx > 0 + + def _get_range_spec(self, field_value_or_range): + if self._is_range_spec(field_value_or_range): + delim_idx = field_value_or_range.find(self._RANGE_DELIM) + vmin = field_value_or_range[:delim_idx] + vmax = field_value_or_range[delim_idx + 1 :] + return vmin, vmax + else: + self._parse_error('"field_value_or_range", range of values expected!') + + def _parse_error(self, msg): + raise ValueError(f"{self._ERROR_PREFIX}: {msg}") diff --git a/Leffa/densepose/utils/logger.py b/Leffa/densepose/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..70cd3cb0eb0fc7495b1a4b50a05725a0e5b1baba --- /dev/null +++ b/Leffa/densepose/utils/logger.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def verbosity_to_level(verbosity) -> int:
    """
    Map a verbosity count to a `logging` level.

    None and 0 give WARNING, 1 gives INFO, 2 or more gives DEBUG; any other
    value (e.g. a negative number) falls back to WARNING.
    """
    if verbosity is None or verbosity == 0:
        return logging.WARNING
    if verbosity == 1:
        return logging.INFO
    return logging.DEBUG if verbosity >= 2 else logging.WARNING
(https://github.com/facebookresearch/detectron2). + These codes are modified from https://github.com/facebookresearch/detectron2/tree/main/projects/DensePose. + The checkpoint is downloaded from https://github.com/facebookresearch/detectron2/blob/main/projects/DensePose/doc/DENSEPOSE_IUV.md#ModelZoo. + + We use the model R_50_FPN_s1x with id 165712039, but other models should also work. + The config file is downloaded from https://github.com/facebookresearch/detectron2/tree/main/projects/DensePose/configs. + Noted that the config file should match the model checkpoint and Base-DensePose-RCNN-FPN.yaml is also needed. + """ + + def __init__(self, model_path="./checkpoints/densepose_", device="cuda"): + self.device = device + self.config_path = os.path.join(model_path, "densepose_rcnn_R_50_FPN_s1x.yaml") + self.model_path = os.path.join(model_path, "model_final_162be9.pkl") + self.visualizations = ["dp_segm"] + self.VISUALIZERS = {"dp_segm": DensePoseResultsFineSegmentationVisualizer} + self.min_score = 0.8 + + self.cfg = self.setup_config() + self.predictor = DefaultPredictor(self.cfg) + self.predictor.model.to(self.device) + + def setup_config(self): + opts = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", str(self.min_score)] + cfg = get_cfg() + add_densepose_config(cfg) + cfg.merge_from_file(self.config_path) + cfg.merge_from_list(opts) + cfg.MODEL.WEIGHTS = self.model_path + cfg.freeze() + return cfg + + @staticmethod + def _get_input_file_list(input_spec: str): + if os.path.isdir(input_spec): + file_list = [ + os.path.join(input_spec, fname) + for fname in os.listdir(input_spec) + if os.path.isfile(os.path.join(input_spec, fname)) + ] + elif os.path.isfile(input_spec): + file_list = [input_spec] + else: + file_list = glob.glob(input_spec) + return file_list + + def create_context(self, cfg, output_path): + vis_specs = self.visualizations + visualizers = [] + extractors = [] + for vis_spec in vis_specs: + texture_atlas = texture_atlases_dict = None + vis = 
self.VISUALIZERS[vis_spec]( + cfg=cfg, + texture_atlas=texture_atlas, + texture_atlases_dict=texture_atlases_dict, + alpha=1.0, + ) + visualizers.append(vis) + extractor = create_extractor(vis) + extractors.append(extractor) + visualizer = CompoundVisualizer(visualizers) + extractor = CompoundExtractor(extractors) + context = { + "extractor": extractor, + "visualizer": visualizer, + "out_fname": output_path, + "entry_idx": 0, + } + return context + + def execute_on_outputs(self, context, entry, outputs): + extractor = context["extractor"] + + data = extractor(outputs) + + H, W, _ = entry["image"].shape + result = np.zeros((H, W), dtype=np.uint8) + + data, box = data[0] + x, y, w, h = [int(_) for _ in box[0].cpu().numpy()] + i_array = data[0].labels[None].cpu().numpy()[0] + result[y : y + h, x : x + w] = i_array + result = Image.fromarray(result) + result.save(context["out_fname"]) + + def __call__(self, image_or_path, resize=512) -> Image.Image: + """ + :param image_or_path: Path of the input image. + :param resize: Resize the input image if its max size is larger than this value. + :return: Dense pose image. + """ + # random tmp path with timestamp + tmp_path = f"./densepose_/tmp/" + if not os.path.exists(tmp_path): + os.makedirs(tmp_path) + + image_path = os.path.join( + tmp_path, f"{int(time.time())}-{self.device}-{randint(0, 100000)}.png" + ) + if isinstance(image_or_path, str): + assert image_or_path.split(".")[-1] in [ + "jpg", + "png", + ], "Only support jpg and png images." + shutil.copy(image_or_path, image_path) + elif isinstance(image_or_path, Image.Image): + image_or_path.save(image_path) + else: + shutil.rmtree(tmp_path) + raise TypeError("image_path must be str or PIL.Image.Image") + + output_path = image_path.replace(".png", "_dense.png").replace( + ".jpg", "_dense.png" + ) + w, h = Image.open(image_path).size + + file_list = self._get_input_file_list(image_path) + assert len(file_list), "No input images found!" 
class DensePosePredictor(object):
    """Thin wrapper that runs a DensePose R-CNN and post-processes its output.

    The constructor builds a ``DefaultPredictor`` from a DensePose config and
    checkpoint; the ``predict*`` methods accept either an image array or a
    path to an image file.
    """

    def __init__(self,
                 config_path="./ckpts/densepose/densepose_rcnn_R_50_FPN_s1x.yaml",
                 weights_path="./ckpts/densepose/model_final_162be9.pkl"
                 ):
        """Load the config at *config_path* and the weights at *weights_path*."""
        cfg = get_cfg()
        add_densepose_config(cfg)
        # Use the path to the config file from densepose
        cfg.merge_from_file(config_path)
        # Use the path to the pre-trained model weights
        cfg.MODEL.WEIGHTS = weights_path
        cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # Adjust as needed
        self.predictor = DefaultPredictor(cfg)
        self.extractor = DensePoseResultExtractor()
        self.visualizer = Visualizer()

    @staticmethod
    def _load_image(image):
        """Return *image* as an array, reading it from disk when it is a path.

        Raises:
            ValueError: if *image* is a path that cannot be read as an image.
        """
        if not isinstance(image, str):
            return image
        # cv2 is not imported at module level (only under ``__main__``), so
        # import it here; otherwise passing a path raises NameError whenever
        # this module is imported rather than run as a script.
        import cv2

        loaded = cv2.imread(image)
        if loaded is None:
            raise ValueError(f"Could not read image: {image}")
        return loaded

    def predict(self, image):
        """Run the predictor on *image* (array or path) and extract the results.

        Returns:
            Whatever ``self.extractor`` returns for the predicted instances.
        """
        image = self._load_image(image)
        with torch.no_grad():
            outputs = self.predictor(image)["instances"]
        return self.extractor(outputs)

    def predict_iuv(self, image):
        """Return an image-sized array holding the first detection's I/U/V maps.

        The UV maps are min-max normalised to 0..255, stacked behind the
        label map, pasted into a zero array shaped like the input image at
        the first detection's box, and the last two channels are swapped
        (channel order ``[0, 2, 1]``).
        """
        # Load first so that ``image.shape`` below also works for a path.
        image = self._load_image(image)
        outputs = self.predict(image)

        img_i = outputs[0][0].labels[None, ...]
        img_uv = outputs[0][0].uv
        img_uv = (img_uv - img_uv.min()) / (img_uv.max() - img_uv.min())
        img_uv *= 255
        img_iuv = torch.cat([img_i, img_uv], dim=0)
        img_iuv = img_iuv.permute(1, 2, 0)
        img_iuv = img_iuv.cpu().numpy()

        position = [int(x) for x in outputs[1][0].cpu().numpy().tolist()]
        x1, y1, w, h = position
        x2 = x1 + w
        y2 = y1 + h
        image_iuv = np.zeros(image.shape, dtype=image.dtype)
        image_iuv[y1:y2, x1:x2, :] = img_iuv
        image_iuv = image_iuv[:, :, [0, 2, 1]]

        return image_iuv

    def predict_seg(self, image):
        """Return the visualizer's rendering of the prediction on a blank canvas.

        The canvas is a zero array with the shape and dtype of the input image.
        """
        # Load first so that ``image.shape`` below also works for a path.
        image = self._load_image(image)
        outputs = self.predict(image)

        image_seg = np.zeros(image.shape, dtype=image.dtype)
        self.visualizer.visualize(image_seg, outputs)

        return image_seg
+from PIL import Image +from SCHP import SCHP # type: ignore + +from leffa_utils.densepose_for_mask import DensePose # type: ignore + +DENSE_INDEX_MAP = { + "background": [0], + "torso": [1, 2], + "right hand": [3], + "left hand": [4], + "right foot": [5], + "left foot": [6], + "right thigh": [7, 9], + "left thigh": [8, 10], + "right leg": [11, 13], + "left leg": [12, 14], + "left big arm": [15, 17], + "right big arm": [16, 18], + "left forearm": [19, 21], + "right forearm": [20, 22], + "face": [23, 24], + "thighs": [7, 8, 9, 10], + "legs": [11, 12, 13, 14], + "hands": [3, 4], + "feet": [5, 6], + "big arms": [15, 16, 17, 18], + "forearms": [19, 20, 21, 22], +} + +ATR_MAPPING = { + "Background": 0, + "Hat": 1, + "Hair": 2, + "Sunglasses": 3, + "Upper-clothes": 4, + "Skirt": 5, + "Pants": 6, + "Dress": 7, + "Belt": 8, + "Left-shoe": 9, + "Right-shoe": 10, + "Face": 11, + "Left-leg": 12, + "Right-leg": 13, + "Left-arm": 14, + "Right-arm": 15, + "Bag": 16, + "Scarf": 17, +} + +LIP_MAPPING = { + "Background": 0, + "Hat": 1, + "Hair": 2, + "Glove": 3, + "Sunglasses": 4, + "Upper-clothes": 5, + "Dress": 6, + "Coat": 7, + "Socks": 8, + "Pants": 9, + "Jumpsuits": 10, + "Scarf": 11, + "Skirt": 12, + "Face": 13, + "Left-arm": 14, + "Right-arm": 15, + "Left-leg": 16, + "Right-leg": 17, + "Left-shoe": 18, + "Right-shoe": 19, +} + +PROTECT_BODY_PARTS = { + "upper": ["Left-leg", "Right-leg"], + "lower": ["Right-arm", "Left-arm", "Face"], + "overall": [], + "inner": ["Left-leg", "Right-leg"], + "outer": ["Left-leg", "Right-leg"], +} +PROTECT_CLOTH_PARTS = { + "upper": {"ATR": ["Skirt", "Pants"], "LIP": ["Skirt", "Pants"]}, + "lower": {"ATR": ["Upper-clothes"], "LIP": ["Upper-clothes", "Coat"]}, + "overall": {"ATR": [], "LIP": []}, + "inner": { + "ATR": ["Dress", "Coat", "Skirt", "Pants"], + "LIP": ["Dress", "Coat", "Skirt", "Pants", "Jumpsuits"], + }, + "outer": { + "ATR": ["Dress", "Pants", "Skirt"], + "LIP": ["Upper-clothes", "Dress", "Pants", "Skirt", "Jumpsuits"], + }, +} 
def vis_mask(image, mask):
    """Return *image* with the masked region blacked out.

    *mask* is binarised at 127 (values above it count as masked), given a
    trailing axis repeated three times, and every masked pixel of *image* is
    multiplied by zero while the rest is kept unchanged.
    """
    pixels = np.array(image).astype(np.uint8)
    binary = np.where(np.array(mask).astype(np.uint8) > 127, 255, 0).astype(np.uint8)
    # 1.0 where the pixel is kept, 0.0 where it is masked out.
    keep = 1 - np.repeat(binary[..., np.newaxis], 3, axis=-1) / 255
    return Image.fromarray((pixels * keep).astype(np.uint8))
@staticmethod
def cloth_agnostic_mask(
    densepose_mask: Image.Image,
    schp_lip_mask: Image.Image,
    schp_atr_mask: Image.Image,
    part: str = "overall",
    **kwargs,
):
    """Compute the garment-agnostic mask for one clothing region.

    Combines a DensePose part map with two human-parsing maps (LIP and ATR
    label sets) into a binary mask of the area to repaint:

    1. "Strong" protected areas (hands/feet confirmed by the parsers, face)
       are never masked.
    2. "Weak" protected areas (other body parts, unrelated clothes, hair,
       accessories) are excluded before and after the convex-hull expansion.
    3. The candidate area is everything that is neither weakly protected nor
       background, plus the dilated DensePose body parts for ``part``; it is
       expanded to its convex hull, blurred, re-thresholded, unioned with the
       garment pixels themselves and finally dilated.

    Args:
        densepose_mask: DensePose part-index image; its size defines the
            working resolution.
        schp_lip_mask: Parsing map using the LIP label indices.
        schp_atr_mask: Parsing map using the ATR label indices.
        part: One of ``"upper"``, ``"lower"``, ``"overall"``, ``"inner"``,
            ``"outer"``.
        **kwargs: Accepted and ignored.

    Returns:
        A PIL image holding 255 inside the mask and 0 elsewhere.

    Raises:
        AssertionError: if ``part`` is not one of the supported values.
    """
    assert part in [
        "upper",
        "lower",
        "overall",
        "inner",
        "outer",
    ], f"part should be one of ['upper', 'lower', 'overall', 'inner', 'outer'], but got {part}"
    w, h = densepose_mask.size

    # Odd-sized kernels that scale with the image resolution.
    dilate_kernel = max(w, h) // 250
    dilate_kernel = dilate_kernel if dilate_kernel % 2 == 1 else dilate_kernel + 1
    dilate_kernel = np.ones((dilate_kernel, dilate_kernel), np.uint8)

    kernal_size = max(w, h) // 25
    kernal_size = kernal_size if kernal_size % 2 == 1 else kernal_size + 1

    densepose_mask = np.array(densepose_mask)
    schp_lip_mask = np.array(schp_lip_mask)
    schp_atr_mask = np.array(schp_atr_mask)

    # Strong Protect Area (Hands, Face, Accessory, Feet)
    limb_parts = ["Left-arm", "Right-arm", "Left-leg", "Right-leg"]
    hands_protect_area = part_mask_of(
        ["hands", "feet"], densepose_mask, DENSE_INDEX_MAP
    )
    hands_protect_area = cv2.dilate(hands_protect_area, dilate_kernel, iterations=1)
    # Keep only the dilated hand/foot pixels a parser also labels as limb.
    hands_protect_area = hands_protect_area & (
        part_mask_of(limb_parts, schp_atr_mask, ATR_MAPPING)
        | part_mask_of(limb_parts, schp_lip_mask, LIP_MAPPING)
    )
    face_protect_area = part_mask_of("Face", schp_lip_mask, LIP_MAPPING)

    strong_protect_area = hands_protect_area | face_protect_area

    # Weak Protect Area (Hair, Irrelevant Clothes, Body Parts)
    body_protect_area = part_mask_of(
        PROTECT_BODY_PARTS[part], schp_lip_mask, LIP_MAPPING
    ) | part_mask_of(PROTECT_BODY_PARTS[part], schp_atr_mask, ATR_MAPPING)
    hair_protect_area = part_mask_of(
        ["Hair"], schp_lip_mask, LIP_MAPPING
    ) | part_mask_of(["Hair"], schp_atr_mask, ATR_MAPPING)
    cloth_protect_area = part_mask_of(
        PROTECT_CLOTH_PARTS[part]["LIP"], schp_lip_mask, LIP_MAPPING
    ) | part_mask_of(PROTECT_CLOTH_PARTS[part]["ATR"], schp_atr_mask, ATR_MAPPING)
    accessory_parts = [
        "Hat",
        "Glove",
        "Sunglasses",
        "Bag",
        "Left-shoe",
        "Right-shoe",
        "Scarf",
        "Socks",
    ]
    accessory_protect_area = part_mask_of(
        accessory_parts, schp_lip_mask, LIP_MAPPING
    ) | part_mask_of(accessory_parts, schp_atr_mask, ATR_MAPPING)
    weak_protect_area = (
        body_protect_area
        | cloth_protect_area
        | hair_protect_area
        | strong_protect_area
        | accessory_protect_area
    )

    # Mask Area
    strong_mask_area = part_mask_of(
        MASK_CLOTH_PARTS[part], schp_lip_mask, LIP_MAPPING
    ) | part_mask_of(MASK_CLOTH_PARTS[part], schp_atr_mask, ATR_MAPPING)
    background_area = part_mask_of(
        ["Background"], schp_lip_mask, LIP_MAPPING
    ) & part_mask_of(["Background"], schp_atr_mask, ATR_MAPPING)
    mask_dense_area = part_mask_of(
        MASK_DENSE_PARTS[part], densepose_mask, DENSE_INDEX_MAP
    )
    # Dilate at quarter resolution so the small kernel covers a larger area.
    mask_dense_area = cv2.resize(
        mask_dense_area.astype(np.uint8),
        None,
        fx=0.25,
        fy=0.25,
        interpolation=cv2.INTER_NEAREST,
    )
    mask_dense_area = cv2.dilate(mask_dense_area, dilate_kernel, iterations=2)
    # Scale back to the exact original size. Using fx=fy=4 here only restores
    # the shape when width and height are multiples of 4 (the downscaled size
    # is rounded); any other size made the `|` below fail on a shape mismatch.
    mask_dense_area = cv2.resize(
        mask_dense_area.astype(np.uint8),
        (w, h),
        interpolation=cv2.INTER_NEAREST,
    )

    mask_area = (
        np.ones_like(densepose_mask) & (~weak_protect_area) & (~background_area)
    ) | mask_dense_area

    mask_area = (
        hull_mask(mask_area * 255) // 255
    )  # Convex Hull to expand the mask area
    mask_area = mask_area & (~weak_protect_area)
    mask_area = cv2.GaussianBlur(mask_area * 255, (kernal_size, kernal_size), 0)
    mask_area[mask_area < 25] = 0
    mask_area[mask_area >= 25] = 1
    mask_area = (mask_area | strong_mask_area) & (~strong_protect_area)
    mask_area = cv2.dilate(mask_area, dilate_kernel, iterations=1)

    return Image.fromarray(mask_area * 255)
def resize_and_center(image, target_width, target_height):
    """Fit *image* into a white ``target_width`` x ``target_height`` RGB canvas.

    The image is converted to three channels, scaled uniformly so that it
    fits inside the target size, and pasted centred on a white background.

    Args:
        image: Anything ``np.array`` turns into an H x W, H x W x 1,
            H x W x 3 or H x W x 4 array.
        target_width: Width of the returned image in pixels.
        target_height: Height of the returned image in pixels.

    Returns:
        A PIL RGB image of the target size.
    """
    img = np.array(image)

    # Test the rank before the channel count: for a 2-D grayscale array
    # ``shape[-1]`` is the width, so a 4-pixel-wide image would otherwise be
    # mistaken for RGBA.
    if img.ndim == 2 or img.shape[-1] == 1:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[-1] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

    original_height, original_width = img.shape[:2]

    scale = min(target_height / original_height, target_width / original_width)
    # Truncation can reach 0 for extreme aspect ratios, which cv2.resize
    # rejects; keep at least one pixel per side.
    new_height = max(1, int(original_height * scale))
    new_width = max(1, int(original_width * scale))

    resized_img = cv2.resize(img, (new_width, new_height),
                             interpolation=cv2.INTER_CUBIC)

    padded_img = np.ones((target_height, target_width, 3),
                         dtype=np.uint8) * 255

    top = (target_height - new_height) // 2
    left = (target_width - new_width) // 2

    padded_img[top:top + new_height, left:left + new_width] = resized_img

    return Image.fromarray(padded_img)
def extend_arm_mask(wrist, elbow, scale):
    """Return the point reached by scaling the elbow-to-wrist offset by *scale*.

    ``scale == 1`` gives back the wrist; larger values move the point further
    along the same direction, starting from the elbow.
    """
    forearm = wrist - elbow
    return elbow + scale * forearm
label_map["left_shoe"]).astype(np.float32) + \ + (parse_array == label_map["right_shoe"]).astype(np.float32) + \ + (parse_array == label_map["hat"]).astype(np.float32) + \ + (parse_array == label_map["sunglasses"]).astype(np.float32) + \ + (parse_array == label_map["bag"]).astype(np.float32) + + parser_mask_changeable = ( + parse_array == label_map["background"]).astype(np.float32) + + arms_left = (parse_array == 14).astype(np.float32) + arms_right = (parse_array == 15).astype(np.float32) + + if category == 'dresses': + parse_mask = (parse_array == 7).astype(np.float32) + \ + (parse_array == 4).astype(np.float32) + \ + (parse_array == 5).astype(np.float32) + \ + (parse_array == 6).astype(np.float32) + + parser_mask_changeable += np.logical_and( + parse_array, np.logical_not(parser_mask_fixed)) + + elif category == 'upper_body': + parse_mask = (parse_array == 4).astype(np.float32) + \ + (parse_array == 7).astype(np.float32) + parser_mask_fixed_lower_cloth = (parse_array == label_map["skirt"]).astype(np.float32) + \ + (parse_array == label_map["pants"]).astype( + np.float32) + parser_mask_fixed += parser_mask_fixed_lower_cloth + parser_mask_changeable += np.logical_and( + parse_array, np.logical_not(parser_mask_fixed)) + elif category == 'lower_body': + parse_mask = (parse_array == 6).astype(np.float32) + \ + (parse_array == 12).astype(np.float32) + \ + (parse_array == 13).astype(np.float32) + \ + (parse_array == 5).astype(np.float32) + parser_mask_fixed += (parse_array == label_map["upper_clothes"]).astype(np.float32) + \ + (parse_array == 14).astype(np.float32) + \ + (parse_array == 15).astype(np.float32) + parser_mask_changeable += np.logical_and( + parse_array, np.logical_not(parser_mask_fixed)) + else: + raise NotImplementedError + + # Load pose points + pose_data = keypoint["pose_keypoints_2d"] + pose_data = np.array(pose_data) + pose_data = pose_data.reshape((-1, 2)) + + im_arms_left = Image.new('L', (width, height)) + im_arms_right = Image.new('L', (width, 
height)) + arms_draw_left = ImageDraw.Draw(im_arms_left) + arms_draw_right = ImageDraw.Draw(im_arms_right) + if category == 'dresses' or category == 'upper_body': + shoulder_right = np.multiply(tuple(pose_data[2][:2]), height / 512.0) + shoulder_left = np.multiply(tuple(pose_data[5][:2]), height / 512.0) + elbow_right = np.multiply(tuple(pose_data[3][:2]), height / 512.0) + elbow_left = np.multiply(tuple(pose_data[6][:2]), height / 512.0) + wrist_right = np.multiply(tuple(pose_data[4][:2]), height / 512.0) + wrist_left = np.multiply(tuple(pose_data[7][:2]), height / 512.0) + ARM_LINE_WIDTH = int(arm_width / 512 * height) + size_left = [shoulder_left[0] - ARM_LINE_WIDTH // 2, shoulder_left[1] - ARM_LINE_WIDTH // + 2, shoulder_left[0] + ARM_LINE_WIDTH // 2, shoulder_left[1] + ARM_LINE_WIDTH // 2] + size_right = [shoulder_right[0] - ARM_LINE_WIDTH // 2, shoulder_right[1] - ARM_LINE_WIDTH // 2, shoulder_right[0] + ARM_LINE_WIDTH // 2, + shoulder_right[1] + ARM_LINE_WIDTH // 2] + + if wrist_right[0] <= 1. and wrist_right[1] <= 1.: + im_arms_right = arms_right + else: + wrist_right = extend_arm_mask(wrist_right, elbow_right, 1.2) + arms_draw_right.line(np.concatenate((shoulder_right, elbow_right, wrist_right)).astype( + np.uint16).tolist(), 'white', ARM_LINE_WIDTH, 'curve') + arms_draw_right.arc(size_right, 0, 360, + 'white', ARM_LINE_WIDTH // 2) + + if wrist_left[0] <= 1. 
def get_agnostic_mask_dc(model_parse, keypoint, category, size=(384, 512)):
    """Build the garment-agnostic inpainting mask from a parsing map and pose.

    Regions that must be kept ("fixed": hair, shoes, hat, sunglasses, scarf,
    bag, plus category-specific clothes/limbs and the head above the
    shoulder line) are separated from the garment region to repaint. For
    ``"dresses"`` and ``"upper_body"`` the arms are additionally drawn from
    the pose keypoints and the neck below the shoulder line is released for
    repainting.

    Args:
        model_parse: Parsing map whose values are indices of ``label_map``.
            NOTE(review): it is not resized here, so it presumably already
            has ``size`` resolution -- confirm against the caller.
        keypoint: Mapping with a ``"pose_keypoints_2d"`` entry that reshapes
            to ``(-1, 2)``; rows 2/5 are used as right/left shoulder, 3/6 as
            elbows and 4/7 as wrists.
        category: ``"dresses"``, ``"upper_body"`` or ``"lower_body"``.
        size: ``(width, height)`` of the working canvas.

    Returns:
        A PIL image with 255 where the image should be repainted, 0 elsewhere.

    Raises:
        NotImplementedError: if ``category`` is not one of the three
            supported values.
    """
    parse_array = np.array(model_parse)
    pose_data = keypoint["pose_keypoints_2d"]
    pose_data = np.array(pose_data)
    pose_data = pose_data.reshape((-1, 2))

    # hat, hair, sunglasses, head and neck
    parse_head = (parse_array == 1).astype(np.float32) + \
        (parse_array == 2).astype(np.float32) + \
        (parse_array == 3).astype(np.float32) + \
        (parse_array == 11).astype(np.float32) + \
        (parse_array == 18).astype(np.float32)

    parser_mask_fixed = (parse_array == label_map["hair"]).astype(np.float32) + \
        (parse_array == label_map["left_shoe"]).astype(np.float32) + \
        (parse_array == label_map["right_shoe"]).astype(np.float32) + \
        (parse_array == label_map["hat"]).astype(np.float32) + \
        (parse_array == label_map["sunglasses"]).astype(np.float32) + \
        (parse_array == label_map["scarf"]).astype(np.float32) + \
        (parse_array == label_map["bag"]).astype(np.float32)

    parser_mask_changeable = (
        parse_array == label_map["background"]).astype(np.float32)

    # left arm + right arm
    arms = (parse_array == 14).astype(np.float32) + \
        (parse_array == 15).astype(np.float32)

    if category == 'dresses':
        parse_mask = (parse_array == 7).astype(np.float32) + \
            (parse_array == 12).astype(np.float32) + \
            (parse_array == 13).astype(np.float32)
        parser_mask_changeable += np.logical_and(
            parse_array, np.logical_not(parser_mask_fixed))

    elif category == 'upper_body':
        parse_mask = (parse_array == 4).astype(np.float32)

        parser_mask_fixed += (parse_array == label_map["skirt"]).astype(np.float32) + \
            (parse_array == label_map["pants"]).astype(np.float32)

        parser_mask_changeable += np.logical_and(
            parse_array, np.logical_not(parser_mask_fixed))
    elif category == 'lower_body':
        parse_mask = (parse_array == 6).astype(np.float32) + \
            (parse_array == 12).astype(np.float32) + \
            (parse_array == 13).astype(np.float32)

        parser_mask_fixed += (parse_array == label_map["upper_clothes"]).astype(np.float32) + \
            (parse_array == 14).astype(np.float32) + \
            (parse_array == 15).astype(np.float32)
        parser_mask_changeable += np.logical_and(
            parse_array, np.logical_not(parser_mask_fixed))
    else:
        # Fail here, like get_agnostic_mask_hd does, instead of hitting an
        # unbound ``parse_mask`` a few lines further down.
        raise NotImplementedError

    parse_head = torch.from_numpy(parse_head)  # [0,1]
    parse_mask = torch.from_numpy(parse_mask)  # [0,1]
    parser_mask_fixed = torch.from_numpy(parser_mask_fixed)
    parser_mask_changeable = torch.from_numpy(parser_mask_changeable)

    parse_mask = parse_mask.cpu().numpy()

    width = size[0]
    height = size[1]

    im_arms = Image.new('L', (width, height))
    arms_draw = ImageDraw.Draw(im_arms)
    if category == 'dresses' or category == 'upper_body':
        shoulder_right = tuple(np.multiply(pose_data[2, :2], height / 512.0))
        shoulder_left = tuple(np.multiply(pose_data[5, :2], height / 512.0))
        elbow_right = tuple(np.multiply(pose_data[3, :2], height / 512.0))
        elbow_left = tuple(np.multiply(pose_data[6, :2], height / 512.0))
        wrist_right = tuple(np.multiply(pose_data[4, :2], height / 512.0))
        wrist_left = tuple(np.multiply(pose_data[7, :2], height / 512.0))
        # A joint at (<=1, <=1) is treated as missing and left out of the
        # polyline drawn through the arm joints.
        if wrist_right[0] <= 1. and wrist_right[1] <= 1.:
            if elbow_right[0] <= 1. and elbow_right[1] <= 1.:
                arms_draw.line(
                    [wrist_left, elbow_left, shoulder_left, shoulder_right], 'white', 30, 'curve')
            else:
                arms_draw.line([wrist_left, elbow_left, shoulder_left, shoulder_right, elbow_right], 'white', 30,
                               'curve')
        elif wrist_left[0] <= 1. and wrist_left[1] <= 1.:
            if elbow_left[0] <= 1. and elbow_left[1] <= 1.:
                arms_draw.line([shoulder_left, shoulder_right,
                                elbow_right, wrist_right], 'white', 30, 'curve')
            else:
                arms_draw.line([elbow_left, shoulder_left, shoulder_right, elbow_right, wrist_right], 'white', 30,
                               'curve')
        else:
            arms_draw.line([wrist_left, elbow_left, shoulder_left, shoulder_right, elbow_right, wrist_right], 'white',
                           30, 'curve')

        if height > 512:
            im_arms = cv2.dilate(np.float32(im_arms), np.ones(
                (10, 10), np.uint16), iterations=5)
        elif height > 256:
            im_arms = cv2.dilate(np.float32(im_arms), np.ones(
                (5, 5), np.uint16), iterations=5)
        # Arm pixels from the parser that the drawn arm does not cover are
        # kept as hands.
        hands = np.logical_and(np.logical_not(im_arms), arms)
        parse_mask += im_arms
        parser_mask_fixed += hands

    # delete neck
    parse_head_2 = torch.clone(parse_head)
    if category == 'dresses' or category == 'upper_body':
        points = []
        points.append(np.multiply(pose_data[2, :2], height / 512.0))
        points.append(np.multiply(pose_data[5, :2], height / 512.0))
        x_coords, y_coords = zip(*points)
        # Line through both shoulders: y = m * x + c.
        A = np.vstack([x_coords, np.ones(len(x_coords))]).T
        m, c = lstsq(A, y_coords, rcond=None)[0]
        for i in range(parse_array.shape[1]):
            y = i * m + c
            # Clear the head mask from 20 px (at height 512) above the
            # shoulder line downwards, column by column.
            parse_head_2[int(y - 20 * (height / 512.0)):, i] = 0

    parser_mask_fixed = np.logical_or(
        parser_mask_fixed, np.array(parse_head_2, dtype=np.uint16))
    parse_mask += np.logical_or(parse_mask, np.logical_and(np.array(parse_head, dtype=np.uint16),
                                                           np.logical_not(np.array(parse_head_2, dtype=np.uint16))))

    if height > 512:
        parse_mask = cv2.dilate(parse_mask, np.ones(
            (20, 20), np.uint16), iterations=5)
    elif height > 256:
        parse_mask = cv2.dilate(parse_mask, np.ones(
            (10, 10), np.uint16), iterations=5)
    else:
        parse_mask = cv2.dilate(parse_mask, np.ones(
            (5, 5), np.uint16), iterations=5)
    parse_mask = np.logical_and(
        parser_mask_changeable, np.logical_not(parse_mask))
    parse_mask_total = np.logical_or(parse_mask, parser_mask_fixed)
    inpaint_mask = 1 - parse_mask_total
    img = np.where(inpaint_mask, 255, 0)
    img = hole_fill(img.astype(np.uint8))
    inpaint_mask = img / 255 * 1
    mask = Image.fromarray(inpaint_mask.astype(np.uint8) * 255)
    return mask
architecture will be used. +ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" + +RUN pip install --user -e detectron2_repo + +# Set a fixed model cache directory. +ENV FVCORE_CACHE="/tmp" +WORKDIR /home/appuser/detectron2_repo + +# run detectron2 under user "appuser": +# wget http://images.cocodataset.org/val2017/000000439715.jpg -O input.jpg +# python3 demo/demo.py \ + #--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \ + #--input input.jpg --output outputs/ \ + #--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl diff --git a/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci b/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci new file mode 100644 index 0000000000000000000000000000000000000000..bc0be845adc247eb458d212ae5352c594cd80a72 --- /dev/null +++ b/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/Dockerfile-circleci @@ -0,0 +1,17 @@ +FROM nvidia/cuda:10.1-cudnn7-devel +# This dockerfile only aims to provide an environment for unittest on CircleCI + +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && apt-get install -y \ + python3-opencv ca-certificates python3-dev git wget sudo ninja-build && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -q https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py && \ + rm get-pip.py + +# install dependencies +# See https://pytorch.org/ for other options if you use a different version of CUDA +RUN pip install tensorboard cython +RUN pip install torch==1.5+cu101 torchvision==0.6+cu101 -f https://download.pytorch.org/whl/torch_stable.html +RUN pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' diff --git a/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md 
b/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..760c4054d0e4fa56a67ab4b59c14979498e2f94a --- /dev/null +++ b/Leffa/preprocess/humanparsing/mhp_extension/detectron2/docker/README.md @@ -0,0 +1,36 @@ + +## Use the container (with docker ≥ 19.03) + +``` +cd docker/ +# Build: +docker build --build-arg USER_ID=$UID -t detectron2:v0 . +# Run: +docker run --gpus all -it \ + --shm-size=8gb --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ + --name=detectron2 detectron2:v0 + +# Grant docker access to host X server to show images +xhost +local:`docker inspect --format='{{ .Config.Hostname }}' detectron2` +``` + +## Use the container (with docker < 19.03) + +Install docker-compose and nvidia-docker2, then run: +``` +cd docker && USER_ID=$UID docker-compose run detectron2 +``` + +#### Using a persistent cache directory + +You can prevent models from being re-downloaded on every run, +by storing them in a cache directory. + +To do this, add `--volume=$HOME/.torch/fvcore_cache:/tmp:rw` in the run command. + +## Install new dependencies +Add the following to `Dockerfile` to make persistent changes. +``` +RUN sudo apt-get update && sudo apt-get install -y vim +``` +Or run them in the container to make temporary changes. 
diff --git a/Leffa/preprocess/humanparsing/modules/__init__.py b/Leffa/preprocess/humanparsing/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a098dee5911f3613d320d23db37bc401cf57fa4 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/__init__.py @@ -0,0 +1,5 @@ +from .bn import ABN, InPlaceABN, InPlaceABNSync +from .functions import ACT_RELU, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE +from .misc import GlobalAvgPool2d, SingleGPU +from .residual import IdentityResidualBlock +from .dense import DenseModule diff --git a/Leffa/preprocess/humanparsing/modules/bn.py b/Leffa/preprocess/humanparsing/modules/bn.py new file mode 100644 index 0000000000000000000000000000000000000000..a794698867e89140a030d550d832e6fa12561c8b --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/bn.py @@ -0,0 +1,132 @@ +import torch +import torch.nn as nn +import torch.nn.functional as functional + +try: + from queue import Queue +except ImportError: + from Queue import Queue + +from .functions import * + + +class ABN(nn.Module): + """Activated Batch Normalization + + This gathers a `BatchNorm2d` and an activation function in a single module + """ + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01): + """Creates an Activated Batch Normalization module + + Parameters + ---------- + num_features : int + Number of feature channels in the input and output. + eps : float + Small constant to prevent numerical issues. + momentum : float + Momentum factor applied to compute running statistics as. + affine : bool + If `True` apply learned scale and shift transformation after normalization. + activation : str + Name of the activation functions, one of: `leaky_relu`, `elu` or `none`. + slope : float + Negative slope for the `leaky_relu` activation. 
+ """ + super(ABN, self).__init__() + self.num_features = num_features + self.affine = affine + self.eps = eps + self.momentum = momentum + self.activation = activation + self.slope = slope + if self.affine: + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.constant_(self.running_mean, 0) + nn.init.constant_(self.running_var, 1) + if self.affine: + nn.init.constant_(self.weight, 1) + nn.init.constant_(self.bias, 0) + + def forward(self, x): + x = functional.batch_norm(x, self.running_mean, self.running_var, self.weight, self.bias, + self.training, self.momentum, self.eps) + + if self.activation == ACT_RELU: + return functional.relu(x, inplace=True) + elif self.activation == ACT_LEAKY_RELU: + return functional.leaky_relu(x, negative_slope=self.slope, inplace=True) + elif self.activation == ACT_ELU: + return functional.elu(x, inplace=True) + else: + return x + + def __repr__(self): + rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \ + ' affine={affine}, activation={activation}' + if self.activation == "leaky_relu": + rep += ', slope={slope})' + else: + rep += ')' + return rep.format(name=self.__class__.__name__, **self.__dict__) + + +class InPlaceABN(ABN): + """InPlace Activated Batch Normalization""" + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, activation="leaky_relu", slope=0.01): + """Creates an InPlace Activated Batch Normalization module + + Parameters + ---------- + num_features : int + Number of feature channels in the input and output. + eps : float + Small constant to prevent numerical issues. + momentum : float + Momentum factor applied to compute running statistics as. 
+ affine : bool + If `True` apply learned scale and shift transformation after normalization. + activation : str + Name of the activation functions, one of: `leaky_relu`, `elu` or `none`. + slope : float + Negative slope for the `leaky_relu` activation. + """ + super(InPlaceABN, self).__init__(num_features, eps, momentum, affine, activation, slope) + + def forward(self, x): + x, _, _ = inplace_abn(x, self.weight, self.bias, self.running_mean, self.running_var, + self.training, self.momentum, self.eps, self.activation, self.slope) + return x + + +class InPlaceABNSync(ABN): + """InPlace Activated Batch Normalization with cross-GPU synchronization + This assumes that it will be replicated across GPUs using the same mechanism as in `nn.DistributedDataParallel`. + """ + + def forward(self, x): + x, _, _ = inplace_abn_sync(x, self.weight, self.bias, self.running_mean, self.running_var, + self.training, self.momentum, self.eps, self.activation, self.slope) + return x + + def __repr__(self): + rep = '{name}({num_features}, eps={eps}, momentum={momentum},' \ + ' affine={affine}, activation={activation}' + if self.activation == "leaky_relu": + rep += ', slope={slope})' + else: + rep += ')' + return rep.format(name=self.__class__.__name__, **self.__dict__) + + diff --git a/Leffa/preprocess/humanparsing/modules/deeplab.py b/Leffa/preprocess/humanparsing/modules/deeplab.py new file mode 100644 index 0000000000000000000000000000000000000000..fd25b78369b27ef02c183a0b17b9bf8354c5f7c3 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/deeplab.py @@ -0,0 +1,84 @@ +import torch +import torch.nn as nn +import torch.nn.functional as functional + +from models._util import try_index +from .bn import ABN + + +class DeeplabV3(nn.Module): + def __init__(self, + in_channels, + out_channels, + hidden_channels=256, + dilations=(12, 24, 36), + norm_act=ABN, + pooling_size=None): + super(DeeplabV3, self).__init__() + self.pooling_size = pooling_size + + self.map_convs = nn.ModuleList([ + 
nn.Conv2d(in_channels, hidden_channels, 1, bias=False), + nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[0], padding=dilations[0]), + nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[1], padding=dilations[1]), + nn.Conv2d(in_channels, hidden_channels, 3, bias=False, dilation=dilations[2], padding=dilations[2]) + ]) + self.map_bn = norm_act(hidden_channels * 4) + + self.global_pooling_conv = nn.Conv2d(in_channels, hidden_channels, 1, bias=False) + self.global_pooling_bn = norm_act(hidden_channels) + + self.red_conv = nn.Conv2d(hidden_channels * 4, out_channels, 1, bias=False) + self.pool_red_conv = nn.Conv2d(hidden_channels, out_channels, 1, bias=False) + self.red_bn = norm_act(out_channels) + + self.reset_parameters(self.map_bn.activation, self.map_bn.slope) + + def reset_parameters(self, activation, slope): + gain = nn.init.calculate_gain(activation, slope) + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.xavier_normal_(m.weight.data, gain) + if hasattr(m, "bias") and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, ABN): + if hasattr(m, "weight") and m.weight is not None: + nn.init.constant_(m.weight, 1) + if hasattr(m, "bias") and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + # Map convolutions + out = torch.cat([m(x) for m in self.map_convs], dim=1) + out = self.map_bn(out) + out = self.red_conv(out) + + # Global pooling + pool = self._global_pooling(x) + pool = self.global_pooling_conv(pool) + pool = self.global_pooling_bn(pool) + pool = self.pool_red_conv(pool) + if self.training or self.pooling_size is None: + pool = pool.repeat(1, 1, x.size(2), x.size(3)) + + out += pool + out = self.red_bn(out) + return out + + def _global_pooling(self, x): + if self.training or self.pooling_size is None: + pool = x.view(x.size(0), x.size(1), -1).mean(dim=-1) + pool = pool.view(x.size(0), x.size(1), 1, 1) + else: + pooling_size = 
(min(try_index(self.pooling_size, 0), x.shape[2]), + min(try_index(self.pooling_size, 1), x.shape[3])) + padding = ( + (pooling_size[1] - 1) // 2, + (pooling_size[1] - 1) // 2 if pooling_size[1] % 2 == 1 else (pooling_size[1] - 1) // 2 + 1, + (pooling_size[0] - 1) // 2, + (pooling_size[0] - 1) // 2 if pooling_size[0] % 2 == 1 else (pooling_size[0] - 1) // 2 + 1 + ) + + pool = functional.avg_pool2d(x, pooling_size, stride=1) + pool = functional.pad(pool, pad=padding, mode="replicate") + return pool diff --git a/Leffa/preprocess/humanparsing/modules/dense.py b/Leffa/preprocess/humanparsing/modules/dense.py new file mode 100644 index 0000000000000000000000000000000000000000..9638d6e86d2ae838550fefa9002a984af52e6cc8 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/dense.py @@ -0,0 +1,42 @@ +from collections import OrderedDict + +import torch +import torch.nn as nn + +from .bn import ABN + + +class DenseModule(nn.Module): + def __init__(self, in_channels, growth, layers, bottleneck_factor=4, norm_act=ABN, dilation=1): + super(DenseModule, self).__init__() + self.in_channels = in_channels + self.growth = growth + self.layers = layers + + self.convs1 = nn.ModuleList() + self.convs3 = nn.ModuleList() + for i in range(self.layers): + self.convs1.append(nn.Sequential(OrderedDict([ + ("bn", norm_act(in_channels)), + ("conv", nn.Conv2d(in_channels, self.growth * bottleneck_factor, 1, bias=False)) + ]))) + self.convs3.append(nn.Sequential(OrderedDict([ + ("bn", norm_act(self.growth * bottleneck_factor)), + ("conv", nn.Conv2d(self.growth * bottleneck_factor, self.growth, 3, padding=dilation, bias=False, + dilation=dilation)) + ]))) + in_channels += self.growth + + @property + def out_channels(self): + return self.in_channels + self.growth * self.layers + + def forward(self, x): + inputs = [x] + for i in range(self.layers): + x = torch.cat(inputs, dim=1) + x = self.convs1[i](x) + x = self.convs3[i](x) + inputs += [x] + + return torch.cat(inputs, dim=1) diff --git 
a/Leffa/preprocess/humanparsing/modules/functions.py b/Leffa/preprocess/humanparsing/modules/functions.py new file mode 100644 index 0000000000000000000000000000000000000000..4b2837260687dde56d4595b24aded5fddbc4bda8 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/functions.py @@ -0,0 +1,245 @@ +import pdb +from os import path +import torch +import torch.distributed as dist +import torch.autograd as autograd +import torch.cuda.comm as comm +from torch.autograd.function import once_differentiable +from torch.utils.cpp_extension import load + +_src_path = path.join(path.dirname(path.abspath(__file__)), "src") +_backend = load(name="inplace_abn", + extra_cflags=["-O3"], + sources=[path.join(_src_path, f) for f in [ + "inplace_abn.cpp", + "inplace_abn_cpu.cpp", + "inplace_abn_cuda.cu", + "inplace_abn_cuda_half.cu" + ]], + extra_cuda_cflags=["--expt-extended-lambda"]) + +# Activation names +ACT_RELU = "relu" +ACT_LEAKY_RELU = "leaky_relu" +ACT_ELU = "elu" +ACT_NONE = "none" + + +def _check(fn, *args, **kwargs): + success = fn(*args, **kwargs) + if not success: + raise RuntimeError("CUDA Error encountered in {}".format(fn)) + + +def _broadcast_shape(x): + out_size = [] + for i, s in enumerate(x.size()): + if i != 1: + out_size.append(1) + else: + out_size.append(s) + return out_size + + +def _reduce(x): + if len(x.size()) == 2: + return x.sum(dim=0) + else: + n, c = x.size()[0:2] + return x.contiguous().view((n, c, -1)).sum(2).sum(0) + + +def _count_samples(x): + count = 1 + for i, s in enumerate(x.size()): + if i != 1: + count *= s + return count + + +def _act_forward(ctx, x): + if ctx.activation == ACT_LEAKY_RELU: + _backend.leaky_relu_forward(x, ctx.slope) + elif ctx.activation == ACT_ELU: + _backend.elu_forward(x) + elif ctx.activation == ACT_NONE: + pass + + +def _act_backward(ctx, x, dx): + if ctx.activation == ACT_LEAKY_RELU: + _backend.leaky_relu_backward(x, dx, ctx.slope) + elif ctx.activation == ACT_ELU: + _backend.elu_backward(x, dx) + elif 
ctx.activation == ACT_NONE: + pass + + +class InPlaceABN(autograd.Function): + @staticmethod + def forward(ctx, x, weight, bias, running_mean, running_var, + training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01): + # Save context + ctx.training = training + ctx.momentum = momentum + ctx.eps = eps + ctx.activation = activation + ctx.slope = slope + ctx.affine = weight is not None and bias is not None + + # Prepare inputs + count = _count_samples(x) + x = x.contiguous() + weight = weight.contiguous() if ctx.affine else x.new_empty(0) + bias = bias.contiguous() if ctx.affine else x.new_empty(0) + + if ctx.training: + mean, var = _backend.mean_var(x) + + # Update running stats + running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean) + running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * count / (count - 1)) + + # Mark in-place modified tensors + ctx.mark_dirty(x, running_mean, running_var) + else: + mean, var = running_mean.contiguous(), running_var.contiguous() + ctx.mark_dirty(x) + + # BN forward + activation + _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps) + _act_forward(ctx, x) + + # Output + ctx.var = var + ctx.save_for_backward(x, var, weight, bias) + ctx.mark_non_differentiable(running_mean, running_var) + return x, running_mean, running_var + + @staticmethod + @once_differentiable + def backward(ctx, dz, _drunning_mean, _drunning_var): + z, var, weight, bias = ctx.saved_tensors + dz = dz.contiguous() + + # Undo activation + _act_backward(ctx, z, dz) + + if ctx.training: + edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps) + else: + # TODO: implement simplified CUDA backward for inference mode + edz = dz.new_zeros(dz.size(1)) + eydz = dz.new_zeros(dz.size(1)) + + dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps) + # dweight = eydz * weight.sign() if ctx.affine else None + dweight = eydz if ctx.affine else None + if dweight is not None: + 
dweight[weight < 0] *= -1 + dbias = edz if ctx.affine else None + + return dx, dweight, dbias, None, None, None, None, None, None, None + + +class InPlaceABNSync(autograd.Function): + @classmethod + def forward(cls, ctx, x, weight, bias, running_mean, running_var, + training=True, momentum=0.1, eps=1e-05, activation=ACT_LEAKY_RELU, slope=0.01, equal_batches=True): + # Save context + ctx.training = training + ctx.momentum = momentum + ctx.eps = eps + ctx.activation = activation + ctx.slope = slope + ctx.affine = weight is not None and bias is not None + + # Prepare inputs + ctx.world_size = dist.get_world_size() if dist.is_initialized() else 1 + + # count = _count_samples(x) + batch_size = x.new_tensor([x.shape[0]], dtype=torch.long) + + x = x.contiguous() + weight = weight.contiguous() if ctx.affine else x.new_empty(0) + bias = bias.contiguous() if ctx.affine else x.new_empty(0) + + if ctx.training: + mean, var = _backend.mean_var(x) + if ctx.world_size > 1: + # get global batch size + if equal_batches: + batch_size *= ctx.world_size + else: + dist.all_reduce(batch_size, dist.ReduceOp.SUM) + + ctx.factor = x.shape[0] / float(batch_size.item()) + + mean_all = mean.clone() * ctx.factor + dist.all_reduce(mean_all, dist.ReduceOp.SUM) + + var_all = (var + (mean - mean_all) ** 2) * ctx.factor + dist.all_reduce(var_all, dist.ReduceOp.SUM) + + mean = mean_all + var = var_all + + # Update running stats + running_mean.mul_((1 - ctx.momentum)).add_(ctx.momentum * mean) + count = batch_size.item() * x.view(x.shape[0], x.shape[1], -1).shape[-1] + running_var.mul_((1 - ctx.momentum)).add_(ctx.momentum * var * (float(count) / (count - 1))) + + # Mark in-place modified tensors + ctx.mark_dirty(x, running_mean, running_var) + else: + mean, var = running_mean.contiguous(), running_var.contiguous() + ctx.mark_dirty(x) + + # BN forward + activation + _backend.forward(x, mean, var, weight, bias, ctx.affine, ctx.eps) + _act_forward(ctx, x) + + # Output + ctx.var = var + 
ctx.save_for_backward(x, var, weight, bias) + ctx.mark_non_differentiable(running_mean, running_var) + return x, running_mean, running_var + + @staticmethod + @once_differentiable + def backward(ctx, dz, _drunning_mean, _drunning_var): + z, var, weight, bias = ctx.saved_tensors + dz = dz.contiguous() + + # Undo activation + _act_backward(ctx, z, dz) + + if ctx.training: + edz, eydz = _backend.edz_eydz(z, dz, weight, bias, ctx.affine, ctx.eps) + edz_local = edz.clone() + eydz_local = eydz.clone() + + if ctx.world_size > 1: + edz *= ctx.factor + dist.all_reduce(edz, dist.ReduceOp.SUM) + + eydz *= ctx.factor + dist.all_reduce(eydz, dist.ReduceOp.SUM) + else: + edz_local = edz = dz.new_zeros(dz.size(1)) + eydz_local = eydz = dz.new_zeros(dz.size(1)) + + dx = _backend.backward(z, dz, var, weight, bias, edz, eydz, ctx.affine, ctx.eps) + # dweight = eydz_local * weight.sign() if ctx.affine else None + dweight = eydz_local if ctx.affine else None + if dweight is not None: + dweight[weight < 0] *= -1 + dbias = edz_local if ctx.affine else None + + return dx, dweight, dbias, None, None, None, None, None, None, None + + +inplace_abn = InPlaceABN.apply +inplace_abn_sync = InPlaceABNSync.apply + +__all__ = ["inplace_abn", "inplace_abn_sync", "ACT_RELU", "ACT_LEAKY_RELU", "ACT_ELU", "ACT_NONE"] diff --git a/Leffa/preprocess/humanparsing/modules/misc.py b/Leffa/preprocess/humanparsing/modules/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..3c50b69b38c950801baacba8b3684ffd23aef08b --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/misc.py @@ -0,0 +1,21 @@ +import torch.nn as nn +import torch +import torch.distributed as dist + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + """Global average pooling over the input's spatial dimensions""" + super(GlobalAvgPool2d, self).__init__() + + def forward(self, inputs): + in_size = inputs.size() + return inputs.view((in_size[0], in_size[1], -1)).mean(dim=2) + +class SingleGPU(nn.Module): + def 
__init__(self, module): + super(SingleGPU, self).__init__() + self.module=module + + def forward(self, input): + return self.module(input.cuda(non_blocking=True)) + diff --git a/Leffa/preprocess/humanparsing/modules/residual.py b/Leffa/preprocess/humanparsing/modules/residual.py new file mode 100644 index 0000000000000000000000000000000000000000..8a5c90e0606a451ff690f67a2feac28476241d86 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/residual.py @@ -0,0 +1,182 @@ +from collections import OrderedDict + +import torch.nn as nn + +from .bn import ABN, ACT_LEAKY_RELU, ACT_ELU, ACT_NONE +import torch.nn.functional as functional + + +class ResidualBlock(nn.Module): + """Configurable residual block + + Parameters + ---------- + in_channels : int + Number of input channels. + channels : list of int + Number of channels in the internal feature maps. Can either have two or three elements: if three construct + a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then + `3 x 3` then `1 x 1` convolutions. + stride : int + Stride of the first `3 x 3` convolution + dilation : int + Dilation to apply to the `3 x 3` convolutions. + groups : int + Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with + bottleneck blocks. + norm_act : callable + Function to create normalization / activation Module. + dropout: callable + Function to create Dropout Module. 
+ """ + + def __init__(self, + in_channels, + channels, + stride=1, + dilation=1, + groups=1, + norm_act=ABN, + dropout=None): + super(ResidualBlock, self).__init__() + + # Check parameters for inconsistencies + if len(channels) != 2 and len(channels) != 3: + raise ValueError("channels must contain either two or three values") + if len(channels) == 2 and groups != 1: + raise ValueError("groups > 1 are only valid if len(channels) == 3") + + is_bottleneck = len(channels) == 3 + need_proj_conv = stride != 1 or in_channels != channels[-1] + + if not is_bottleneck: + bn2 = norm_act(channels[1]) + bn2.activation = ACT_NONE + layers = [ + ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False, + dilation=dilation)), + ("bn1", norm_act(channels[0])), + ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False, + dilation=dilation)), + ("bn2", bn2) + ] + if dropout is not None: + layers = layers[0:2] + [("dropout", dropout())] + layers[2:] + else: + bn3 = norm_act(channels[2]) + bn3.activation = ACT_NONE + layers = [ + ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=1, padding=0, bias=False)), + ("bn1", norm_act(channels[0])), + ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=stride, padding=dilation, bias=False, + groups=groups, dilation=dilation)), + ("bn2", norm_act(channels[1])), + ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)), + ("bn3", bn3) + ] + if dropout is not None: + layers = layers[0:4] + [("dropout", dropout())] + layers[4:] + self.convs = nn.Sequential(OrderedDict(layers)) + + if need_proj_conv: + self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False) + self.proj_bn = norm_act(channels[-1]) + self.proj_bn.activation = ACT_NONE + + def forward(self, x): + if hasattr(self, "proj_conv"): + residual = self.proj_conv(x) + residual = self.proj_bn(residual) + else: + residual = x + x = self.convs(x) + 
residual + + if self.convs.bn1.activation == ACT_LEAKY_RELU: + return functional.leaky_relu(x, negative_slope=self.convs.bn1.slope, inplace=True) + elif self.convs.bn1.activation == ACT_ELU: + return functional.elu(x, inplace=True) + else: + return x + + +class IdentityResidualBlock(nn.Module): + def __init__(self, + in_channels, + channels, + stride=1, + dilation=1, + groups=1, + norm_act=ABN, + dropout=None): + """Configurable identity-mapping residual block + + Parameters + ---------- + in_channels : int + Number of input channels. + channels : list of int + Number of channels in the internal feature maps. Can either have two or three elements: if three construct + a residual block with two `3 x 3` convolutions, otherwise construct a bottleneck block with `1 x 1`, then + `3 x 3` then `1 x 1` convolutions. + stride : int + Stride of the first `3 x 3` convolution + dilation : int + Dilation to apply to the `3 x 3` convolutions. + groups : int + Number of convolution groups. This is used to create ResNeXt-style blocks and is only compatible with + bottleneck blocks. + norm_act : callable + Function to create normalization / activation Module. + dropout: callable + Function to create Dropout Module. 
+ """ + super(IdentityResidualBlock, self).__init__() + + # Check parameters for inconsistencies + if len(channels) != 2 and len(channels) != 3: + raise ValueError("channels must contain either two or three values") + if len(channels) == 2 and groups != 1: + raise ValueError("groups > 1 are only valid if len(channels) == 3") + + is_bottleneck = len(channels) == 3 + need_proj_conv = stride != 1 or in_channels != channels[-1] + + self.bn1 = norm_act(in_channels) + if not is_bottleneck: + layers = [ + ("conv1", nn.Conv2d(in_channels, channels[0], 3, stride=stride, padding=dilation, bias=False, + dilation=dilation)), + ("bn2", norm_act(channels[0])), + ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False, + dilation=dilation)) + ] + if dropout is not None: + layers = layers[0:2] + [("dropout", dropout())] + layers[2:] + else: + layers = [ + ("conv1", nn.Conv2d(in_channels, channels[0], 1, stride=stride, padding=0, bias=False)), + ("bn2", norm_act(channels[0])), + ("conv2", nn.Conv2d(channels[0], channels[1], 3, stride=1, padding=dilation, bias=False, + groups=groups, dilation=dilation)), + ("bn3", norm_act(channels[1])), + ("conv3", nn.Conv2d(channels[1], channels[2], 1, stride=1, padding=0, bias=False)) + ] + if dropout is not None: + layers = layers[0:4] + [("dropout", dropout())] + layers[4:] + self.convs = nn.Sequential(OrderedDict(layers)) + + if need_proj_conv: + self.proj_conv = nn.Conv2d(in_channels, channels[-1], 1, stride=stride, padding=0, bias=False) + + def forward(self, x): + if hasattr(self, "proj_conv"): + bn1 = self.bn1(x) + shortcut = self.proj_conv(bn1) + else: + shortcut = x.clone() + bn1 = self.bn1(x) + + out = self.convs(bn1) + out.add_(shortcut) + + return out diff --git a/Leffa/preprocess/humanparsing/modules/src/checks.h b/Leffa/preprocess/humanparsing/modules/src/checks.h new file mode 100644 index 0000000000000000000000000000000000000000..e761a6fe34d0789815b588eba7e3726026e0e868 --- /dev/null +++ 
b/Leffa/preprocess/humanparsing/modules/src/checks.h @@ -0,0 +1,15 @@ +#pragma once + +#include + +// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT +#ifndef AT_CHECK +#define AT_CHECK AT_ASSERT +#endif + +#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous") + +#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) +#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x) \ No newline at end of file diff --git a/Leffa/preprocess/humanparsing/modules/src/inplace_abn.cpp b/Leffa/preprocess/humanparsing/modules/src/inplace_abn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a6b1128cc20cbfc476134154e23e5869a92b856 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/inplace_abn.cpp @@ -0,0 +1,95 @@ +#include + +#include + +#include "inplace_abn.h" + +std::vector mean_var(at::Tensor x) { + if (x.is_cuda()) { + if (x.type().scalarType() == at::ScalarType::Half) { + return mean_var_cuda_h(x); + } else { + return mean_var_cuda(x); + } + } else { + return mean_var_cpu(x); + } +} + +at::Tensor forward(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + if (x.is_cuda()) { + if (x.type().scalarType() == at::ScalarType::Half) { + return forward_cuda_h(x, mean, var, weight, bias, affine, eps); + } else { + return forward_cuda(x, mean, var, weight, bias, affine, eps); + } + } else { + return forward_cpu(x, mean, var, weight, bias, affine, eps); + } +} + +std::vector edz_eydz(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + if (z.is_cuda()) { + if (z.type().scalarType() == at::ScalarType::Half) { + return edz_eydz_cuda_h(z, dz, weight, bias, affine, eps); + } else { + return edz_eydz_cuda(z, dz, 
weight, bias, affine, eps); + } + } else { + return edz_eydz_cpu(z, dz, weight, bias, affine, eps); + } +} + +at::Tensor backward(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps) { + if (z.is_cuda()) { + if (z.type().scalarType() == at::ScalarType::Half) { + return backward_cuda_h(z, dz, var, weight, bias, edz, eydz, affine, eps); + } else { + return backward_cuda(z, dz, var, weight, bias, edz, eydz, affine, eps); + } + } else { + return backward_cpu(z, dz, var, weight, bias, edz, eydz, affine, eps); + } +} + +void leaky_relu_forward(at::Tensor z, float slope) { + at::leaky_relu_(z, slope); +} + +void leaky_relu_backward(at::Tensor z, at::Tensor dz, float slope) { + if (z.is_cuda()) { + if (z.type().scalarType() == at::ScalarType::Half) { + return leaky_relu_backward_cuda_h(z, dz, slope); + } else { + return leaky_relu_backward_cuda(z, dz, slope); + } + } else { + return leaky_relu_backward_cpu(z, dz, slope); + } +} + +void elu_forward(at::Tensor z) { + at::elu_(z); +} + +void elu_backward(at::Tensor z, at::Tensor dz) { + if (z.is_cuda()) { + return elu_backward_cuda(z, dz); + } else { + return elu_backward_cpu(z, dz); + } +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("mean_var", &mean_var, "Mean and variance computation"); + m.def("forward", &forward, "In-place forward computation"); + m.def("edz_eydz", &edz_eydz, "First part of backward computation"); + m.def("backward", &backward, "Second part of backward computation"); + m.def("leaky_relu_forward", &leaky_relu_forward, "Leaky relu forward computation"); + m.def("leaky_relu_backward", &leaky_relu_backward, "Leaky relu backward computation and inversion"); + m.def("elu_forward", &elu_forward, "Elu forward computation"); + m.def("elu_backward", &elu_backward, "Elu backward computation and inversion"); +} diff --git a/Leffa/preprocess/humanparsing/modules/src/inplace_abn.h 
b/Leffa/preprocess/humanparsing/modules/src/inplace_abn.h new file mode 100644 index 0000000000000000000000000000000000000000..17afd1196449ecb6376f28961e54b55e1537492f --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/inplace_abn.h @@ -0,0 +1,88 @@ +#pragma once + +#include + +#include + +std::vector mean_var_cpu(at::Tensor x); +std::vector mean_var_cuda(at::Tensor x); +std::vector mean_var_cuda_h(at::Tensor x); + +at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps); +at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps); +at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps); + +std::vector edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps); +std::vector edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps); +std::vector edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps); + +at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps); +at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps); +at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps); + +void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope); +void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope); +void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope); + +void elu_backward_cpu(at::Tensor z, at::Tensor dz); +void 
elu_backward_cuda(at::Tensor z, at::Tensor dz); + +static void get_dims(at::Tensor x, int64_t& num, int64_t& chn, int64_t& sp) { + num = x.size(0); + chn = x.size(1); + sp = 1; + for (int64_t i = 2; i < x.ndimension(); ++i) + sp *= x.size(i); +} + +/* + * Specialized CUDA reduction functions for BN + */ +#ifdef __CUDACC__ + +#include "utils/cuda.cuh" + +template +__device__ T reduce(Op op, int plane, int N, int S) { + T sum = (T)0; + for (int batch = 0; batch < N; ++batch) { + for (int x = threadIdx.x; x < S; x += blockDim.x) { + sum += op(batch, plane, x); + } + } + + // sum over NumThreads within a warp + sum = warpSum(sum); + + // 'transpose', and reduce within warp again + __shared__ T shared[32]; + __syncthreads(); + if (threadIdx.x % WARP_SIZE == 0) { + shared[threadIdx.x / WARP_SIZE] = sum; + } + if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) { + // zero out the other entries in shared + shared[threadIdx.x] = (T)0; + } + __syncthreads(); + if (threadIdx.x / WARP_SIZE == 0) { + sum = warpSum(shared[threadIdx.x]); + if (threadIdx.x == 0) { + shared[0] = sum; + } + } + __syncthreads(); + + // Everyone picks it up, should be broadcast into the whole gradInput + return shared[0]; +} +#endif diff --git a/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ffc6d38c52ea31661b8dd438dc3fe1958f50b61e --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cpu.cpp @@ -0,0 +1,119 @@ +#include + +#include + +#include "utils/checks.h" +#include "inplace_abn.h" + +at::Tensor reduce_sum(at::Tensor x) { + if (x.ndimension() == 2) { + return x.sum(0); + } else { + auto x_view = x.view({x.size(0), x.size(1), -1}); + return x_view.sum(-1).sum(0); + } +} + +at::Tensor broadcast_to(at::Tensor v, at::Tensor x) { + if (x.ndimension() == 2) { + return v; + } else { + std::vector broadcast_size = {1, 
-1}; + for (int64_t i = 2; i < x.ndimension(); ++i) + broadcast_size.push_back(1); + + return v.view(broadcast_size); + } +} + +int64_t count(at::Tensor x) { + int64_t count = x.size(0); + for (int64_t i = 2; i < x.ndimension(); ++i) + count *= x.size(i); + + return count; +} + +at::Tensor invert_affine(at::Tensor z, at::Tensor weight, at::Tensor bias, bool affine, float eps) { + if (affine) { + return (z - broadcast_to(bias, z)) / broadcast_to(at::abs(weight) + eps, z); + } else { + return z; + } +} + +std::vector mean_var_cpu(at::Tensor x) { + auto num = count(x); + auto mean = reduce_sum(x) / num; + auto diff = x - broadcast_to(mean, x); + auto var = reduce_sum(diff.pow(2)) / num; + + return {mean, var}; +} + +at::Tensor forward_cpu(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + auto gamma = affine ? at::abs(weight) + eps : at::ones_like(var); + auto mul = at::rsqrt(var + eps) * gamma; + + x.sub_(broadcast_to(mean, x)); + x.mul_(broadcast_to(mul, x)); + if (affine) x.add_(broadcast_to(bias, x)); + + return x; +} + +std::vector edz_eydz_cpu(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + auto edz = reduce_sum(dz); + auto y = invert_affine(z, weight, bias, affine, eps); + auto eydz = reduce_sum(y * dz); + + return {edz, eydz}; +} + +at::Tensor backward_cpu(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps) { + auto y = invert_affine(z, weight, bias, affine, eps); + auto mul = affine ? 
at::rsqrt(var + eps) * (at::abs(weight) + eps) : at::rsqrt(var + eps); + + auto num = count(z); + auto dx = (dz - broadcast_to(edz / num, dz) - y * broadcast_to(eydz / num, dz)) * broadcast_to(mul, dz); + return dx; +} + +void leaky_relu_backward_cpu(at::Tensor z, at::Tensor dz, float slope) { + CHECK_CPU_INPUT(z); + CHECK_CPU_INPUT(dz); + + AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cpu", ([&] { + int64_t count = z.numel(); + auto *_z = z.data(); + auto *_dz = dz.data(); + + for (int64_t i = 0; i < count; ++i) { + if (_z[i] < 0) { + _z[i] *= 1 / slope; + _dz[i] *= slope; + } + } + })); +} + +void elu_backward_cpu(at::Tensor z, at::Tensor dz) { + CHECK_CPU_INPUT(z); + CHECK_CPU_INPUT(dz); + + AT_DISPATCH_FLOATING_TYPES(z.type(), "elu_backward_cpu", ([&] { + int64_t count = z.numel(); + auto *_z = z.data(); + auto *_dz = dz.data(); + + for (int64_t i = 0; i < count; ++i) { + if (_z[i] < 0) { + _z[i] = log1p(_z[i]); + _dz[i] *= (_z[i] + 1.f); + } + } + })); +} diff --git a/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..b157b06d47173d1645c6a40c89f564b737e84d43 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda.cu @@ -0,0 +1,333 @@ +#include + +#include +#include + +#include + +#include "utils/checks.h" +#include "utils/cuda.cuh" +#include "inplace_abn.h" + +#include + +// Operations for reduce +template +struct SumOp { + __device__ SumOp(const T *t, int c, int s) + : tensor(t), chn(c), sp(s) {} + __device__ __forceinline__ T operator()(int batch, int plane, int n) { + return tensor[(batch * chn + plane) * sp + n]; + } + const T *tensor; + const int chn; + const int sp; +}; + +template +struct VarOp { + __device__ VarOp(T m, const T *t, int c, int s) + : mean(m), tensor(t), chn(c), sp(s) {} + __device__ __forceinline__ T operator()(int batch, int plane, int n) { + T val = 
tensor[(batch * chn + plane) * sp + n]; + return (val - mean) * (val - mean); + } + const T mean; + const T *tensor; + const int chn; + const int sp; +}; + +template +struct GradOp { + __device__ GradOp(T _weight, T _bias, const T *_z, const T *_dz, int c, int s) + : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {} + __device__ __forceinline__ Pair operator()(int batch, int plane, int n) { + T _y = (z[(batch * chn + plane) * sp + n] - bias) / weight; + T _dz = dz[(batch * chn + plane) * sp + n]; + return Pair(_dz, _y * _dz); + } + const T weight; + const T bias; + const T *z; + const T *dz; + const int chn; + const int sp; +}; + +/*********** + * mean_var + ***********/ + +template +__global__ void mean_var_kernel(const T *x, T *mean, T *var, int num, int chn, int sp) { + int plane = blockIdx.x; + T norm = T(1) / T(num * sp); + + T _mean = reduce>(SumOp(x, chn, sp), plane, num, sp) * norm; + __syncthreads(); + T _var = reduce>(VarOp(_mean, x, chn, sp), plane, num, sp) * norm; + + if (threadIdx.x == 0) { + mean[plane] = _mean; + var[plane] = _var; + } +} + +std::vector mean_var_cuda(at::Tensor x) { + CHECK_CUDA_INPUT(x); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(x, num, chn, sp); + + // Prepare output tensors + auto mean = at::empty({chn}, x.options()); + auto var = at::empty({chn}, x.options()); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES(x.type(), "mean_var_cuda", ([&] { + mean_var_kernel<<>>( + x.data(), + mean.data(), + var.data(), + num, chn, sp); + })); + + return {mean, var}; +} + +/********** + * forward + **********/ + +template +__global__ void forward_kernel(T *x, const T *mean, const T *var, const T *weight, const T *bias, + bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + T _mean = mean[plane]; + T _var = var[plane]; + T _weight = affine ? 
abs(weight[plane]) + eps : T(1); + T _bias = affine ? bias[plane] : T(0); + + T mul = rsqrt(_var + eps) * _weight; + + for (int batch = 0; batch < num; ++batch) { + for (int n = threadIdx.x; n < sp; n += blockDim.x) { + T _x = x[(batch * chn + plane) * sp + n]; + T _y = (_x - _mean) * mul + _bias; + + x[(batch * chn + plane) * sp + n] = _y; + } + } +} + +at::Tensor forward_cuda(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + CHECK_CUDA_INPUT(x); + CHECK_CUDA_INPUT(mean); + CHECK_CUDA_INPUT(var); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(x, num, chn, sp); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES(x.type(), "forward_cuda", ([&] { + forward_kernel<<>>( + x.data(), + mean.data(), + var.data(), + weight.data(), + bias.data(), + affine, eps, num, chn, sp); + })); + + return x; +} + +/*********** + * edz_eydz + ***********/ + +template +__global__ void edz_eydz_kernel(const T *z, const T *dz, const T *weight, const T *bias, + T *edz, T *eydz, bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + T _weight = affine ? abs(weight[plane]) + eps : 1.f; + T _bias = affine ? 
bias[plane] : 0.f; + + Pair res = reduce, GradOp>(GradOp(_weight, _bias, z, dz, chn, sp), plane, num, sp); + __syncthreads(); + + if (threadIdx.x == 0) { + edz[plane] = res.v1; + eydz[plane] = res.v2; + } +} + +std::vector edz_eydz_cuda(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(z, num, chn, sp); + + auto edz = at::empty({chn}, z.options()); + auto eydz = at::empty({chn}, z.options()); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES(z.type(), "edz_eydz_cuda", ([&] { + edz_eydz_kernel<<>>( + z.data(), + dz.data(), + weight.data(), + bias.data(), + edz.data(), + eydz.data(), + affine, eps, num, chn, sp); + })); + + return {edz, eydz}; +} + +/*********** + * backward + ***********/ + +template +__global__ void backward_kernel(const T *z, const T *dz, const T *var, const T *weight, const T *bias, const T *edz, + const T *eydz, T *dx, bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + T _weight = affine ? abs(weight[plane]) + eps : 1.f; + T _bias = affine ? 
bias[plane] : 0.f; + T _var = var[plane]; + T _edz = edz[plane]; + T _eydz = eydz[plane]; + + T _mul = _weight * rsqrt(_var + eps); + T count = T(num * sp); + + for (int batch = 0; batch < num; ++batch) { + for (int n = threadIdx.x; n < sp; n += blockDim.x) { + T _dz = dz[(batch * chn + plane) * sp + n]; + T _y = (z[(batch * chn + plane) * sp + n] - _bias) / _weight; + + dx[(batch * chn + plane) * sp + n] = (_dz - _edz / count - _y * _eydz / count) * _mul; + } + } +} + +at::Tensor backward_cuda(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + CHECK_CUDA_INPUT(var); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + CHECK_CUDA_INPUT(edz); + CHECK_CUDA_INPUT(eydz); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(z, num, chn, sp); + + auto dx = at::zeros_like(z); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES(z.type(), "backward_cuda", ([&] { + backward_kernel<<>>( + z.data(), + dz.data(), + var.data(), + weight.data(), + bias.data(), + edz.data(), + eydz.data(), + dx.data(), + affine, eps, num, chn, sp); + })); + + return dx; +} + +/************** + * activations + **************/ + +template +inline void leaky_relu_backward_impl(T *z, T *dz, float slope, int64_t count) { + // Create thrust pointers + thrust::device_ptr th_z = thrust::device_pointer_cast(z); + thrust::device_ptr th_dz = thrust::device_pointer_cast(dz); + + auto stream = at::cuda::getCurrentCUDAStream(); + thrust::transform_if(thrust::cuda::par.on(stream), + th_dz, th_dz + count, th_z, th_dz, + [slope] __device__ (const T& dz) { return dz * slope; }, + [] __device__ (const T& z) { return z < 0; }); + thrust::transform_if(thrust::cuda::par.on(stream), + th_z, th_z + count, th_z, + [slope] __device__ (const T& z) { return z / slope; }, + [] 
__device__ (const T& z) { return z < 0; }); +} + +void leaky_relu_backward_cuda(at::Tensor z, at::Tensor dz, float slope) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + + int64_t count = z.numel(); + + AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] { + leaky_relu_backward_impl(z.data(), dz.data(), slope, count); + })); +} + +template +inline void elu_backward_impl(T *z, T *dz, int64_t count) { + // Create thrust pointers + thrust::device_ptr th_z = thrust::device_pointer_cast(z); + thrust::device_ptr th_dz = thrust::device_pointer_cast(dz); + + auto stream = at::cuda::getCurrentCUDAStream(); + thrust::transform_if(thrust::cuda::par.on(stream), + th_dz, th_dz + count, th_z, th_z, th_dz, + [] __device__ (const T& dz, const T& z) { return dz * (z + 1.); }, + [] __device__ (const T& z) { return z < 0; }); + thrust::transform_if(thrust::cuda::par.on(stream), + th_z, th_z + count, th_z, + [] __device__ (const T& z) { return log1p(z); }, + [] __device__ (const T& z) { return z < 0; }); +} + +void elu_backward_cuda(at::Tensor z, at::Tensor dz) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + + int64_t count = z.numel(); + + AT_DISPATCH_FLOATING_TYPES(z.type(), "leaky_relu_backward_cuda", ([&] { + elu_backward_impl(z.data(), dz.data(), count); + })); +} diff --git a/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb63e73f9d90179e5bd5dae5579c4844da9c25e2 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/inplace_abn_cuda_half.cu @@ -0,0 +1,275 @@ +#include + +#include + +#include + +#include "utils/checks.h" +#include "utils/cuda.cuh" +#include "inplace_abn.h" + +#include + +// Operations for reduce +struct SumOpH { + __device__ SumOpH(const half *t, int c, int s) + : tensor(t), chn(c), sp(s) {} + __device__ __forceinline__ float operator()(int batch, int plane, int n) { + return 
__half2float(tensor[(batch * chn + plane) * sp + n]); + } + const half *tensor; + const int chn; + const int sp; +}; + +struct VarOpH { + __device__ VarOpH(float m, const half *t, int c, int s) + : mean(m), tensor(t), chn(c), sp(s) {} + __device__ __forceinline__ float operator()(int batch, int plane, int n) { + const auto t = __half2float(tensor[(batch * chn + plane) * sp + n]); + return (t - mean) * (t - mean); + } + const float mean; + const half *tensor; + const int chn; + const int sp; +}; + +struct GradOpH { + __device__ GradOpH(float _weight, float _bias, const half *_z, const half *_dz, int c, int s) + : weight(_weight), bias(_bias), z(_z), dz(_dz), chn(c), sp(s) {} + __device__ __forceinline__ Pair operator()(int batch, int plane, int n) { + float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - bias) / weight; + float _dz = __half2float(dz[(batch * chn + plane) * sp + n]); + return Pair(_dz, _y * _dz); + } + const float weight; + const float bias; + const half *z; + const half *dz; + const int chn; + const int sp; +}; + +/*********** + * mean_var + ***********/ + +__global__ void mean_var_kernel_h(const half *x, float *mean, float *var, int num, int chn, int sp) { + int plane = blockIdx.x; + float norm = 1.f / static_cast(num * sp); + + float _mean = reduce(SumOpH(x, chn, sp), plane, num, sp) * norm; + __syncthreads(); + float _var = reduce(VarOpH(_mean, x, chn, sp), plane, num, sp) * norm; + + if (threadIdx.x == 0) { + mean[plane] = _mean; + var[plane] = _var; + } +} + +std::vector mean_var_cuda_h(at::Tensor x) { + CHECK_CUDA_INPUT(x); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(x, num, chn, sp); + + // Prepare output tensors + auto mean = at::empty({chn},x.options().dtype(at::kFloat)); + auto var = at::empty({chn},x.options().dtype(at::kFloat)); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + mean_var_kernel_h<<>>( + reinterpret_cast(x.data()), + 
mean.data(), + var.data(), + num, chn, sp); + + return {mean, var}; +} + +/********** + * forward + **********/ + +__global__ void forward_kernel_h(half *x, const float *mean, const float *var, const float *weight, const float *bias, + bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + const float _mean = mean[plane]; + const float _var = var[plane]; + const float _weight = affine ? abs(weight[plane]) + eps : 1.f; + const float _bias = affine ? bias[plane] : 0.f; + + const float mul = rsqrt(_var + eps) * _weight; + + for (int batch = 0; batch < num; ++batch) { + for (int n = threadIdx.x; n < sp; n += blockDim.x) { + half *x_ptr = x + (batch * chn + plane) * sp + n; + float _x = __half2float(*x_ptr); + float _y = (_x - _mean) * mul + _bias; + + *x_ptr = __float2half(_y); + } + } +} + +at::Tensor forward_cuda_h(at::Tensor x, at::Tensor mean, at::Tensor var, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + CHECK_CUDA_INPUT(x); + CHECK_CUDA_INPUT(mean); + CHECK_CUDA_INPUT(var); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(x, num, chn, sp); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + forward_kernel_h<<>>( + reinterpret_cast(x.data()), + mean.data(), + var.data(), + weight.data(), + bias.data(), + affine, eps, num, chn, sp); + + return x; +} + +__global__ void edz_eydz_kernel_h(const half *z, const half *dz, const float *weight, const float *bias, + float *edz, float *eydz, bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + float _weight = affine ? abs(weight[plane]) + eps : 1.f; + float _bias = affine ? 
bias[plane] : 0.f; + + Pair res = reduce, GradOpH>(GradOpH(_weight, _bias, z, dz, chn, sp), plane, num, sp); + __syncthreads(); + + if (threadIdx.x == 0) { + edz[plane] = res.v1; + eydz[plane] = res.v2; + } +} + +std::vector edz_eydz_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor weight, at::Tensor bias, + bool affine, float eps) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(z, num, chn, sp); + + auto edz = at::empty({chn},z.options().dtype(at::kFloat)); + auto eydz = at::empty({chn},z.options().dtype(at::kFloat)); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + edz_eydz_kernel_h<<>>( + reinterpret_cast(z.data()), + reinterpret_cast(dz.data()), + weight.data(), + bias.data(), + edz.data(), + eydz.data(), + affine, eps, num, chn, sp); + + return {edz, eydz}; +} + +__global__ void backward_kernel_h(const half *z, const half *dz, const float *var, const float *weight, const float *bias, const float *edz, + const float *eydz, half *dx, bool affine, float eps, int num, int chn, int sp) { + int plane = blockIdx.x; + + float _weight = affine ? abs(weight[plane]) + eps : 1.f; + float _bias = affine ? 
bias[plane] : 0.f; + float _var = var[plane]; + float _edz = edz[plane]; + float _eydz = eydz[plane]; + + float _mul = _weight * rsqrt(_var + eps); + float count = float(num * sp); + + for (int batch = 0; batch < num; ++batch) { + for (int n = threadIdx.x; n < sp; n += blockDim.x) { + float _dz = __half2float(dz[(batch * chn + plane) * sp + n]); + float _y = (__half2float(z[(batch * chn + plane) * sp + n]) - _bias) / _weight; + + dx[(batch * chn + plane) * sp + n] = __float2half((_dz - _edz / count - _y * _eydz / count) * _mul); + } + } +} + +at::Tensor backward_cuda_h(at::Tensor z, at::Tensor dz, at::Tensor var, at::Tensor weight, at::Tensor bias, + at::Tensor edz, at::Tensor eydz, bool affine, float eps) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + CHECK_CUDA_INPUT(var); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(bias); + CHECK_CUDA_INPUT(edz); + CHECK_CUDA_INPUT(eydz); + + // Extract dimensions + int64_t num, chn, sp; + get_dims(z, num, chn, sp); + + auto dx = at::zeros_like(z); + + // Run kernel + dim3 blocks(chn); + dim3 threads(getNumThreads(sp)); + auto stream = at::cuda::getCurrentCUDAStream(); + backward_kernel_h<<>>( + reinterpret_cast(z.data()), + reinterpret_cast(dz.data()), + var.data(), + weight.data(), + bias.data(), + edz.data(), + eydz.data(), + reinterpret_cast(dx.data()), + affine, eps, num, chn, sp); + + return dx; +} + +__global__ void leaky_relu_backward_impl_h(half *z, half *dz, float slope, int64_t count) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x){ + float _z = __half2float(z[i]); + if (_z < 0) { + dz[i] = __float2half(__half2float(dz[i]) * slope); + z[i] = __float2half(_z / slope); + } + } +} + +void leaky_relu_backward_cuda_h(at::Tensor z, at::Tensor dz, float slope) { + CHECK_CUDA_INPUT(z); + CHECK_CUDA_INPUT(dz); + + int64_t count = z.numel(); + dim3 threads(getNumThreads(count)); + dim3 blocks = (count + threads.x - 1) / threads.x; + auto stream = 
at::cuda::getCurrentCUDAStream(); + leaky_relu_backward_impl_h<<>>( + reinterpret_cast(z.data()), + reinterpret_cast(dz.data()), + slope, count); +} + diff --git a/Leffa/preprocess/humanparsing/modules/src/utils/checks.h b/Leffa/preprocess/humanparsing/modules/src/utils/checks.h new file mode 100644 index 0000000000000000000000000000000000000000..e761a6fe34d0789815b588eba7e3726026e0e868 --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/utils/checks.h @@ -0,0 +1,15 @@ +#pragma once + +#include + +// Define AT_CHECK for old version of ATen where the same function was called AT_ASSERT +#ifndef AT_CHECK +#define AT_CHECK AT_ASSERT +#endif + +#define CHECK_CUDA(x) AT_CHECK((x).type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CPU(x) AT_CHECK(!(x).type().is_cuda(), #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) AT_CHECK((x).is_contiguous(), #x " must be contiguous") + +#define CHECK_CUDA_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) +#define CHECK_CPU_INPUT(x) CHECK_CPU(x); CHECK_CONTIGUOUS(x) \ No newline at end of file diff --git a/Leffa/preprocess/humanparsing/modules/src/utils/common.h b/Leffa/preprocess/humanparsing/modules/src/utils/common.h new file mode 100644 index 0000000000000000000000000000000000000000..e8403eef8a233b75dd4bb353c16486fe1be2039a --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/utils/common.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +/* + * Functions to share code between CPU and GPU + */ + +#ifdef __CUDACC__ +// CUDA versions + +#define HOST_DEVICE __host__ __device__ +#define INLINE_HOST_DEVICE __host__ __device__ inline +#define FLOOR(x) floor(x) + +#if __CUDA_ARCH__ >= 600 +// Recent compute capabilities have block-level atomicAdd for all data types, so we use that +#define ACCUM(x,y) atomicAdd_block(&(x),(y)) +#else +// Older architectures don't have block-level atomicAdd, nor atomicAdd for doubles, so we defer to atomicAdd for float +// and use the known atomicCAS-based implementation for 
double +template +__device__ inline data_t atomic_add(data_t *address, data_t val) { + return atomicAdd(address, val); +} + +template<> +__device__ inline double atomic_add(double *address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#define ACCUM(x,y) atomic_add(&(x),(y)) +#endif // #if __CUDA_ARCH__ >= 600 + +#else +// CPU versions + +#define HOST_DEVICE +#define INLINE_HOST_DEVICE inline +#define FLOOR(x) std::floor(x) +#define ACCUM(x,y) (x) += (y) + +#endif // #ifdef __CUDACC__ \ No newline at end of file diff --git a/Leffa/preprocess/humanparsing/modules/src/utils/cuda.cuh b/Leffa/preprocess/humanparsing/modules/src/utils/cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..60c0023835e02c5f7c539c28ac07b75b72df394b --- /dev/null +++ b/Leffa/preprocess/humanparsing/modules/src/utils/cuda.cuh @@ -0,0 +1,71 @@ +#pragma once + +/* + * General settings and functions + */ +const int WARP_SIZE = 32; +const int MAX_BLOCK_SIZE = 1024; + +static int getNumThreads(int nElem) { + int threadSizes[6] = {32, 64, 128, 256, 512, MAX_BLOCK_SIZE}; + for (int i = 0; i < 6; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return MAX_BLOCK_SIZE; +} + +/* + * Reduction utilities + */ +template +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, + unsigned int mask = 0xffffffff) { +#if CUDART_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +__device__ __forceinline__ int getMSB(int val) { return 31 - __clz(val); } + +template +struct Pair { + T v1, v2; + __device__ Pair() {} + __device__ Pair(T _v1, T _v2) : 
v1(_v1), v2(_v2) {} + __device__ Pair(T v) : v1(v), v2(v) {} + __device__ Pair(int v) : v1(v), v2(v) {} + __device__ Pair &operator+=(const Pair &a) { + v1 += a.v1; + v2 += a.v2; + return *this; + } +}; + +template +static __device__ __forceinline__ T warpSum(T val) { +#if __CUDA_ARCH__ >= 300 + for (int i = 0; i < getMSB(WARP_SIZE); ++i) { + val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE); + } +#else + __shared__ T values[MAX_BLOCK_SIZE]; + values[threadIdx.x] = val; + __threadfence_block(); + const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE; + for (int i = 1; i < WARP_SIZE; i++) { + val += values[base + ((i + threadIdx.x) % WARP_SIZE)]; + } +#endif + return val; +} + +template +static __device__ __forceinline__ Pair warpSum(Pair value) { + value.v1 = warpSum(value.v1); + value.v2 = warpSum(value.v2); + return value; +} \ No newline at end of file diff --git a/Leffa/preprocess/humanparsing/networks/AugmentCE2P.py b/Leffa/preprocess/humanparsing/networks/AugmentCE2P.py new file mode 100644 index 0000000000000000000000000000000000000000..ce32f78dd0b92d943e5b1d573a33e2f69f247f23 --- /dev/null +++ b/Leffa/preprocess/humanparsing/networks/AugmentCE2P.py @@ -0,0 +1,388 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +""" +@Author : Peike Li +@Contact : peike.li@yahoo.com +@File : AugmentCE2P.py +@Time : 8/4/19 3:35 PM +@Desc : +@License : This source code is licensed under the license found in the + LICENSE file in the root directory of this source tree. 
+""" + +import functools +import pdb + +import torch +import torch.nn as nn +from torch.nn import functional as F +# Note here we adopt the InplaceABNSync implementation from https://github.com/mapillary/inplace_abn +# By default, the InplaceABNSync module contains a BatchNorm Layer and a LeakyReLu layer +from modules import InPlaceABNSync +import numpy as np + +BatchNorm2d = functools.partial(InPlaceABNSync, activation='none') + +affine_par = True + +pretrained_settings = { + 'resnet101': { + 'imagenet': { + 'input_space': 'BGR', + 'input_size': [3, 224, 224], + 'input_range': [0, 1], + 'mean': [0.406, 0.456, 0.485], + 'std': [0.225, 0.224, 0.229], + 'num_classes': 1000 + } + }, +} + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None, fist_dilation=1, multi_grid=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=dilation * multi_grid, dilation=dilation * multi_grid, bias=False) + self.bn2 = BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=False) + self.relu_inplace = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out = out + residual + out = self.relu_inplace(out) + + return out + + 
+class CostomAdaptiveAvgPool2D(nn.Module): + + def __init__(self, output_size): + + super(CostomAdaptiveAvgPool2D, self).__init__() + + self.output_size = output_size + + def forward(self, x): + + H_in, W_in = x.shape[-2:] + H_out, W_out = self.output_size + + out_i = [] + for i in range(H_out): + out_j = [] + for j in range(W_out): + hs = int(np.floor(i * H_in / H_out)) + he = int(np.ceil((i + 1) * H_in / H_out)) + + ws = int(np.floor(j * W_in / W_out)) + we = int(np.ceil((j + 1) * W_in / W_out)) + + # print(hs, he, ws, we) + kernel_size = [he - hs, we - ws] + + out = F.avg_pool2d(x[:, :, hs:he, ws:we], kernel_size) + out_j.append(out) + + out_j = torch.concat(out_j, -1) + out_i.append(out_j) + + out_i = torch.concat(out_i, -2) + return out_i + + +class PSPModule(nn.Module): + """ + Reference: + Zhao, Hengshuang, et al. *"Pyramid scene parsing network."* + """ + + def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)): + super(PSPModule, self).__init__() + + self.stages = [] + tmp = [] + for size in sizes: + if size == 3 or size == 6: + tmp.append(self._make_stage_custom(features, out_features, size)) + else: + tmp.append(self._make_stage(features, out_features, size)) + self.stages = nn.ModuleList(tmp) + # self.stages = nn.ModuleList([self._make_stage(features, out_features, size) for size in sizes]) + self.bottleneck = nn.Sequential( + nn.Conv2d(features + len(sizes) * out_features, out_features, kernel_size=3, padding=1, dilation=1, + bias=False), + InPlaceABNSync(out_features), + ) + + def _make_stage(self, features, out_features, size): + prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) + conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False) + bn = InPlaceABNSync(out_features) + return nn.Sequential(prior, conv, bn) + + def _make_stage_custom(self, features, out_features, size): + prior = CostomAdaptiveAvgPool2D(output_size=(size, size)) + conv = nn.Conv2d(features, out_features, kernel_size=1, bias=False) + bn = 
InPlaceABNSync(out_features) + return nn.Sequential(prior, conv, bn) + + def forward(self, feats): + h, w = feats.size(2), feats.size(3) + priors = [F.interpolate(input=stage(feats), size=(h, w), mode='bilinear', align_corners=True) for stage in + self.stages] + [feats] + bottle = self.bottleneck(torch.cat(priors, 1)) + return bottle + + +class ASPPModule(nn.Module): + """ + Reference: + Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."* + """ + + def __init__(self, features, inner_features=256, out_features=512, dilations=(12, 24, 36)): + super(ASPPModule, self).__init__() + + self.conv1 = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)), + nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, + bias=False), + InPlaceABNSync(inner_features)) + self.conv2 = nn.Sequential( + nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(inner_features)) + self.conv3 = nn.Sequential( + nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[0], dilation=dilations[0], bias=False), + InPlaceABNSync(inner_features)) + self.conv4 = nn.Sequential( + nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[1], dilation=dilations[1], bias=False), + InPlaceABNSync(inner_features)) + self.conv5 = nn.Sequential( + nn.Conv2d(features, inner_features, kernel_size=3, padding=dilations[2], dilation=dilations[2], bias=False), + InPlaceABNSync(inner_features)) + + self.bottleneck = nn.Sequential( + nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(out_features), + nn.Dropout2d(0.1) + ) + + def forward(self, x): + _, _, h, w = x.size() + + feat1 = F.interpolate(self.conv1(x), size=(h, w), mode='bilinear', align_corners=True) + + feat2 = self.conv2(x) + feat3 = self.conv3(x) + feat4 = self.conv4(x) + feat5 = self.conv5(x) + out = torch.cat((feat1, feat2, feat3, feat4, feat5), 1) + + bottle = 
self.bottleneck(out) + return bottle + + +class Edge_Module(nn.Module): + """ + Edge Learning Branch + """ + + def __init__(self, in_fea=[256, 512, 1024], mid_fea=256, out_fea=2): + super(Edge_Module, self).__init__() + + self.conv1 = nn.Sequential( + nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(mid_fea) + ) + self.conv2 = nn.Sequential( + nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(mid_fea) + ) + self.conv3 = nn.Sequential( + nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(mid_fea) + ) + self.conv4 = nn.Conv2d(mid_fea, out_fea, kernel_size=3, padding=1, dilation=1, bias=True) + self.conv5 = nn.Conv2d(out_fea * 3, out_fea, kernel_size=1, padding=0, dilation=1, bias=True) + + def forward(self, x1, x2, x3): + _, _, h, w = x1.size() + + edge1_fea = self.conv1(x1) + edge1 = self.conv4(edge1_fea) + edge2_fea = self.conv2(x2) + edge2 = self.conv4(edge2_fea) + edge3_fea = self.conv3(x3) + edge3 = self.conv4(edge3_fea) + + edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear', align_corners=True) + edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear', align_corners=True) + edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear', align_corners=True) + edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear', align_corners=True) + + edge = torch.cat([edge1, edge2, edge3], dim=1) + edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1) + edge = self.conv5(edge) + + return edge, edge_fea + + +class Decoder_Module(nn.Module): + """ + Parsing Branch Decoder Module. 
+ """ + + def __init__(self, num_classes): + super(Decoder_Module, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d(512, 256, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(256) + ) + self.conv2 = nn.Sequential( + nn.Conv2d(256, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(48) + ) + self.conv3 = nn.Sequential( + nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(256), + nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(256) + ) + + self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True) + + def forward(self, xt, xl): + _, _, h, w = xl.size() + xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True) + xl = self.conv2(xl) + x = torch.cat([xt, xl], dim=1) + x = self.conv3(x) + seg = self.conv4(x) + return seg, x + + +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes): + self.inplanes = 128 + super(ResNet, self).__init__() + self.conv1 = conv3x3(3, 64, stride=2) + self.bn1 = BatchNorm2d(64) + self.relu1 = nn.ReLU(inplace=False) + self.conv2 = conv3x3(64, 64) + self.bn2 = BatchNorm2d(64) + self.relu2 = nn.ReLU(inplace=False) + self.conv3 = conv3x3(64, 128) + self.bn3 = BatchNorm2d(128) + self.relu3 = nn.ReLU(inplace=False) + + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=2, multi_grid=(1, 1, 1)) + + self.context_encoding = PSPModule(2048, 512) + + self.edge = Edge_Module() + self.decoder = Decoder_Module(num_classes) + + self.fushion = nn.Sequential( + nn.Conv2d(1024, 256, kernel_size=1, padding=0, dilation=1, bias=False), + InPlaceABNSync(256), + nn.Dropout2d(0.1), + 
def initialize_pretrained_model(model, settings, pretrained='./models/resnet101-imagenet.pth'):
    """Attach preprocessing metadata to ``model`` and optionally load pretrained weights.

    Args:
        model: Module to configure.  It receives the attributes ``input_space``,
            ``input_size``, ``input_range``, ``mean`` and ``std``.
        settings: Mapping that provides those five keys.
        pretrained: Path of a checkpoint holding a state dict, or ``None`` to
            skip weight loading.

    Raises:
        KeyError: If ``settings`` lacks one of the five keys.
        RuntimeError: Raised by the strict ``load_state_dict`` call when the
            merged state dict does not match the model (for example when the
            checkpoint contains a non-``fc`` key the model does not have).
    """
    model.input_space = settings['input_space']
    model.input_size = settings['input_size']
    model.input_range = settings['input_range']
    model.mean = settings['mean']
    model.std = settings['std']

    if pretrained is not None:
        # map_location='cpu' lets a checkpoint that was saved from a GPU be read
        # on a CPU-only host; load_state_dict copies the values into the model's
        # existing tensors, so the model stays on whatever device it is on.
        saved_state_dict = torch.load(pretrained, map_location='cpu')
        new_params = model.state_dict().copy()
        for name, value in saved_state_dict.items():
            # Entries whose first dotted component is 'fc' are not transferred.
            if name.split('.')[0] != 'fc':
                new_params[name] = value
        model.load_state_dict(new_params)
class MobileNetV2(nn.Module):
    """MobileNetV2 image classifier.

    The feature extractor is a strided 3x3 stem, a stack of inverted-residual
    blocks described by a ``(t, c, n, s)`` table, and a 1x1 convolution to
    ``last_channel`` channels.  ``forward`` averages the feature map over
    dims 3 and 2 and feeds the result to a dropout + linear classifier.

    Args:
        n_class: Output size of the linear classifier.
        input_size: Expected input resolution; must be divisible by 32.
        width_mult: Multiplier applied to the stem and per-stage channel
            counts.  The final channel count is only scaled when it is > 1.0.
    """

    def __init__(self, n_class=1000, input_size=224, width_mult=1.):
        super(MobileNetV2, self).__init__()
        block = InvertedResidual
        input_channel = 32
        last_channel = 1280
        # Each row: t = expand_ratio, c = output channels (before width_mult),
        # n = number of blocks, s = stride of the first block of the row.
        interverted_residual_setting = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],  # layer 2
            [6, 32, 3, 2],  # layer 3
            [6, 64, 4, 2],
            [6, 96, 3, 1],  # layer 4
            [6, 160, 3, 2],
            [6, 320, 1, 1],  # layer 5
        ]

        # building first layer
        assert input_size % 32 == 0
        input_channel = int(input_channel * width_mult)
        self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel
        layers = [conv_bn(3, input_channel, 2)]
        # building inverted residual blocks
        for t, c, n, s in interverted_residual_setting:
            output_channel = int(c * width_mult)
            for i in range(n):
                # Only the first block of a row applies the row's stride.
                layers.append(block(input_channel, output_channel, s if i == 0 else 1, expand_ratio=t))
                input_channel = output_channel
        # building last several layers
        layers.append(conv_1x1_bn(input_channel, self.last_channel))
        self.features = nn.Sequential(*layers)

        # building classifier
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.last_channel, n_class),
        )

        self._initialize_weights()

    def forward(self, x):
        """Return the classifier output for the input batch ``x``."""
        x = self.features(x)
        # Average over dim 3, then over dim 2, before the linear classifier.
        x = x.mean(3).mean(2)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Re-initialise convolution, normalisation and linear parameters in place.

        Conv2d weights are drawn from N(0, sqrt(2 / n)) with
        ``n = kernel_h * kernel_w * out_channels`` and conv biases are zeroed;
        normalisation layers get weight 1 and bias 0; Linear weights are drawn
        from N(0, 0.01) and Linear biases are zeroed.
        """
        # The file-level ``BatchNorm2d`` alias is a functools.partial, which
        # isinstance() rejects with TypeError; compare against the wrapped class.
        bn_type = BatchNorm2d.func if isinstance(BatchNorm2d, functools.partial) else BatchNorm2d
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, bn_type):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = MobileNetV2(n_class=1000, **kwargs) + if pretrained: + model.load_state_dict(load_url(model_urls['mobilenetv2']), strict=False) + return model diff --git a/Leffa/preprocess/humanparsing/networks/backbone/resnet.py b/Leffa/preprocess/humanparsing/networks/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..88d6f73bc4fc327e18123020e01ccf5c1b37f025 --- /dev/null +++ b/Leffa/preprocess/humanparsing/networks/backbone/resnet.py @@ -0,0 +1,205 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +""" +@Author : Peike Li +@Contact : peike.li@yahoo.com +@File : resnet.py +@Time : 8/4/19 3:35 PM +@Desc : +@License : This source code is licensed under the license found in the + LICENSE file in the root directory of this source tree. +""" + +import functools +import torch.nn as nn +import math +from torch.utils.model_zoo import load_url + +from modules import InPlaceABNSync + +BatchNorm2d = functools.partial(InPlaceABNSync, activation='none') + +__all__ = ['ResNet', 'resnet18', 'resnet50', 'resnet101'] # resnet101 is coming soon! 
class BasicBlock(nn.Module):
    """Residual block built from two 3x3 convolutions.

    Args:
        inplanes: Input channel count of the first convolution.
        planes: Output channel count of both convolutions.
        stride: Stride of the first convolution.
        downsample: Optional module applied to the input to form the
            shortcut; when ``None`` the input itself is the shortcut.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        # Main path: conv -> norm -> relu -> conv -> norm.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The shortcut is evaluated after the main path, then added in place.
        shortcut = x if self.downsample is None else self.downsample(x)
        out += shortcut
        return self.relu(out)
class ResNet(nn.Module):
    """ResNet classifier with a three-convolution stem.

    Args:
        block: Residual block class.  It must expose an ``expansion``
            attribute and accept ``(inplanes, planes, stride, downsample)``.
        layers: Number of blocks in each of the four stages.
        num_classes: Output size of the final linear layer.
    """

    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        # Channel count entering the next stage: starts at the stem's 128
        # output channels and is advanced by _make_layer.
        self.inplanes = 128
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # The file-level ``BatchNorm2d`` alias is a functools.partial, which
        # isinstance() rejects with TypeError; compare against the wrapped class.
        bn_type = BatchNorm2d.func if isinstance(BatchNorm2d, functools.partial) else BatchNorm2d
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Conv weights ~ N(0, sqrt(2 / n)), n = kernel_h * kernel_w * out_channels.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, bn_type):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks and advance ``self.inplanes``.

        A 1x1 convolution + normalisation shortcut is created for the first
        block whenever the stride is not 1 or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the classifier output for the input batch ``x``."""
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dim for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
def conv3x3(in_planes, out_planes, stride=1):
    """Build a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
class ResNeXt(nn.Module):
    """ResNeXt classifier with a three-convolution stem and grouped residual blocks.

    Args:
        block: Residual block class.  It must expose an ``expansion``
            attribute and accept ``(inplanes, planes, stride, groups, downsample)``.
        layers: Number of blocks in each of the four stages.
        groups: Group count forwarded to every block.
        num_classes: Output size of the final linear layer.
    """

    def __init__(self, block, layers, groups=32, num_classes=1000):
        super(ResNeXt, self).__init__()
        # Channel count entering the next stage: starts at the stem's 128
        # output channels and is advanced by _make_layer.
        self.inplanes = 128
        self.conv1 = conv3x3(3, 64, stride=2)
        self.bn1 = BatchNorm2d(64)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(64, 64)
        self.bn2 = BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = conv3x3(64, 128)
        self.bn3 = BatchNorm2d(128)
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(block, 128, layers[0], groups=groups)
        self.layer2 = self._make_layer(block, 256, layers[1], stride=2, groups=groups)
        self.layer3 = self._make_layer(block, 512, layers[2], stride=2, groups=groups)
        self.layer4 = self._make_layer(block, 1024, layers[3], stride=2, groups=groups)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(1024 * block.expansion, num_classes)

        # The file-level ``BatchNorm2d`` alias is a functools.partial, which
        # isinstance() rejects with TypeError; compare against the wrapped class.
        bn_type = BatchNorm2d.func if isinstance(BatchNorm2d, functools.partial) else BatchNorm2d
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Conv weights ~ N(0, sqrt(2 / n)), n = kernel_h * kernel_w * out_channels // groups.
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels // m.groups
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, bn_type):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, groups=1):
        """Build one stage of ``blocks`` grouped blocks and advance ``self.inplanes``.

        A 1x1 convolution + normalisation shortcut is created for the first
        block whenever the stride is not 1 or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, groups, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=groups))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the classifier output for the input batch ``x``."""
        x = self.relu1(self.bn1(self.conv1(x)))
        x = self.relu2(self.bn2(self.conv2(x)))
        x = self.relu3(self.bn3(self.conv3(x)))
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dim for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x
class ASPPModule(nn.Module):
    """
    Atrous spatial pyramid pooling: one global-pooling branch, one 1x1 branch
    and three dilated 3x3 branches, concatenated and fused by a 1x1 bottleneck.

    Reference:
        Chen, Liang-Chieh, et al. *"Rethinking Atrous Convolution for Semantic Image Segmentation."*
    """
    def __init__(self, features, out_features=512, inner_features=256, dilations=(12, 24, 36)):
        super(ASPPModule, self).__init__()

        def conv_branch(kernel_size, dilation):
            # A 3x3 branch pads by its dilation; the 1x1 branch needs no padding.
            padding = dilation if kernel_size == 3 else 0
            return nn.Sequential(
                nn.Conv2d(features, inner_features, kernel_size=kernel_size, padding=padding,
                          dilation=dilation, bias=False),
                InPlaceABNSync(inner_features))

        # Image-level branch: pool to 1x1, then project.
        self.conv1 = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(features, inner_features, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(inner_features))
        self.conv2 = conv_branch(1, 1)
        self.conv3 = conv_branch(3, dilations[0])
        self.conv4 = conv_branch(3, dilations[1])
        self.conv5 = conv_branch(3, dilations[2])

        self.bottleneck = nn.Sequential(
            nn.Conv2d(inner_features * 5, out_features, kernel_size=1, padding=0, dilation=1, bias=False),
            InPlaceABNSync(out_features),
            nn.Dropout2d(0.1)
        )

    def forward(self, x):
        """Return the bottleneck output over the five concatenated branches."""
        _, _, height, width = x.size()

        # The pooled branch is 1x1 spatially and is resized back to the input size.
        pooled = F.interpolate(self.conv1(x), size=(height, width), mode='bilinear', align_corners=True)
        branches = [pooled] + [conv(x) for conv in (self.conv2, self.conv3, self.conv4, self.conv5)]
        return self.bottleneck(torch.cat(branches, 1))
index 0000000000000000000000000000000000000000..ac43ebf489ee478c48acf3f93b01b32bdb08cdf3 --- /dev/null +++ b/Leffa/preprocess/humanparsing/networks/context_encoding/ocnet.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python +# -*- encoding: utf-8 -*- + +""" +@Author : Peike Li +@Contact : peike.li@yahoo.com +@File : ocnet.py +@Time : 8/4/19 3:36 PM +@Desc : +@License : This source code is licensed under the license found in the + LICENSE file in the root directory of this source tree. +""" + +import functools + +import torch +import torch.nn as nn +from torch.autograd import Variable +from torch.nn import functional as F + +from modules import InPlaceABNSync +BatchNorm2d = functools.partial(InPlaceABNSync, activation='none') + + +class _SelfAttentionBlock(nn.Module): + ''' + The basic implementation for self-attention block/non-local block + Input: + N X C X H X W + Parameters: + in_channels : the dimension of the input feature map + key_channels : the dimension after the key/query transform + value_channels : the dimension after the value transform + scale : choose the scale to downsample the input feature maps (save memory cost) + Return: + N X C X H X W + position-aware context features.(w/o concate or add with the input) + ''' + + def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1): + super(_SelfAttentionBlock, self).__init__() + self.scale = scale + self.in_channels = in_channels + self.out_channels = out_channels + self.key_channels = key_channels + self.value_channels = value_channels + if out_channels == None: + self.out_channels = in_channels + self.pool = nn.MaxPool2d(kernel_size=(scale, scale)) + self.f_key = nn.Sequential( + nn.Conv2d(in_channels=self.in_channels, out_channels=self.key_channels, + kernel_size=1, stride=1, padding=0), + InPlaceABNSync(self.key_channels), + ) + self.f_query = self.f_key + self.f_value = nn.Conv2d(in_channels=self.in_channels, out_channels=self.value_channels, + kernel_size=1, stride=1, 
padding=0) + self.W = nn.Conv2d(in_channels=self.value_channels, out_channels=self.out_channels, + kernel_size=1, stride=1, padding=0) + nn.init.constant(self.W.weight, 0) + nn.init.constant(self.W.bias, 0) + + def forward(self, x): + batch_size, h, w = x.size(0), x.size(2), x.size(3) + if self.scale > 1: + x = self.pool(x) + + value = self.f_value(x).view(batch_size, self.value_channels, -1) + value = value.permute(0, 2, 1) + query = self.f_query(x).view(batch_size, self.key_channels, -1) + query = query.permute(0, 2, 1) + key = self.f_key(x).view(batch_size, self.key_channels, -1) + + sim_map = torch.matmul(query, key) + sim_map = (self.key_channels ** -.5) * sim_map + sim_map = F.softmax(sim_map, dim=-1) + + context = torch.matmul(sim_map, value) + context = context.permute(0, 2, 1).contiguous() + context = context.view(batch_size, self.value_channels, *x.size()[2:]) + context = self.W(context) + if self.scale > 1: + context = F.upsample(input=context, size=(h, w), mode='bilinear', align_corners=True) + return context + + +class SelfAttentionBlock2D(_SelfAttentionBlock): + def __init__(self, in_channels, key_channels, value_channels, out_channels=None, scale=1): + super(SelfAttentionBlock2D, self).__init__(in_channels, + key_channels, + value_channels, + out_channels, + scale) + + +class BaseOC_Module(nn.Module): + """ + Implementation of the BaseOC module + Parameters: + in_features / out_features: the channels of the input / output feature maps. + dropout: we choose 0.05 as the default value. + size: you can apply multiple sizes. Here we only use one size. + Return: + features fused with Object context information. 
class BaseOC_Context_Module(nn.Module):
    """
    Output only the context features.
    Parameters:
        in_channels / out_channels: the channels of the input / output feature maps.
        key_channels / value_channels: forwarded to every self-attention stage.
        dropout: accepted for signature compatibility; this module does not use it.
        sizes: one self-attention stage is built per entry, the entry being
            passed to the stage as its last argument (its scale).
    Return:
        the summed stage outputs after a 1x1 convolution and InPlaceABNSync.
    """

    # The default is a tuple rather than a list so no mutable object is shared
    # between calls; any iterable of sizes (including a list) is still accepted.
    def __init__(self, in_channels, out_channels, key_channels, value_channels, dropout, sizes=(1,)):
        super(BaseOC_Context_Module, self).__init__()
        self.stages = nn.ModuleList(
            [self._make_stage(in_channels, out_channels, key_channels, value_channels, size) for size in sizes])
        # NOTE(review): the stages are built with out_channels as their output
        # width while this convolution takes in_channels inputs, so the two
        # presumably have to be equal - confirm against callers.
        self.conv_bn_dropout = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0),
            InPlaceABNSync(out_channels),
        )

    def _make_stage(self, in_channels, output_channels, key_channels, value_channels, size):
        """Build one self-attention stage for the given ``size``."""
        return SelfAttentionBlock2D(in_channels,
                                    key_channels,
                                    value_channels,
                                    output_channels,
                                    size)

    def forward(self, feats):
        """Sum the outputs of all stages and project the result."""
        priors = [stage(feats) for stage in self.stages]
        # The remaining stage outputs are accumulated in place into the first one.
        context = priors[0]
        for i in range(1, len(priors)):
            context += priors[i]
        output = self.conv_bn_dropout(context)
        return output
class PSPModule(nn.Module):
    """
    Pyramid pooling module: the input is average-pooled to several grid
    sizes, each pooled map is projected, resized back and concatenated with
    the input, and a 3x3 bottleneck fuses the result.

    Reference:
        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
    """
    def __init__(self, features, out_features=512, sizes=(1, 2, 3, 6)):
        super(PSPModule, self).__init__()

        branches = [self._make_stage(features, out_features, bin_size) for bin_size in sizes]
        self.stages = nn.ModuleList(branches)
        # The bottleneck sees the input channels plus one projected map per pooling size.
        fused_channels = features + len(sizes) * out_features
        self.bottleneck = nn.Sequential(
            nn.Conv2d(fused_channels, out_features, kernel_size=3, padding=1, dilation=1,
                      bias=False),
            InPlaceABNSync(out_features),
        )

    def _make_stage(self, features, out_features, size):
        """Build one branch: adaptive average pool to ``size`` x ``size``, 1x1 conv, norm."""
        return nn.Sequential(
            nn.AdaptiveAvgPool2d(output_size=(size, size)),
            nn.Conv2d(features, out_features, kernel_size=1, bias=False),
            InPlaceABNSync(out_features),
        )

    def forward(self, feats):
        """Return the bottleneck output over the resized branches and the input."""
        height, width = feats.size(2), feats.size(3)
        pyramid = []
        for stage in self.stages:
            pyramid.append(
                F.interpolate(input=stage(feats), size=(height, width), mode='bilinear', align_corners=True))
        # The input itself is concatenated after all pooled branches.
        pyramid.append(feats)
        fused = torch.cat(pyramid, 1)
        return self.bottleneck(fused)