Add files using upload-large-folder tool
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- .gitattributes +7 -0
- Groma/mmdet/models/backbones/__pycache__/csp_darknet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/darknet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/detectors_resnet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/detectors_resnext.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/hourglass.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/hrnet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/mobilenet_v2.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/regnet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/res2net.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/resnest.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/resnet.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/resnext.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/ssd_vgg.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/swin.cpython-39.pyc +0 -0
- Groma/mmdet/models/backbones/__pycache__/trident_resnet.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/__init__.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/accuracy.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/ae_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/balanced_l1_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/cross_entropy_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/dice_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/focal_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/gaussian_focal_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/gfocal_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/ghm_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/iou_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/kd_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/mse_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/pisa_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/seesaw_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/smooth_l1_loss.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/utils.cpython-39.pyc +0 -0
- Groma/mmdet/models/losses/__pycache__/varifocal_loss.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/collect_env.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/contextmanagers.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/logger.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/misc.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/setup_env.cpython-39.pyc +0 -0
- Groma/mmdet/utils/__pycache__/util_mixins.cpython-39.pyc +0 -0
- OpenSeeD/datasets/__init__.py +2 -0
- OpenSeeD/datasets/build.py +638 -0
- OpenSeeD/datasets/dataset_mappers/__init__.py +14 -0
- OpenSeeD/datasets/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py +191 -0
- OpenSeeD/datasets/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py +166 -0
- OpenSeeD/datasets/dataset_mappers/imagenet_dataset_mapper.py +95 -0
- OpenSeeD/datasets/dataset_mappers/lvis_dataset_mapper.py +170 -0
- OpenSeeD/datasets/dataset_mappers/mask_former_instance_dataset_mapper.py +184 -0
- OpenSeeD/datasets/dataset_mappers/mask_former_panoptic_dataset_mapper.py +168 -0
.gitattributes
CHANGED
@@ -584,3 +584,10 @@ Groma/mmcv/docs/en/_static/flow_warp.png filter=lfs diff=lfs merge=lfs -text
 Groma/mmcv/docs/en/_static/flow_raw_images.png filter=lfs diff=lfs merge=lfs -text
 Groma/mmcv/docs/en/_static/zhihu_qrcode.jpg filter=lfs diff=lfs merge=lfs -text
 Groma/mmcv/docs/en/_static/community/3.png filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/figs/results1.jpg filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/figs/framework.jpg filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/figs/cover.jpg filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/figs/results2.jpg filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/figs/intro.jpg filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/images/animals.png filter=lfs diff=lfs merge=lfs -text
+OpenSeeD/images/street.jpg filter=lfs diff=lfs merge=lfs -text
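Each added line routes the matching path through Git LFS instead of plain Git storage. The same attribute syntax also accepts glob patterns, so a directory-wide rule (a sketch, not part of this commit) could cover these files in two lines:

    OpenSeeD/figs/*.jpg filter=lfs diff=lfs merge=lfs -text
    OpenSeeD/images/* filter=lfs diff=lfs merge=lfs -text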
Groma/mmdet/models/backbones/__pycache__/csp_darknet.cpython-39.pyc
ADDED (binary file, 9.09 kB)

Groma/mmdet/models/backbones/__pycache__/darknet.cpython-39.pyc
ADDED (binary file, 7.27 kB)

Groma/mmdet/models/backbones/__pycache__/detectors_resnet.cpython-39.pyc
ADDED (binary file, 9.53 kB)

Groma/mmdet/models/backbones/__pycache__/detectors_resnext.cpython-39.pyc
ADDED (binary file, 2.94 kB)

Groma/mmdet/models/backbones/__pycache__/hourglass.cpython-39.pyc
ADDED (binary file, 6.32 kB)

Groma/mmdet/models/backbones/__pycache__/hrnet.cpython-39.pyc
ADDED (binary file, 13.5 kB)

Groma/mmdet/models/backbones/__pycache__/mobilenet_v2.cpython-39.pyc
ADDED (binary file, 5.84 kB)

Groma/mmdet/models/backbones/__pycache__/regnet.cpython-39.pyc
ADDED (binary file, 11.1 kB)

Groma/mmdet/models/backbones/__pycache__/res2net.cpython-39.pyc
ADDED (binary file, 8.79 kB)

Groma/mmdet/models/backbones/__pycache__/resnest.cpython-39.pyc
ADDED (binary file, 8.9 kB)

Groma/mmdet/models/backbones/__pycache__/resnet.cpython-39.pyc
ADDED (binary file, 17.4 kB)

Groma/mmdet/models/backbones/__pycache__/resnext.cpython-39.pyc
ADDED (binary file, 4.72 kB)

Groma/mmdet/models/backbones/__pycache__/ssd_vgg.cpython-39.pyc
ADDED (binary file, 4.36 kB)

Groma/mmdet/models/backbones/__pycache__/swin.cpython-39.pyc
ADDED (binary file, 22.4 kB)

Groma/mmdet/models/backbones/__pycache__/trident_resnet.cpython-39.pyc
ADDED (binary file, 9.44 kB)

Groma/mmdet/models/losses/__pycache__/__init__.cpython-39.pyc
ADDED (binary file, 1.59 kB)

Groma/mmdet/models/losses/__pycache__/accuracy.cpython-39.pyc
ADDED (binary file, 3.22 kB)

Groma/mmdet/models/losses/__pycache__/ae_loss.cpython-39.pyc
ADDED (binary file, 3.6 kB)

Groma/mmdet/models/losses/__pycache__/balanced_l1_loss.cpython-39.pyc
ADDED (binary file, 4.1 kB)

Groma/mmdet/models/losses/__pycache__/cross_entropy_loss.cpython-39.pyc
ADDED (binary file, 7.61 kB)

Groma/mmdet/models/losses/__pycache__/dice_loss.cpython-39.pyc
ADDED (binary file, 4.86 kB)

Groma/mmdet/models/losses/__pycache__/focal_loss.cpython-39.pyc
ADDED (binary file, 7.34 kB)

Groma/mmdet/models/losses/__pycache__/gaussian_focal_loss.cpython-39.pyc
ADDED (binary file, 3.33 kB)

Groma/mmdet/models/losses/__pycache__/gfocal_loss.cpython-39.pyc
ADDED (binary file, 8.41 kB)

Groma/mmdet/models/losses/__pycache__/ghm_loss.cpython-39.pyc
ADDED (binary file, 6.33 kB)

Groma/mmdet/models/losses/__pycache__/iou_loss.cpython-39.pyc
ADDED (binary file, 12.5 kB)

Groma/mmdet/models/losses/__pycache__/kd_loss.cpython-39.pyc
ADDED (binary file, 2.91 kB)

Groma/mmdet/models/losses/__pycache__/mse_loss.cpython-39.pyc
ADDED (binary file, 2.13 kB)

Groma/mmdet/models/losses/__pycache__/pisa_loss.cpython-39.pyc
ADDED (binary file, 4.42 kB)

Groma/mmdet/models/losses/__pycache__/seesaw_loss.cpython-39.pyc
ADDED (binary file, 7.76 kB)

Groma/mmdet/models/losses/__pycache__/smooth_l1_loss.cpython-39.pyc
ADDED (binary file, 3.95 kB)

Groma/mmdet/models/losses/__pycache__/utils.cpython-39.pyc
ADDED (binary file, 2.76 kB)

Groma/mmdet/models/losses/__pycache__/varifocal_loss.cpython-39.pyc
ADDED (binary file, 4.77 kB)

Groma/mmdet/utils/__pycache__/__init__.cpython-39.pyc
ADDED (binary file, 358 Bytes)

Groma/mmdet/utils/__pycache__/collect_env.cpython-39.pyc
ADDED (binary file, 589 Bytes)

Groma/mmdet/utils/__pycache__/contextmanagers.cpython-39.pyc
ADDED (binary file, 3.55 kB)

Groma/mmdet/utils/__pycache__/logger.cpython-39.pyc
ADDED (binary file, 649 Bytes)

Groma/mmdet/utils/__pycache__/misc.cpython-39.pyc
ADDED (binary file, 1.17 kB)

Groma/mmdet/utils/__pycache__/setup_env.cpython-39.pyc
ADDED (binary file, 1.49 kB)

Groma/mmdet/utils/__pycache__/util_mixins.cpython-39.pyc
ADDED (binary file, 3.75 kB)
OpenSeeD/datasets/__init__.py
ADDED (new file, 2 lines)

from . import registration
from .build import *
OpenSeeD/datasets/build.py
ADDED (new file, 638 lines in the commit)

# Copyright (c) Facebook, Inc. and its affiliates.
import os
import itertools
import logging
import copy
from typing import Any, Callable, Dict, List, Optional, Set, Union

import torch
import torch.utils.data as torchdata

import detectron2.utils.comm as comm
from detectron2.data.build import (
    build_batch_data_loader,
    load_proposals_into_dataset,
    trivial_batch_collator,
)
from detectron2.data import MetadataCatalog
from detectron2.data.catalog import DatasetCatalog
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.samplers import InferenceSampler, TrainingSampler
from detectron2.evaluation import (
    CityscapesInstanceEvaluator,
    CityscapesSemSegEvaluator,
    COCOEvaluator,
    DatasetEvaluators,
    LVISEvaluator,
    verify_results,
)
from detectron2.utils.comm import get_world_size, is_main_process
from fvcore.common.config import CfgNode
from omegaconf import DictConfig, OmegaConf

from .dataset_mappers import (
    COCOInstanceNewBaselineDatasetMapper,
    COCOPanopticNewBaselineDatasetMapper,
    MaskFormerInstanceDatasetMapper,
    MaskFormerPanopticDatasetMapper,
    MaskFormerSemanticDatasetMapper,
    ImageNetDatasetMapper,
    VLPreDatasetMapper,
    SunRGBDSegDatasetMapper,
    ScanNetSegDatasetMapper,
    BDDSemDatasetMapper,
    ScanNetPanoDatasetMapper,
    RefCOCODatasetMapper,
    O365InstanceNewBaselineDatasetMapper,
)
from .evaluation import (
    InstanceSegEvaluator,
    SemSegEvaluator,
    COCOPanopticEvaluator,
)
from openseed.utils import configurable


class JointLoader(torchdata.IterableDataset):
    """Draws one batch from each wrapped loader per step, keyed by the dataset-name prefix."""

    def __init__(self, loaders, key_dataset):
        dataset_names = []
        for key, loader in loaders.items():
            name = "{}".format(key.split('_')[0])
            setattr(self, name, loader)
            dataset_names += [name]
        self.dataset_names = dataset_names
        self.key_dataset = key_dataset

    def __iter__(self):
        for batch in zip(*[getattr(self, name) for name in self.dataset_names]):
            yield {key: batch[i] for i, key in enumerate(self.dataset_names)}

    def __len__(self):
        return len(getattr(self, self.key_dataset))


def filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names):
    """
    Filter out images with no annotations or with only crowd annotations
    (i.e., images without non-crowd annotations).
    A common training-time preprocessing step on the COCO dataset.

    Args:
        dataset_dicts (list[dict]): annotations in Detectron2 Dataset format.

    Returns:
        list[dict]: the same format, but filtered.
    """
    num_before = len(dataset_dicts)

    def valid(anns):
        for ann in anns:
            if isinstance(ann, list):
                for instance in ann:
                    if instance.get("iscrowd", 0) == 0:
                        return True
            else:
                if ann.get("iscrowd", 0) == 0:
                    return True
        return False

    dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])]
    num_after = len(dataset_dicts)
    logger = logging.getLogger(__name__)
    logger.info(
        "Removed {} images with no usable annotations. {} images left.".format(
            num_before - num_after, num_after
        )
    )
    return dataset_dicts


def get_detection_dataset_dicts(
    dataset_names, filter_empty=True, proposal_files=None
):
    """
    Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation.

    Args:
        dataset_names (str or list[str]): a dataset name or a list of dataset names
        filter_empty (bool): whether to filter out images without instance annotations
        proposal_files (list[str]): if given, a list of object proposal files
            that match each dataset in `dataset_names`.

    Returns:
        list[dict]: a list of dicts following the standard dataset dict format.
    """
    if isinstance(dataset_names, str):
        dataset_names = [dataset_names]
    assert len(dataset_names)

    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
    for dataset_name, dicts in zip(dataset_names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    if proposal_files is not None:
        assert len(dataset_names) == len(proposal_files)
        # load precomputed proposals from proposal files
        dataset_dicts = [
            load_proposals_into_dataset(dataset_i_dicts, proposal_file)
            for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files)
        ]

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts, dataset_names)

    assert len(dataset_dicts), "No valid data found in {}.".format(",".join(dataset_names))
    return dataset_dicts


def _test_loader_from_config(cfg, dataset_name, mapper=None):
    """
    Uses the given `dataset_name` argument (instead of the names in cfg), because the
    standard practice is to evaluate each test set individually (not combining them).
    """
    if isinstance(dataset_name, str):
        dataset_name = [dataset_name]

    dataset = get_detection_dataset_dicts(
        dataset_name,
        filter_empty=False,
        proposal_files=None,
    )
    if mapper is None:
        if isinstance(cfg, DictConfig):
            cfg = OmegaConf.to_container(copy.deepcopy(cfg))
        mapper_cfg = CfgNode({'INPUT': cfg['INPUT'], 'MODEL': cfg['MODEL'], 'DATASETS': cfg['DATASETS']})
        mapper = DatasetMapper(mapper_cfg, False)
    assert cfg['TEST']['BATCH_SIZE_TOTAL'] % get_world_size() == 0, \
        "Total evaluation batch size is not divisible by the number of GPUs"
    batch_size = cfg['TEST']['BATCH_SIZE_TOTAL'] // get_world_size()

    return {
        "dataset": dataset,
        "mapper": mapper,
        "num_workers": cfg['DATALOADER']['NUM_WORKERS'],
        "sampler": InferenceSampler(len(dataset)),
        "batch_size": batch_size,
    }


@configurable(from_config=_test_loader_from_config)
def build_detection_test_loader(
    dataset: Union[List[Any], torchdata.Dataset],
    *,
    mapper: Callable[[Dict[str, Any]], Any],
    sampler: Optional[torchdata.Sampler] = None,
    batch_size: int = 1,
    num_workers: int = 0,
    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
) -> torchdata.DataLoader:
    """
    Similar to `build_detection_train_loader`, with default batch size = 1,
    and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
    to produce the exact set of all samples.

    Args:
        dataset: a list of dataset dicts,
            or a pytorch dataset (either map-style or iterable). They can be obtained
            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper: a callable which takes a sample (dict) from the dataset
            and returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
        sampler: a sampler that produces
            indices to be applied on ``dataset``. Defaults to :class:`InferenceSampler`,
            which splits the dataset across all workers. Sampler must be None
            if `dataset` is iterable.
        batch_size: the batch size of the data loader to be created.
            Defaults to 1 image per worker since this is the standard when reporting
            inference time in papers.
        num_workers: number of parallel data loading workers
        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
            Defaults to no collation, returning a list of data.

    Returns:
        DataLoader: a torch DataLoader that loads the given detection
        dataset, with test-time transformation and batching.

    Examples:
    ::
        data_loader = build_detection_test_loader(
            DatasetRegistry.get("my_test"),
            mapper=DatasetMapper(...))

        # or, instantiate with a CfgNode:
        data_loader = build_detection_test_loader(cfg, "my_test")
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if isinstance(dataset, torchdata.IterableDataset):
        assert sampler is None, "sampler must be None if dataset is IterableDataset"
    else:
        if sampler is None:
            sampler = InferenceSampler(len(dataset))
    return torchdata.DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        drop_last=False,
        num_workers=num_workers,
        collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
    )


def _train_loader_from_config(cfg, dataset_name, mapper, *, dataset=None, sampler=None):
    cfg_datasets = cfg['DATASETS']
    cfg_dataloader = cfg['DATALOADER']

    if dataset is None:
        dataset = get_detection_dataset_dicts(
            dataset_name,
            filter_empty=cfg_dataloader['FILTER_EMPTY_ANNOTATIONS'],
            proposal_files=cfg_datasets['PROPOSAL_FILES_TRAIN'] if cfg_dataloader['LOAD_PROPOSALS'] else None,
        )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is None:
        sampler_name = cfg_dataloader['SAMPLER_TRAIN']
        logger = logging.getLogger(__name__)
        logger.info("Using training sampler {}".format(sampler_name))
        sampler = TrainingSampler(len(dataset))

    return {
        "dataset": dataset,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg['TRAIN']['BATCH_SIZE_TOTAL'],
        "aspect_ratio_grouping": cfg_dataloader['ASPECT_RATIO_GROUPING'],
        "num_workers": cfg_dataloader['NUM_WORKERS'],
    }


@configurable(from_config=_train_loader_from_config)
def build_detection_train_loader(
    dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0
):
    """
    Build a dataloader for object detection with some default features.
    This interface is experimental.

    Args:
        dataset (list or torch.utils.data.Dataset): a list of dataset dicts,
            or a map-style pytorch dataset. They can be obtained by using
            :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
        mapper (callable): a callable which takes a sample (dict) from the dataset and
            returns the format to be consumed by the model.
            When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``.
        sampler (torch.utils.data.sampler.Sampler or None): a sampler that
            produces indices to be applied on ``dataset``.
            Defaults to :class:`TrainingSampler`, which coordinates a random shuffle
            sequence across all workers.
        total_batch_size (int): total batch size across all workers. Batching
            simply puts data into a list.
        aspect_ratio_grouping (bool): whether to group images with similar
            aspect ratios for efficiency. When enabled, it requires each
            element in the dataset to be a dict with keys "width" and "height".
        num_workers (int): number of parallel data loading workers

    Returns:
        torch.utils.data.DataLoader: a dataloader. Each output from it is a
        ``list[mapped_element]`` of length ``total_batch_size / num_GPUs``,
        where ``mapped_element`` is produced by the ``mapper``.
    """
    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)
    return build_batch_data_loader(
        dataset,
        sampler,
        total_batch_size,
        aspect_ratio_grouping=aspect_ratio_grouping,
        num_workers=num_workers,
    )


def get_config_from_name(cfg, dataset_name):
    # adjust the config according to the dataset
    # (note 'refcoco' is checked before 'coco' because the former contains the latter)
    if 'refcoco' in dataset_name:
        cfg.update(cfg['REF'])
        return cfg
    elif 'coco' in dataset_name:
        if 'COCO' in cfg.keys():
            cfg.update(cfg['COCO'])
        return cfg
    elif 'ade' in dataset_name:
        if 'ADE20K' in cfg.keys():
            cfg.update(cfg['ADE20K'])
        return cfg
    elif 'imagenet' in dataset_name:
        if 'IMAGENET' in cfg.keys():
            cfg.update(cfg['IMAGENET'])
        return cfg
    elif 'vlp' in dataset_name:
        cfg.update(cfg['VLP'])
        return cfg
    elif 'sun' in dataset_name:
        cfg.update(cfg['SUN'])
        return cfg
    elif 'object365' in dataset_name:
        cfg.update(cfg['OBJECT365'])
        return cfg
    elif 'scan' in dataset_name:
        cfg.update(cfg['SCAN'])
        return cfg
    elif 'cityscape' in dataset_name:
        cfg.update(cfg['CITY'])
        return cfg
    elif 'bdd' in dataset_name:
        cfg.update(cfg['BDD'])
        return cfg
    else:
        raise ValueError("dataset {} not supported.".format(dataset_name))


def build_eval_dataloader(cfg):
    dataloaders = []
    cfg = copy.deepcopy(cfg)
    for dataset_name in cfg['DATASETS']['TEST']:
        cfg = get_config_from_name(cfg, dataset_name)
        # adjust the mapper according to the dataset
        if dataset_name == 'imagenet_val':
            mapper = ImageNetDatasetMapper(cfg, False)
        elif dataset_name == 'bdd10k_val_sem_seg':
            mapper = BDDSemDatasetMapper(cfg, False)
        elif dataset_name in ["vlp_val", "vlp_captioning_val", "vlp_val2017", "vlp_captioning_val2017"]:
            mapper = VLPreDatasetMapper(cfg, False, dataset_name)
        elif dataset_name in ["scannet_21_val_seg", "scannet_38_val_seg", "scannet_41_val_seg"]:
            mapper = ScanNetSegDatasetMapper(cfg, False)
        elif dataset_name in ["scannet_21_panoptic_val", 'bdd10k_40_panoptic_val']:
            mapper = ScanNetPanoDatasetMapper(cfg, False)
        elif 'sun' in dataset_name:
            mapper = SunRGBDSegDatasetMapper(cfg, False)
        elif 'refcoco' in dataset_name:
            mapper = RefCOCODatasetMapper(cfg, False)
        else:
            mapper = None
        dataloaders += [build_detection_test_loader(cfg, dataset_name, mapper=mapper)]
    return dataloaders


def build_train_dataloader(cfg):
    dataset_names = cfg['DATASETS']['TRAIN']

    loaders = {}
    cfg = copy.deepcopy(cfg)
    for dataset_name in dataset_names:
        cfg = get_config_from_name(cfg, dataset_name)
        mapper_name = cfg['INPUT']['DATASET_MAPPER_NAME']
        # Semantic segmentation dataset mapper
        if mapper_name == "mask_former_semantic":
            mapper = MaskFormerSemanticDatasetMapper(cfg, True)
            loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        # Panoptic segmentation dataset mapper
        elif mapper_name == "mask_former_panoptic":  # TODO: hack for ADE training; should key on the ADE name
            mapper = MaskFormerPanopticDatasetMapper(cfg, True)
            loaders['ade'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        # Instance segmentation dataset mapper
        elif mapper_name == "mask_former_instance":
            mapper = MaskFormerInstanceDatasetMapper(cfg, True)
            loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        # COCO instance segmentation, LSJ new baseline
        elif mapper_name == "coco_instance_lsj":
            mapper = COCOInstanceNewBaselineDatasetMapper(cfg, True)
            loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        # COCO panoptic segmentation, LSJ new baseline
        elif mapper_name == "coco_panoptic_lsj":
            mapper = COCOPanopticNewBaselineDatasetMapper(cfg, True)
            loaders['coco'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name == "object365":
            mapper = O365InstanceNewBaselineDatasetMapper(cfg, True)  # use the LSJ instance mapper for O365
            loaders['o365'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name == "vlpretrain":
            mapper = VLPreDatasetMapper(cfg, True, dataset_name)
            loaders['vlp'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        elif mapper_name == "refcoco":
            mapper = RefCOCODatasetMapper(cfg, True)
            loaders['ref'] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)
        else:
            mapper = None
            loaders[dataset_name] = build_detection_train_loader(cfg, dataset_name=dataset_name, mapper=mapper)

    if len(loaders) == 1 and not cfg['LOADER'].get('JOINT', False):
        for k, v in loaders.items():
            print("number of iterations per epoch: ", v, len(loaders[k]))
        return list(loaders.values())[0]
    else:
        return JointLoader(loaders, key_dataset=cfg['LOADER'].get('KEY_DATASET', 'coco'))


def build_evaluator(cfg, dataset_name, output_folder=None):
    """
    Create evaluator(s) for a given dataset.
    This uses the special metadata "evaluator_type" associated with each
    builtin dataset. For your own dataset, you can simply create an
    evaluator manually in your script and do not have to worry about the
    hacky if-else logic here.
    """
    if output_folder is None:
        output_folder = os.path.join(cfg["OUTPUT_DIR"], "inference")
    evaluator_list = []
    evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

    # semantic segmentation
    if evaluator_type in ["sem_seg", "ade20k_panoptic_seg"]:
        evaluator_list.append(
            SemSegEvaluator(
                dataset_name,
                distributed=True,
                output_dir=output_folder,
            )
        )
    # instance segmentation
    if evaluator_type == "coco":
        evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))

    cfg_model_decoder_test = cfg["MODEL"]["DECODER"]["TEST"]
    # panoptic segmentation
    if evaluator_type in [
        "coco_panoptic_seg",
        "ade20k_panoptic_seg",
        "cityscapes_panoptic_seg",
        "mapillary_vistas_panoptic_seg",
        "scannet_panoptic_seg",
        "bdd_panoptic_pano",
    ]:
        if cfg_model_decoder_test["PANOPTIC_ON"]:
            evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder))
    # COCO
    if (evaluator_type == "coco_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]) or evaluator_type == "object365_od":
        evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))
    if (evaluator_type == "coco_panoptic_seg" and cfg_model_decoder_test["SEMANTIC_ON"]) or evaluator_type == "coco_sem_seg":
        evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder))
    # Mapillary Vistas
    if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]:
        evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
    if evaluator_type == "mapillary_vistas_panoptic_seg" and cfg_model_decoder_test["SEMANTIC_ON"]:
        evaluator_list.append(SemSegEvaluator(dataset_name, distributed=True, output_dir=output_folder))
    # Cityscapes
    if evaluator_type == "cityscapes_instance":
        assert (
            torch.cuda.device_count() > comm.get_rank()
        ), "CityscapesEvaluator currently does not work with multiple machines."
        return CityscapesInstanceEvaluator(dataset_name)
    if evaluator_type == "cityscapes_sem_seg":
        assert (
            torch.cuda.device_count() > comm.get_rank()
        ), "CityscapesEvaluator currently does not work with multiple machines."
        return CityscapesSemSegEvaluator(dataset_name)
    if evaluator_type == "cityscapes_panoptic_seg":
        if cfg_model_decoder_test["SEMANTIC_ON"]:
            assert (
                torch.cuda.device_count() > comm.get_rank()
            ), "CityscapesEvaluator currently does not work with multiple machines."
            evaluator_list.append(CityscapesSemSegEvaluator(dataset_name))
        if cfg_model_decoder_test["INSTANCE_ON"]:
            assert (
                torch.cuda.device_count() > comm.get_rank()
            ), "CityscapesEvaluator currently does not work with multiple machines."
            evaluator_list.append(CityscapesInstanceEvaluator(dataset_name))
    # ADE20K
    if evaluator_type == "ade20k_panoptic_seg" and cfg_model_decoder_test["INSTANCE_ON"]:
        evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
    # SEGINW
    if evaluator_type == "seginw" and cfg_model_decoder_test["INSTANCE_ON"]:
        evaluator_list.append(InstanceSegEvaluator(dataset_name, output_dir=output_folder))
    # LVIS
    if evaluator_type == "lvis":
        return LVISEvaluator(dataset_name, output_dir=output_folder)
    # NOTE: ClassificationEvaluator, RetrievalEvaluator, CaptioningEvaluator and
    # GroundingEvaluator are referenced below but not imported above; these
    # branches assume the surrounding project provides them.
    # Classification
    if evaluator_type == "classification":
        evaluator_list.append(ClassificationEvaluator(dataset_name, output_folder))
    # Retrieval
    if evaluator_type == "retrieval":
        evaluator_list.append(RetrievalEvaluator(dataset_name, output_folder, cfg['MODEL']['DECODER']['RETRIEVAL']['ENSEMBLE']))
    # Captioning
    if evaluator_type == "captioning":
        evaluator_list.append(CaptioningEvaluator(dataset_name, output_folder, MetadataCatalog.get(dataset_name).gt_json))
    # Grounding
    if evaluator_type in ["grounding_refcoco", "grounding_phrasecut"]:
        evaluator_list.append(GroundingEvaluator(dataset_name))

    if len(evaluator_list) == 0:
        raise NotImplementedError(
            "no Evaluator for the dataset {} with the type {}".format(
                dataset_name, evaluator_type
            )
        )
    elif len(evaluator_list) == 1:
        return evaluator_list[0]

    return DatasetEvaluators(evaluator_list)


def build_optimizer(cls, cfg, model):
    cfg_solver = cfg['SOLVER']
    weight_decay_norm = cfg_solver['WEIGHT_DECAY_NORM']
    weight_decay_embed = cfg_solver['WEIGHT_DECAY_EMBED']
    weight_decay_bias = cfg_solver.get('WEIGHT_DECAY_BIAS', 0.0)

    logger = logging.getLogger(__name__)

    defaults = {}
    defaults["lr"] = cfg_solver['BASE_LR']
    defaults["weight_decay"] = cfg_solver['WEIGHT_DECAY']

    norm_module_types = (
        torch.nn.BatchNorm1d,
        torch.nn.BatchNorm2d,
        torch.nn.BatchNorm3d,
        torch.nn.SyncBatchNorm,
        # NaiveSyncBatchNorm inherits from BatchNorm2d
        torch.nn.GroupNorm,
        torch.nn.InstanceNorm1d,
        torch.nn.InstanceNorm2d,
        torch.nn.InstanceNorm3d,
        torch.nn.LayerNorm,
        torch.nn.LocalResponseNorm,
    )

    lr_multiplier = cfg_solver['LR_MULTIPLIER']

    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    for module_name, module in model.named_modules():
        for module_param_name, value in module.named_parameters(recurse=False):
            if not value.requires_grad:
                continue
            # Avoid duplicating parameters
            if value in memo:
                continue
            memo.add(value)

            hyperparams = copy.copy(defaults)

            for key, lr_mul in lr_multiplier.items():
                if key in "{}.{}".format(module_name, module_param_name):
                    hyperparams["lr"] = hyperparams["lr"] * lr_mul
                    if is_main_process():
                        logger.info("Modified learning rate of {}: {}".format(
                            "{}.{}".format(module_name, module_param_name), lr_mul))

            if (
                "relative_position_bias_table" in module_param_name
                or "absolute_pos_embed" in module_param_name
            ):
                hyperparams["weight_decay"] = 0.0
            if isinstance(module, norm_module_types):
                hyperparams["weight_decay"] = weight_decay_norm
            if isinstance(module, torch.nn.Embedding):
                hyperparams["weight_decay"] = weight_decay_embed
            if "bias" in module_name:
                hyperparams["weight_decay"] = weight_decay_bias
            params.append({"params": [value], **hyperparams})

    def maybe_add_full_model_gradient_clipping(optim):
        # detectron2 doesn't have full-model gradient clipping yet
        clip_norm_val = cfg_solver['CLIP_GRADIENTS']['CLIP_VALUE']
        enable = (
            cfg_solver['CLIP_GRADIENTS']['ENABLED']
            and cfg_solver['CLIP_GRADIENTS']['CLIP_TYPE'] == "full_model"
            and clip_norm_val > 0.0
        )

        class FullModelGradientClippingOptimizer(optim):
            def step(self, closure=None):
                all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                super().step(closure=closure)

        return FullModelGradientClippingOptimizer if enable else optim

    optimizer_type = cfg_solver['OPTIMIZER']
    if optimizer_type == "SGD":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
            params, cfg_solver['BASE_LR'], momentum=cfg_solver['MOMENTUM']
        )
    elif optimizer_type == "ADAMW":
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
            params, cfg_solver['BASE_LR']
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    return optimizer
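For orientation, here is a minimal usage sketch of the train-loader entry point above. It is not part of the commit: the dataset name and every config value are placeholders, it assumes the OpenSeeD repo root is on sys.path and 'coco_2017_train' is already registered in Detectron2's DatasetCatalog, and a real config must also carry whatever extra keys the selected mapper reads.

    # Sketch only: all values are placeholder assumptions, not project defaults.
    from datasets.build import build_train_dataloader

    cfg = {
        'DATASETS': {'TRAIN': ['coco_2017_train'], 'TEST': []},
        'DATALOADER': {'NUM_WORKERS': 4, 'FILTER_EMPTY_ANNOTATIONS': True,
                       'LOAD_PROPOSALS': False, 'ASPECT_RATIO_GROUPING': True,
                       'SAMPLER_TRAIN': 'TrainingSampler'},
        'TRAIN': {'BATCH_SIZE_TOTAL': 16},   # split evenly across all GPUs
        'LOADER': {'JOINT': False, 'KEY_DATASET': 'coco'},
        'INPUT': {'DATASET_MAPPER_NAME': 'coco_instance_lsj', 'IMAGE_SIZE': 1024,
                  'MIN_SCALE': 0.1, 'MAX_SCALE': 2.0,
                  'RANDOM_FLIP': 'horizontal', 'FORMAT': 'RGB'},
        'COCO': {},   # per-dataset override block read by get_config_from_name
    }

    train_loader = build_train_dataloader(cfg)  # one loader here; a JointLoader
                                                # when several datasets are joined
    for batch in train_loader:                  # batch: list[dict], one per image
        break

With several TRAIN datasets and LOADER.JOINT enabled, the same call instead returns a JointLoader, whose iterator yields one dict per step containing a batch from every wrapped loader, and whose length follows LOADER.KEY_DATASET.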
OpenSeeD/datasets/dataset_mappers/__init__.py
ADDED (new file, 14 lines)

# Copyright (c) Facebook, Inc. and its affiliates.
from .coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
from .coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
from .mask_former_instance_dataset_mapper import MaskFormerInstanceDatasetMapper
from .mask_former_panoptic_dataset_mapper import MaskFormerPanopticDatasetMapper
from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
from .imagenet_dataset_mapper import ImageNetDatasetMapper
from .vlp_dataset_mapper import VLPreDatasetMapper
from .sunrgbd_dataset_mapper import SunRGBDSegDatasetMapper
from .scannet_dataset_mapper import ScanNetSegDatasetMapper
from .bdd_semseg_dataset_mapper import BDDSemDatasetMapper
from .scannet_pano_dataset_mapper import ScanNetPanoDatasetMapper
from .refcoco_dataset_mapper import RefCOCODatasetMapper
from .o365_instance_new_baseline_dataset_mapper import O365InstanceNewBaselineDatasetMapper
OpenSeeD/datasets/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
ADDED (new file, 191 lines in the commit)

# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging

import numpy as np
import torch

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances

from pycocotools import mask as coco_mask

from openseed.utils import configurable

__all__ = ["COCOInstanceNewBaselineDatasetMapper"]


def convert_coco_poly_to_mask(segmentations, height, width):
    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        mask = mask.any(dim=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, dim=0)
    else:
        masks = torch.zeros((0, height, width), dtype=torch.uint8)
    return masks


def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Currently it includes resizing and flipping.

    Returns:
        list[Augmentation]
    """
    assert is_train, "Only training augmentation is supported"
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']

    augmentation = []

    if cfg_input['RANDOM_FLIP'] != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=cfg_input['RANDOM_FLIP'] == "horizontal",
                vertical=cfg_input['RANDOM_FLIP'] == "vertical",
            )
        )

    augmentation.extend([
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ])

    return augmentation


# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    This dataset mapper applies the same transformations as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotations
    3. Finds and applies suitable cropping to the image and annotations
    4. Prepares the image and annotations as Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # Build a padding mask by feeding an all-ones "segmentation mask"
        # through the same transforms as the image.
        padding_mask = np.ones(image.shape[:2])

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        padding_mask = ~padding_mask.astype(bool)

        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Always keep the segmentation mask; only keypoints are dropped.
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)
            # After transforms such as cropping are applied, the bounding box may no longer
            # tightly bound the object. As an example, imagine a triangle object
            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
            # the intersection of the original bounding box and the cropping box.
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)
            # Generate masks from polygons
            h, w = instances.image_size
            if hasattr(instances, 'gt_masks'):
                gt_masks = instances.gt_masks
                gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks
            dataset_dict["instances"] = instances

        return dataset_dict
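A minimal sketch of this mapper in isolation, not part of the commit: the image path and annotation are placeholders, the config dict carries only the keys that build_transform_gen and from_config read, and the call pattern mirrors how build.py constructs the mapper.

    # Sketch only: '/path/to/image.jpg' and the annotation values are placeholders.
    from detectron2.structures import BoxMode
    from datasets.dataset_mappers import COCOInstanceNewBaselineDatasetMapper

    cfg = {'INPUT': {'IMAGE_SIZE': 1024, 'MIN_SCALE': 0.1, 'MAX_SCALE': 2.0,
                     'RANDOM_FLIP': 'horizontal', 'FORMAT': 'RGB'}}
    mapper = COCOInstanceNewBaselineDatasetMapper(cfg, True)  # same call as build.py

    record = {
        'file_name': '/path/to/image.jpg',   # placeholder
        'height': 480, 'width': 640,
        'annotations': [{
            'bbox': [10.0, 10.0, 100.0, 100.0], 'bbox_mode': BoxMode.XYWH_ABS,
            'segmentation': [[10.0, 10.0, 110.0, 10.0, 110.0, 110.0, 10.0, 110.0]],
            'category_id': 0, 'iscrowd': 0,
        }],
    }
    out = mapper(record)   # adds 'image', 'padding_mask' and 'instances' tensors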
OpenSeeD/datasets/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
ADDED
@@ -0,0 +1,166 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py

import copy
import logging

import numpy as np
import torch

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Boxes, Instances

__all__ = ["COCOPanopticNewBaselineDatasetMapper"]


def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Now it includes resizing and flipping.
    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    image_size = cfg.INPUT.IMAGE_SIZE
    min_scale = cfg.INPUT.MIN_SCALE
    max_scale = cfg.INPUT.MAX_SCALE

    augmentation = []

    if cfg.INPUT.RANDOM_FLIP != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
                vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
            )
        )

    augmentation.extend([
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ])

    return augmentation


# This is specifically designed for the COCO dataset.
class COCOPanopticNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOPanopticNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
                str(self.tfm_gens)
            )
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]

            # apply the same transformation to panoptic segmentation
            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

            from panopticapi.utils import rgb2id

            pan_seg_gt = rgb2id(pan_seg_gt)

            instances = Instances(image_shape)
            classes = []
            masks = []
            for segment_info in segments_info:
                class_id = segment_info["category_id"]
                if not segment_info["iscrowd"]:
                    classes.append(class_id)
                    masks.append(pan_seg_gt == segment_info["id"])

            classes = np.array(classes)
            instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
            if len(masks) == 0:
                # Some image does not have annotation (all ignored)
                instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
                instances.gt_boxes = Boxes(torch.zeros((0, 4)))
            else:
                masks = BitMasks(
                    torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
                )
                instances.gt_masks = masks.tensor
                instances.gt_boxes = masks.get_bounding_boxes()

            dataset_dict["instances"] = instances

        return dataset_dict
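
A hedged usage sketch for the mapper above, showing how it might be wired into a detectron2 training loader. The INPUT.IMAGE_SIZE / MIN_SCALE / MAX_SCALE keys are project-specific config additions assumed here (they are not in the base detectron2 config), and the dataset name is only an example:

from detectron2.config import get_cfg
from detectron2.data import build_detection_train_loader

cfg = get_cfg()
# Project-specific keys consumed by build_transform_gen (assumed values).
cfg.INPUT.IMAGE_SIZE = 1024
cfg.INPUT.MIN_SCALE = 0.1
cfg.INPUT.MAX_SCALE = 2.0
cfg.INPUT.RANDOM_FLIP = "horizontal"
cfg.DATASETS.TRAIN = ("coco_2017_train_panoptic",)

# @configurable lets the class be constructed directly from the cfg.
mapper = COCOPanopticNewBaselineDatasetMapper(cfg, is_train=True)
train_loader = build_detection_train_loader(cfg, mapper=mapper)
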
OpenSeeD/datasets/dataset_mappers/imagenet_dataset_mapper.py
ADDED
@@ -0,0 +1,95 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Modified by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
from PIL import Image
# import logging

import cv2
import numpy as np

import torch
from torchvision import transforms

from openseed.utils import configurable

__all__ = ["ImageNetDatasetMapper"]


# This mapper is specifically designed for the ImageNet dataset.
class ImageNetDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Resizes and center-crops the image (at inference time)
    3. Prepares the image as a channels-first Tensor
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        size_train=None,
        size_test=None,
        size_crop=None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            size_train: target size for training
            size_test: center-crop size used at inference
            size_crop: resize target applied before the center crop
        """
        self.is_train = is_train
        self.size_train = size_train
        self.size_test = size_test
        self.size_crop = size_crop

        t = []
        t.append(transforms.Resize(size_crop, interpolation=Image.BICUBIC))
        t.append(transforms.CenterCrop(size_test))
        self.transform = transforms.Compose(t)

    @classmethod
    def from_config(cls, cfg, is_train=True):
        ret = {
            "is_train": is_train,
            "size_train": cfg['INPUT']['SIZE_TRAIN'],
            "size_test": cfg['INPUT']['SIZE_TEST'],
            "size_crop": cfg['INPUT']['SIZE_CROP'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        file_name = dataset_dict['file_name']
        image = Image.open(file_name).convert('RGB')

        if not self.is_train:
            image = self.transform(image)
        image = torch.from_numpy(np.asarray(image).copy())
        image = image.permute(2, 0, 1)

        dataset_dict['image'] = image
        dataset_dict['height'] = image.shape[1]
        dataset_dict['width'] = image.shape[2]
        return dataset_dict
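
A hedged sketch of calling this mapper on a single sample at inference time. The file path is a placeholder, and openseed's @configurable is assumed to behave like detectron2's, i.e. plain keyword arguments bypass from_config:

mapper = ImageNetDatasetMapper(
    is_train=False,
    size_train=224,
    size_test=224,   # center-crop size
    size_crop=256,   # short side is resized to this before the crop
)
sample = {"file_name": "path/to/image.jpg"}
out = mapper(sample)
print(out["image"].shape)  # torch.Size([3, 224, 224]), uint8, channels-first
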
OpenSeeD/datasets/dataset_mappers/lvis_dataset_mapper.py
ADDED
@@ -0,0 +1,170 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import random

import scipy.io
import numpy as np
import torch
from PIL import Image

from torchvision import transforms

from pycocotools import mask
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data import MetadataCatalog

from ...Networks.Mask2Former.utils import configurable

__all__ = ["LVISDatasetMapper"]

def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Now it includes resizing and flipping.
    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    cfg_input = cfg['INPUT']
    image_size = cfg_input['IMAGE_SIZE']
    min_scale = cfg_input['MIN_SCALE']
    max_scale = cfg_input['MAX_SCALE']

    augmentation = []

    if cfg_input['RANDOM_FLIP'] != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=cfg_input['RANDOM_FLIP'] == "horizontal",
                vertical=cfg_input['RANDOM_FLIP'] == "vertical",
            )
        )

    augmentation.extend([
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ])

    return augmentation


# This is specifically designed for the LVIS dataset.
class LVISDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer.

    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        tfm_gens=None,
        image_format=None,
        min_size_test=None,
        max_size_test=None,
        mean=None,
        std=None,
        max_len=None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: data augmentation
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        self.img_format = image_format
        self.is_train = is_train
        self.min_size_test = min_size_test
        self.max_size_test = max_size_test
        self.pixel_mean = torch.tensor(mean)[:, None, None]
        self.pixel_std = torch.tensor(std)[:, None, None]
        self.max_grounding_num = max_len

        t = []
        t.append(transforms.Resize(self.min_size_test, interpolation=Image.BICUBIC))
        self.transform = transforms.Compose(t)
        self.categories = torch.load(MetadataCatalog.get('logistic').get('cat_root'))

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        if is_train:
            tfm_gens = build_transform_gen(cfg, is_train)
        else:
            tfm_gens = None

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg['INPUT']['FORMAT'],
            "min_size_test": cfg['INPUT']['MIN_SIZE_TEST'],
            "max_size_test": cfg['INPUT']['MAX_SIZE_TEST'],
            "mean": cfg['INPUT']['PIXEL_MEAN'],
            "std": cfg['INPUT']['PIXEL_STD'],
            "max_len": cfg['MODEL']['DECODER']['GROUNDING']['MAX_LEN'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        file_name = dataset_dict['file_name']
        assert self.is_train, "Only support training."

        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)
        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        image_shape = image.shape[:2]  # h, w
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))

        assert len(dataset_dict['instance']) > 0
        masks_grd = []
        texts_grd = []
        boxes_grd = []  # note: never filled below, so the resulting tensor stays empty
        hash_grd = []
        for inst, label in zip(dataset_dict['instance'], dataset_dict['labels']):
            rle = mask.frPyObjects(inst, dataset_dict['height'], dataset_dict['width'])
            m = mask.decode(rle)
            # sometimes there are multiple binary maps (corresponding to multiple segs)
            m = np.sum(m, axis=2)
            m = m.astype(np.uint8)  # convert to np.uint8
            m = transforms.apply_segmentation(m[:, :, None])[:, :, 0]
            masks_grd += [m]
            label_names = self.categories[label]
            rand_id = random.randint(0, len(label_names) - 1)
            texts_grd.append(label_names[rand_id].lower())
            hash_grd.append(hash(label_names[rand_id].lower()))

        indices = torch.randperm(len(hash_grd))[:self.max_grounding_num]
        masks_grd = torch.from_numpy(np.stack(masks_grd))[indices]
        boxes_grd = torch.tensor(boxes_grd)
        texts_grd = np.array(texts_grd)[indices.numpy()].tolist()
        hash_grd = np.array(hash_grd)[indices.numpy()].tolist()
        groundings = {'masks': masks_grd, 'texts': texts_grd, 'hash': hash_grd, 'mode': 'text'}
        dataset_dict["groundings"] = groundings
        return dataset_dict
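
A hedged mini-example of the polygon-to-binary-mask step used in the loop above; the triangle coordinates and the 4x4 canvas are made up purely for illustration:

import numpy as np
from pycocotools import mask

h, w = 4, 4
inst = [[0.0, 0.0, 4.0, 0.0, 4.0, 4.0]]  # one polygon, as a flat x,y list
rle = mask.frPyObjects(inst, h, w)       # one RLE per polygon component
m = mask.decode(rle)                     # (4, 4, 1) uint8
m = np.sum(m, axis=2).astype(np.uint8)   # union of components, as in the mapper
print(m)                                 # binary mask of the triangle
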
OpenSeeD/datasets/dataset_mappers/mask_former_instance_dataset_mapper.py
ADDED
@@ -0,0 +1,184 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging

import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances, polygons_to_bitmask

from openseed.utils import configurable

__all__ = ["MaskFormerInstanceDatasetMapper"]


class MaskFormerInstanceDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for instance segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            size_divisibility: pad image size to be divisible by this value
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        cfg_input = cfg['INPUT']
        augs = [
            T.ResizeShortestEdge(
                cfg_input['MIN_SIZE_TRAIN'],
                cfg_input['MAX_SIZE_TRAIN'],
                cfg_input['MIN_SIZE_TRAIN_SAMPLING'],
            )
        ]

        cfg_input_crop = cfg_input['CROP']
        if cfg_input_crop['ENABLED']:
            augs.append(
                T.RandomCrop(
                    cfg_input_crop['TYPE'],
                    cfg_input_crop['SIZE'],
                )
            )
        if cfg_input['COLOR_AUG_SSD']:
            augs.append(ColorAugSSDTransform(img_format=cfg_input['FORMAT']))
        augs.append(T.RandomFlip())

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg_input['FORMAT'],
            "size_divisibility": cfg_input['SIZE_DIVISIBILITY'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image

        # transform instance masks
        assert "annotations" in dataset_dict
        for anno in dataset_dict["annotations"]:
            anno.pop("keypoints", None)

        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if len(annos):
            assert "segmentation" in annos[0]
        segms = [obj["segmentation"] for obj in annos]
        masks = []
        for segm in segms:
            if isinstance(segm, list):
                # polygon
                masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
            elif isinstance(segm, dict):
                # COCO RLE
                masks.append(mask_util.decode(segm))
            elif isinstance(segm, np.ndarray):
                assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                    segm.ndim
                )
                # mask array
                masks.append(segm)
            else:
                raise ValueError(
                    "Cannot convert segmentation of type '{}' to BitMasks!"
                    "Supported types are: polygons as list[list[float] or ndarray],"
                    " COCO-style RLE as a dict, or a binary segmentation mask "
                    " in a 2D numpy array of shape HxW.".format(type(segm))
                )

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]

        classes = [int(obj["category_id"]) for obj in annos]
        classes = torch.tensor(classes, dtype=torch.int64)

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            # pad image
            image = F.pad(image, padding_size, value=128).contiguous()
            # pad mask
            masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        # Prepare per-category binary masks
        instances = Instances(image_shape)
        instances.gt_classes = classes
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
        else:
            masks = BitMasks(torch.stack(masks))
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
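
One detail worth noting in the padding branch above: padding_size targets the fixed size size_divisibility itself, so it implicitly assumes the augmented image is no larger than that value. If true pad-to-nearest-multiple behavior were wanted instead, a hedged sketch (the helper name is ours, not the project's):

import torch
from torch.nn import functional as F

def pad_to_multiple(image: torch.Tensor, divisor: int) -> torch.Tensor:
    h, w = image.shape[-2], image.shape[-1]
    pad_h = (divisor - h % divisor) % divisor
    pad_w = (divisor - w % divisor) % divisor
    # F.pad takes (left, right, top, bottom) for the last two dimensions
    return F.pad(image, [0, pad_w, 0, pad_h], value=128)
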
OpenSeeD/datasets/dataset_mappers/mask_former_panoptic_dataset_mapper.py
ADDED
@@ -0,0 +1,168 @@
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging

import numpy as np
import torch
from torch.nn import functional as F

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.structures import BitMasks, Instances

from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper
from openseed.utils import configurable


__all__ = ["MaskFormerPanopticDatasetMapper"]


class MaskFormerPanopticDatasetMapper(MaskFormerSemanticDatasetMapper):
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for panoptic segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares image and annotation to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        ignore_label,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            ignore_label: the label that is ignored during evaluation
            size_divisibility: pad image size to be divisible by this value
        """
        super().__init__(
            is_train,
            augmentations=augmentations,
            image_format=image_format,
            ignore_label=ignore_label,
            size_divisibility=size_divisibility,
        )

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerPanopticDatasetMapper should only be used for training!"

        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # semantic segmentation
        if "sem_seg_file_name" in dataset_dict:
            # PyTorch transformation not implemented for uint16, so converting it to double first
            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
        else:
            sem_seg_gt = None

        # panoptic segmentation
        if "pan_seg_file_name" in dataset_dict:
            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
            segments_info = dataset_dict["segments_info"]
        else:
            pan_seg_gt = None
            segments_info = None

        if pan_seg_gt is None:
            raise ValueError(
                "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
                    dataset_dict["file_name"]
                )
            )

        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image
        if sem_seg_gt is not None:
            sem_seg_gt = aug_input.sem_seg

        # apply the same transformation to panoptic segmentation
        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)

        from panopticapi.utils import rgb2id

        pan_seg_gt = rgb2id(pan_seg_gt)

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        if sem_seg_gt is not None:
            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
        pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            image = F.pad(image, padding_size, value=128).contiguous()
            if sem_seg_gt is not None:
                sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
            pan_seg_gt = F.pad(
                pan_seg_gt, padding_size, value=0
            ).contiguous()  # 0 is the VOID panoptic label

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image
        if sem_seg_gt is not None:
            dataset_dict["sem_seg"] = sem_seg_gt.long()

        if "annotations" in dataset_dict:
            raise ValueError("Panoptic segmentation dataset should not have 'annotations'.")

        # Prepare per-category binary masks
        pan_seg_gt = pan_seg_gt.numpy()
        instances = Instances(image_shape)
        classes = []
        masks = []
        for segment_info in segments_info:
            class_id = segment_info["category_id"]
            if not segment_info["iscrowd"]:
                classes.append(class_id)
                masks.append(pan_seg_gt == segment_info["id"])

        classes = np.array(classes)
        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
        if len(masks) == 0:
            # Some image does not have annotation (all ignored)
            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
        else:
            masks = BitMasks(
                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
            )
            instances.gt_masks = masks.tensor
            instances.gt_boxes = masks.get_bounding_boxes()

        dataset_dict["instances"] = instances

        return dataset_dict
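
For context on the rgb2id call used by both panoptic mappers: COCO panoptic PNGs encode each segment id across the RGB channels as id = R + 256*G + 256^2*B. A hedged illustration (panopticapi's own rgb2id computes the same thing; the sketch function name and sample pixel are ours):

import numpy as np

def rgb2id_sketch(color: np.ndarray) -> np.ndarray:
    color = color.astype(np.uint32)
    return color[..., 0] + 256 * color[..., 1] + 256 * 256 * color[..., 2]

pixel = np.array([[[21, 1, 0]]], dtype=np.uint8)  # R=21, G=1, B=0
print(rgb2id_sketch(pixel))                       # [[277]] -> segment id 277
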