| | import os |
| | import re |
| | import sys |
| | import os.path as osp |
| | import torch |
| | import random |
| | import logging |
| | import hashlib |
| | import warnings |
| | from tqdm import tqdm |
| | from datetime import datetime |
| | from itertools import product |
| | from tqdm.auto import tqdm as tq |
| | from typing import Any, List, Tuple, Union |
| | from torch_geometric.data import InMemoryDataset |
| | from torch_geometric.data.dataset import files_exist |
| | from torch_geometric.data.makedirs import makedirs |
| | from torch_geometric.data.dataset import _repr |
| | from torch_geometric.nn.pool.consecutive import consecutive_cluster |
| |
|
| | from src.data import NAG |
| | from src.transforms import Transform, NAGSelectByKey, NAGRemoveKeys, \ |
| | SampleXYTiling, SampleRecursiveMainXYAxisTiling |
| | from src.visualization import show |
| |
|
| | DIR = os.path.dirname(os.path.realpath(__file__)) |
| | log = logging.getLogger(__name__) |
| |
|
| |
|
| | __all__ = ['BaseDataset'] |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | class BaseDataset(InMemoryDataset): |
| | """Base class for datasets. |
| | |
| | Child classes must overwrite the following methods (see respective |
| | docstrings for more details): |
| | |
| | ``` |
| | MyDataset(BaseDataset): |
| | |
| | def class_names(self): |
| | pass |
| | |
| | def num_classes(self): |
| | pass |
| | |
| | def stuff_classes(self): |
| | pass |
| | |
| | def class_colors(self): |
| | # Optional: only if you want to customize your color palette |
| | # for visualization |
| | pass |
| | |
| | def all_base_cloud_ids(self): |
| | pass |
| | |
| | def download_dataset(self): |
| | pass |
| | |
| | def read_single_raw_cloud(self): |
| | pass |
| | |
| | def raw_file_structure(self): |
| | # Optional: only if your raw or processed file structure |
| | # differs from the default |
| | pass |
| | |
| | def id_to_relative_raw_path(self): |
| | # Optional: only if your raw or processed file structure |
| | # differs from the default |
| | pass |
| | |
| | def processed_to_raw_path(self): |
| | # Optional: only if your raw or processed file structure |
| | # differs from the default |
| | pass |
| | ``` |
| | |
| | |
| | Parameters |
| | ---------- |
| | root : `str` |
| | Root directory where the dataset should be saved. |
| | stage : {'train', 'val', 'test', 'trainval'} |
| | transform : `callable` |
| | transform function operating on data. |
| | pre_transform : `callable` |
| | pre_transform function operating on data. |
| | pre_filter : `callable` |
| | pre_filter function operating on data. |
| | on_device_transform: `callable` |
| | on_device_transform function operating on data, in the |
| | 'on_after_batch_transfer' hook. This is where GPU-based |
| | augmentations should be, as well as any Transform you do not |
| | want to run in CPU-based DataLoaders |
| | val_mixed_in_train: bool |
| | whether the 'val' stage data is saved in the same clouds as the |
| | 'train' stage. This may happen when the stage splits are |
| | performed inside the clouds. In this case, an |
| | `on_device_transform` will be automatically created to separate |
| | stage-specific data upon reading |
| | test_mixed_in_val: bool |
| | whether the 'test' stage data is saved in the same clouds as the |
| | 'val' stage. This may happen when the stage splits are |
| | performed inside the clouds. In this case, an |
| | `on_device_transform` will be automatically created to separate |
| | stage-specific data upon reading |
| | custom_hash: str |
| | A user-chosen hash to be used for the dataset data directory. |
| | This will bypass the default behavior where the pre_transforms |
| | are used to generate a hash. It can be used, for instance, when |
| | one wants to instantiate a dataset with already-processed data, |
| | without knowing the exact config that was used to generate it |
| | in_memory: bool |
| | If True, the processed dataset will be entirely loaded in RAM |
| | upon instantiation. This will accelerate training and inference |
| | but requires large memory. WARNING: __getitem__ directly |
| | returns the data in memory, so any modification to the returned |
| | object will affect the `in_memory_data` too. Be careful to clone |
| | the object before modifying it. Besides, the `transform` are |
| | pre-applied to the in_memory data |
| | point_save_keys: list[str] |
| | List of point (ie level-0) attribute keys to save to disk at |
| | the end of preprocessing. Leaving to `None` will save all |
| | attributes by default |
| | point_no_save_keys: list[str] |
| | List of point (ie level-0) attribute keys to NOT save to disk at |
| | the end of preprocessing |
| | point_load_keys: list[str] |
| | List of point (ie level-0) attribute keys to load when reading |
| | data from disk |
| | segment_save_keys: list[str] |
| | List of segment (ie level-1+) attribute keys to save to disk |
| | at the end of preprocessing. Leaving to `None` will save all |
| | attributes by default |
| | segment_no_save_keys: list[str] |
| | List of segment (ie level-1+) attribute keys to NOT save to disk |
| | at the end of preprocessing |
| | segment_load_keys: list[str] |
| | List of segment (ie level-1+) attribute keys to load when |
| | reading data from disk |
| | """ |
| |
|
| | def __init__( |
| | self, |
| | root: str, |
| | stage: str = 'train', |
| | transform: Transform = None, |
| | pre_transform: Transform = None, |
| | pre_filter: Transform = None, |
| | on_device_transform: Transform = None, |
| | save_y_to_csr: bool = True, |
| | save_pos_dtype: torch.dtype = torch.float, |
| | save_fp_dtype: torch.dtype = torch.half, |
| | xy_tiling: int = None, |
| | pc_tiling: int = None, |
| | val_mixed_in_train: bool = False, |
| | test_mixed_in_val: bool = False, |
| | custom_hash: str = None, |
| | in_memory: bool = False, |
| | point_save_keys: List[str] = None, |
| | point_no_save_keys: List[str] = None, |
| | point_load_keys: List[str] = None, |
| | segment_save_keys: List[str] = None, |
| | segment_no_save_keys: List[str] = None, |
| | segment_load_keys: List[str] = None, |
| | **kwargs): |
| |
|
| | assert stage in ['train', 'val', 'trainval', 'test'] |
| |
|
| | |
| | |
| | |
| | self._stage = stage |
| | self._save_y_to_csr = save_y_to_csr |
| | self._save_pos_dtype = save_pos_dtype |
| | self._save_fp_dtype = save_fp_dtype |
| | self._on_device_transform = on_device_transform |
| | self._val_mixed_in_train = val_mixed_in_train |
| | self._test_mixed_in_val = test_mixed_in_val |
| | self._custom_hash = custom_hash |
| | self._in_memory = in_memory |
| | self._point_save_keys = point_save_keys |
| | self._point_no_save_keys = point_no_save_keys |
| | self._point_load_keys = point_load_keys |
| | self._segment_save_keys = segment_save_keys |
| | self._segment_no_save_keys = segment_no_save_keys |
| | self._segment_load_keys = segment_load_keys |
| |
|
| | if in_memory: |
| | log.warning( |
| | "'in_memory' was set to True. This means the entire dataset " |
| | "will be held in RAM. While this allows training and inference " |
| | "speedups, this means that the `transform' will only be " |
| | "applied once, upon loading the dataset to RAM. Hence, if you " |
| | "need augmentations or any other stochastic operations to be " |
| | "applied on your batches, make sure you moved them all to " |
| | "'on_device_transform'.") |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | assert xy_tiling is None or pc_tiling is None, \ |
| | "Cannot apply both XY and PC tiling, please choose only one." |
| | if xy_tiling is None: |
| | self.xy_tiling = None |
| | elif isinstance(xy_tiling, int): |
| | self.xy_tiling = (xy_tiling, xy_tiling) if xy_tiling > 1 else None |
| | elif xy_tiling[0] > 1 or xy_tiling[1] > 1: |
| | self.xy_tiling = xy_tiling |
| | else: |
| | self.xy_tiling = None |
| | self.pc_tiling = pc_tiling if pc_tiling and pc_tiling >= 1 else None |
| |
|
| | |
| | |
| | |
| | self.check_cloud_ids() |
| |
|
| | |
| | root = osp.join(root, self.data_subdir_name) |
| | super().__init__(root, transform, pre_transform, pre_filter) |
| |
|
| | |
| | path = osp.join(self.processed_dir, "<stage>", self.pre_transform_hash) |
| | log.info(f'Dataset hash: "{self.pre_transform_hash}"') |
| | log.info(f'Preprocessed data can be found at: "{path}"') |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | if self.stage == 'train' and self.val_mixed_in_train: |
| | t = NAGSelectByKey(key='is_val', negation=True) |
| | elif self.stage == 'val' and self.val_mixed_in_train or self.test_mixed_in_val: |
| | t = NAGSelectByKey(key='is_val', negation=False) |
| | elif self.stage == 'test' and self.test_mixed_in_val: |
| | t = NAGSelectByKey(key='is_val', negation=True) |
| | else: |
| | t = NAGRemoveKeys(level='all', keys=['is_val'], strict=False) |
| |
|
| | |
| | |
| | if not any( |
| | isinstance(odt, NAGSelectByKey) and odt.key == 'is_val' |
| | for odt in self.on_device_transform.transforms): |
| | self._on_device_transform.transforms = \ |
| | [t] + self._on_device_transform.transforms |
| |
|
| | |
| | if self.in_memory: |
| | in_memory_data = [ |
| | NAG.load( |
| | self.processed_paths[i], |
| | keys_low=self.point_load_keys, |
| | keys=self.segment_load_keys) |
| | for i in range(len(self))] |
| | if self.transform is not None: |
| | in_memory_data = [self.transform(x) for x in in_memory_data] |
| | self._in_memory_data = in_memory_data |
| | else: |
| | self._in_memory_data = None |
| |
|
| | @property |
| | def class_names(self) -> List[str]: |
| | """List of string names for dataset classes. This list must be |
| | one-item larger than `self.num_classes`, with the last label |
| | corresponding to 'void', 'unlabelled', 'ignored' classes, |
| | indicated as `y=self.num_classes` in the dataset labels. |
| | """ |
| | raise NotImplementedError |
| |
|
| | @property |
| | def num_classes(self) -> int: |
| | """Number of classes in the dataset. Must be one-item smaller |
| | than `self.class_names`, to account for the last class name |
| | being used for 'void', 'unlabelled', 'ignored' classes, |
| | indicated as `y=self.num_classes` in the dataset labels. |
| | """ |
| | raise NotImplementedError |
| |
|
| | @property |
| | def stuff_classes(self) -> List[int]: |
| | """List of 'stuff' labels for INSTANCE and PANOPTIC |
| | SEGMENTATION (setting this is NOT REQUIRED FOR SEMANTIC |
| | SEGMENTATION alone). By definition, 'stuff' labels are labels in |
| | `[0, self.num_classes-1]` which are not 'thing' labels. |
| | |
| | In instance segmentation, 'stuff' classes are not taken into |
| | account in performance metrics computation. |
| | |
| | In panoptic segmentation, 'stuff' classes are taken into account |
| | in performance metrics computation. Besides, each cloud/scene |
| | can only have at most one instance of each 'stuff' class. |
| | |
| | IMPORTANT: |
| | By convention, we assume `y ∈ [0, self.num_classes-1]` ARE ALL |
| | VALID LABELS (i.e. not 'ignored', 'void', 'unknown', etc), while |
| | `y < 0` AND `y >= self.num_classes` ARE VOID LABELS. |
| | """ |
| | raise NotImplementedError |
| |
|
| | @property |
| | def thing_classes(self) -> List[int]: |
| | """List of 'thing' labels for instance and panoptic |
| | segmentation. By definition, 'thing' labels are labels in |
| | `[0, self.num_classes-1]` which are not 'stuff' labels. |
| | |
| | IMPORTANT: |
| | By convention, we assume `y ∈ [0, self.num_classes-1]` ARE ALL |
| | VALID LABELS (i.e. not 'ignored', 'void', 'unknown', etc), while |
| | `y < 0` AND `y >= self.num_classes` ARE VOID LABELS. |
| | """ |
| | return [i for i in range(self.num_classes) if i not in self.stuff_classes] |
| |
|
| | @property |
| | def void_classes(self) -> List[int]: |
| | """List containing the 'void' labels. By default, we group all |
| | void/ignored/unknown class labels into a single |
| | `[self.num_classes]` label for simplicity. |
| | |
| | IMPORTANT: |
| | By convention, we assume `y ∈ [0, self.num_classes-1]` ARE ALL |
| | VALID LABELS (i.e. not 'ignored', 'void', 'unknown', etc), while |
| | `y < 0` AND `y >= self.num_classes` ARE VOID LABELS. |
| | """ |
| | return [self.num_classes] |
| |
|
| | @property |
| | def class_colors(self) -> List[List[int]]: |
| | """Colors for visualization, if not None, must have the same |
| | length as `self.num_classes`. If None, the visualizer will use |
| | the label values in the data to generate random colors. |
| | """ |
| | return |
| |
|
| | def print_classes(self) -> None: |
| | """Show the class names, labels and type (thing, stuff, void). |
| | """ |
| | for i, c in enumerate(self.class_names): |
| | try: |
| | class_type = \ |
| | 'stuff' if i in self.stuff_classes \ |
| | else 'thing' if i in self.thing_classes \ |
| | else 'void' |
| | except: |
| | class_type = '' |
| | print(f"{i:<3} {c:<20} {class_type}") |
| |
|
| | @property |
| | def data_subdir_name(self) -> str: |
| | return self.__class__.__name__.lower() |
| |
|
| | @property |
| | def stage(self) -> str: |
| | """Dataset stage. Expected to be 'train', 'val', 'trainval', |
| | or 'test' |
| | """ |
| | return self._stage |
| | |
| | @property |
| | def save_y_to_csr(self) -> bool: |
| | return self._save_y_to_csr |
| |
|
| | @property |
| | def save_pos_dtype(self) -> bool: |
| | return self._save_pos_dtype |
| |
|
| | @property |
| | def save_fp_dtype(self) -> bool: |
| | return self._save_fp_dtype |
| |
|
| | @property |
| | def on_device_transform(self) -> Transform: |
| | return self._on_device_transform |
| |
|
| | @property |
| | def val_mixed_in_train(self) -> bool: |
| | return self._val_mixed_in_train |
| |
|
| | @property |
| | def test_mixed_in_val(self) -> bool: |
| | return self._test_mixed_in_val |
| |
|
| | @property |
| | def custom_hash(self) -> str: |
| | return self._custom_hash |
| |
|
| | @property |
| | def in_memory(self) -> bool: |
| | return self._in_memory |
| |
|
| | @property |
| | def point_save_keys(self) -> List[str]: |
| | return self._point_save_keys |
| |
|
| | @property |
| | def point_no_save_keys(self) -> List[str]: |
| | return self._point_no_save_keys |
| |
|
| | @property |
| | def point_load_keys(self) -> List[str]: |
| | return self._point_load_keys |
| |
|
| | @property |
| | def segment_save_keys(self) -> List[str]: |
| | return self._segment_save_keys |
| |
|
| | @property |
| | def segment_no_save_keys(self) -> List[str]: |
| | return self._segment_no_save_keys |
| |
|
| | @property |
| | def segment_load_keys(self) -> List[str]: |
| | return self._segment_load_keys |
| | |
| | @property |
| | def all_base_cloud_ids(self) -> List[str]: |
| | """Dictionary holding lists of clouds ids, for each |
| | stage. |
| | |
| | The following structure is expected: |
| | `{'train': [...], 'val': [...], 'test': [...]}` |
| | """ |
| | raise NotImplementedError |
| |
|
| | @property |
| | def all_cloud_ids(self) -> List[str]: |
| | """Dictionary holding lists of clouds ids, for each |
| | stage. Unlike all_base_cloud_ids, these ids take into account |
| | the clouds tiling, if any. |
| | """ |
| | |
| | |
| | if self.xy_tiling is not None: |
| | tx, ty = self.xy_tiling |
| | return { |
| | stage: [ |
| | f'{ci}__TILE_{x + 1}-{y + 1}_OF_{tx}-{ty}' |
| | for ci in ids |
| | for x, y in product(range(tx), range(ty))] |
| | for stage, ids in self.all_base_cloud_ids.items()} |
| |
|
| | if self.pc_tiling is not None: |
| | return { |
| | stage: [ |
| | f'{ci}__TILE_{x + 1}_OF_{2**self.pc_tiling}' |
| | for ci in ids |
| | for x in range(2**self.pc_tiling)] |
| | for stage, ids in self.all_base_cloud_ids.items()} |
| |
|
| | |
| | return self.all_base_cloud_ids |
| |
|
| | def id_to_base_id(self, id: str) -> str: |
| | """Given an ID, remove the tiling indications, if any. |
| | """ |
| | if self.xy_tiling is None and self.pc_tiling is None: |
| | return id |
| | return self.get_tile_from_path(id)[1] |
| |
|
| | @property |
| | def cloud_ids(self) -> List[str]: |
| | """IDs of the dataset clouds, based on its `stage`. |
| | """ |
| | if self.stage == 'trainval': |
| | ids = self.all_cloud_ids['train'] + self.all_cloud_ids['val'] |
| | else: |
| | ids = self.all_cloud_ids[self.stage] |
| | return sorted(list(set(ids))) |
| |
|
| | def check_cloud_ids(self) -> None: |
| | """Make sure the `all_cloud_ids` are valid. More specifically, |
| | the cloud ids must be unique across all stages, unless |
| | `val_mixed_in_train=True` or `test_mixed_in_val=True`, in |
| | which case some clouds may appear in several stages |
| | """ |
| | train = set(self.all_cloud_ids['train']) |
| | val = set(self.all_cloud_ids['val']) |
| | test = set(self.all_cloud_ids['test']) |
| |
|
| | assert len(train.intersection(val)) == 0 or self.val_mixed_in_train, \ |
| | "Cloud ids must be unique across all the 'train' and 'val' " \ |
| | "stages, unless `val_mixed_in_train=True`" |
| | assert len(val.intersection(test)) == 0 or self.test_mixed_in_val, \ |
| | "Cloud ids must be unique across all the 'val' and 'test' " \ |
| | "stages, unless `test_mixed_in_val=True`" |
| |
|
| | @property |
| | def raw_file_structure(self) -> str: |
| | """String to describe to the user the file structure of your |
| | dataset, at download time. |
| | """ |
| | return |
| |
|
| | @property |
| | def raw_file_names(self) -> str: |
| | """The file paths to find in order to skip the download.""" |
| | return self.raw_file_names_3d |
| |
|
| | @property |
| | def raw_file_names_3d(self) -> str: |
| | """Some file paths to find in order to skip the download. |
| | Those are not directly specified inside `self.raw_file_names` |
| | in case `self.raw_file_names` would need to be extended (e.g. |
| | with 3D bounding boxes files). |
| | """ |
| | return [self.id_to_relative_raw_path(x) for x in self.cloud_ids] |
| |
|
| | def id_to_relative_raw_path(self, id: str) -> str: |
| | """Given a cloud id as stored in `self.cloud_ids`, return the |
| | path (relative to `self.raw_dir`) of the corresponding raw |
| | cloud. |
| | """ |
| | return self.id_to_base_id(id) + '.ply' |
| |
|
| | @property |
| | def pre_transform_hash(self) -> str: |
| | """Produce a unique but stable hash based on the dataset's |
| | `pre_transform` attributes (as exposed by `_repr`). |
| | """ |
| | if self.custom_hash is not None: |
| | return self.custom_hash |
| | if self.pre_transform is None: |
| | return 'no_pre_transform' |
| | return hashlib.md5(_repr(self.pre_transform).encode()).hexdigest() |
| |
|
| | @property |
| | def processed_file_names(self) -> List[str]: |
| | """The name of the files to find in the `self.processed_dir` |
| | folder in order to skip the processing |
| | """ |
| | |
| | |
| | if self.stage == 'trainval' and self.val_mixed_in_train: |
| | return [ |
| | osp.join('train', self.pre_transform_hash, f'{w}.h5') |
| | for s in ('train', 'val') |
| | for w in self.all_cloud_ids[s]] |
| | if self.stage == 'trainval': |
| | return [ |
| | osp.join(s, self.pre_transform_hash, f'{w}.h5') |
| | for s in ('train', 'val') |
| | for w in self.all_cloud_ids[s]] |
| | return [ |
| | osp.join(self.stage, self.pre_transform_hash, f'{w}.h5') |
| | for w in self.cloud_ids] |
| |
|
| | def processed_to_raw_path(self, processed_path: str) -> str: |
| | """Given a processed cloud path from `self.processed_paths`, |
| | return the absolute path to the corresponding raw cloud. |
| | |
| | Overwrite this method if your raw data does not follow the |
| | default structure. |
| | """ |
| | |
| | stage, hash_dir, cloud_id = \ |
| | osp.splitext(processed_path)[0].split(os.sep)[-3:] |
| |
|
| | |
| | base_cloud_id = self.id_to_base_id(cloud_id) |
| |
|
| | |
| | raw_ext = osp.splitext(self.raw_file_names_3d[0])[1] |
| | raw_path = osp.join(self.raw_dir, base_cloud_id + raw_ext) |
| |
|
| | return raw_path |
| |
|
| | @property |
| | def in_memory_data(self) -> Any: |
| | """If the `self.in_memory`, this will return all processed data, |
| | loaded in memory. Returns None otherwise. |
| | """ |
| | return self._in_memory_data |
| |
|
| | @property |
| | def submission_dir(self) -> str: |
| | """Submissions are saved in the `submissions` folder, in the |
| | same hierarchy as `raw` and `processed` directories. Each |
| | submission has a subdirectory of its own, named based on the |
| | date and time of creation. |
| | """ |
| | submissions_dir = osp.join(self.root, "submissions") |
| | date = '-'.join([ |
| | f'{getattr(datetime.now(), x)}' |
| | for x in ['year', 'month', 'day']]) |
| | time = '-'.join([ |
| | f'{getattr(datetime.now(), x)}' |
| | for x in ['hour', 'minute', 'second']]) |
| | submission_name = f'{date}_{time}' |
| | path = osp.join(submissions_dir, submission_name) |
| | return path |
| |
|
| | def download(self) -> None: |
| | self.download_warning() |
| | self.download_dataset() |
| |
|
| | def download_dataset(self) -> None: |
| | """Download the dataset data. Modify this method to implement |
| | your own `BaseDataset` child class. |
| | """ |
| | raise NotImplementedError |
| |
|
| | def download_warning(self, interactive: bool = False) -> None: |
| | |
| | log.info( |
| | f"WARNING: You must download the raw data for the " |
| | f"{self.__class__.__name__} dataset.") |
| | if self.raw_file_structure is not None: |
| | log.info("Files must be organized in the following structure:") |
| | log.info(self.raw_file_structure) |
| | log.info("") |
| | if interactive: |
| | log.info("Press any key to continue, or CTRL-C to exit.") |
| | input("") |
| | log.info("") |
| |
|
| | def download_message(self, msg: str) -> None: |
| | log.info(f'Downloading "{msg}" to {self.raw_dir}...') |
| |
|
| | def _process(self) -> None: |
| | """Overwrites torch-geometric's Dataset._process. This simply |
| | removes the 'pre_transform.pt' file used for checking whether |
| | the pre-transforms have changed. This is possible thanks to our |
| | `pre_transform_hash` mechanism. |
| | """ |
| | f = osp.join(self.processed_dir, 'pre_filter.pt') |
| | if osp.exists(f) and torch.load(f) != _repr(self.pre_filter): |
| | warnings.warn( |
| | "The `pre_filter` argument differs from the one used in " |
| | "the pre-processed version of this dataset. If you want to " |
| | "make use of another pre-filtering technique, make sure to " |
| | "delete '{self.processed_dir}' first") |
| |
|
| | if files_exist(self.processed_paths): |
| | return |
| |
|
| | if self.log and 'pytest' not in sys.modules: |
| | print('Processing...', file=sys.stderr) |
| |
|
| | makedirs(self.processed_dir) |
| | self.process() |
| |
|
| | path = osp.join(self.processed_dir, 'pre_filter.pt') |
| | torch.save(_repr(self.pre_filter), path) |
| |
|
| | if self.log and 'pytest' not in sys.modules: |
| | print('Done!', file=sys.stderr) |
| |
|
| | def process(self) -> None: |
| | |
| | |
| | |
| | |
| | |
| | hash_dir = self.pre_transform_hash |
| | train_dir = osp.join(self.processed_dir, 'train', hash_dir) |
| | val_dir = osp.join(self.processed_dir, 'val', hash_dir) |
| | test_dir = osp.join(self.processed_dir, 'test', hash_dir) |
| | if not osp.exists(train_dir): |
| | os.makedirs(train_dir, exist_ok=True) |
| | if not osp.exists(val_dir): |
| | if self.val_mixed_in_train: |
| | os.makedirs(osp.dirname(val_dir), exist_ok=True) |
| | os.symlink(train_dir, val_dir, target_is_directory=True) |
| | else: |
| | os.makedirs(val_dir, exist_ok=True) |
| | if not osp.exists(test_dir): |
| | if self.test_mixed_in_val: |
| | os.makedirs(osp.dirname(test_dir), exist_ok=True) |
| | os.symlink(val_dir, test_dir, target_is_directory=True) |
| | else: |
| | os.makedirs(test_dir, exist_ok=True) |
| |
|
| | |
| | for p in tq(self.processed_paths): |
| | self._process_single_cloud(p) |
| |
|
| | def _process_single_cloud(self, cloud_path: str) -> None: |
| | """Internal method called by `self.process` to preprocess a |
| | single cloud of 3D points. |
| | """ |
| | |
| | if osp.exists(cloud_path): |
| | return |
| |
|
| | |
| | os.makedirs(osp.dirname(cloud_path), exist_ok=True) |
| |
|
| | |
| | |
| | raw_path = self.processed_to_raw_path(cloud_path) |
| | data = self.sanitized_read_single_raw_cloud(raw_path) |
| |
|
| | |
| | if self.xy_tiling is not None: |
| | tile = self.get_tile_from_path(cloud_path)[0] |
| | data = SampleXYTiling(x=tile[0], y=tile[1], tiling=tile[2])(data) |
| | elif self.pc_tiling is not None: |
| | tile = self.get_tile_from_path(cloud_path)[0] |
| | data = SampleRecursiveMainXYAxisTiling(x=tile[0], steps=tile[1])(data) |
| |
|
| | |
| | if self.pre_transform is not None: |
| | nag = self.pre_transform(data) |
| | else: |
| | nag = NAG([data]) |
| |
|
| | |
| | if self.point_save_keys is not None: |
| | keys = set(nag[0].keys) - set(self.point_save_keys) |
| | nag = NAGRemoveKeys(level=0, keys=keys)(nag) |
| | elif self.point_no_save_keys is not None: |
| | nag = NAGRemoveKeys(level=0, keys=self.point_no_save_keys)(nag) |
| | if self.segment_save_keys is not None: |
| | keys = set(nag[1].keys) - set(self.segment_save_keys) |
| | nag = NAGRemoveKeys(level='1+', keys=keys)(nag) |
| | elif self.segment_no_save_keys is not None: |
| | nag = NAGRemoveKeys(level='1+', keys=self.segment_no_save_keys)(nag) |
| |
|
| | |
| | |
| | |
| | |
| | |
| | nag.save( |
| | cloud_path, |
| | y_to_csr=self.save_y_to_csr, |
| | pos_dtype=self.save_pos_dtype, |
| | fp_dtype=self.save_fp_dtype) |
| | del nag |
| |
|
| | @staticmethod |
| | def get_tile_from_path(path: str) -> Tuple[Tuple, str, str]: |
| | |
| | out_reg = re.search('__TILE_(\d+)-(\d+)_OF_(\d+)-(\d+)', path) |
| | if out_reg is not None: |
| | x, y, x_tiling, y_tiling = [int(g) for g in out_reg.groups()] |
| | suffix = f'__TILE_{x}-{y}_OF_{x_tiling}-{y_tiling}' |
| | prefix = path.replace(suffix, '') |
| | return (x - 1, y - 1, (x_tiling, y_tiling)), prefix, suffix |
| |
|
| | |
| | out_reg = re.search('__TILE_(\d+)_OF_(\d+)', path) |
| | if out_reg is not None: |
| | x, num = [int(g) for g in out_reg.groups()] |
| | suffix = f'__TILE_{x}_OF_{num}' |
| | prefix = path.replace(suffix, '') |
| | steps = torch.log2(torch.tensor(num)).int().item() |
| | return (x - 1, steps), prefix, suffix |
| |
|
| | return |
| |
|
| | def read_single_raw_cloud(self, raw_cloud_path: str) -> 'Data': |
| | """Read a single raw cloud and return a `Data` object, ready to |
| | be passed to `self.pre_transform`. |
| | |
| | This `Data` object should contain the following attributes: |
| | - `pos`: point coordinates |
| | - `y`: OPTIONAL point semantic label |
| | - `obj`: OPTIONAL `InstanceData` object with instance labels |
| | - `rgb`: OPTIONAL point color |
| | - `intensity`: OPTIONAL point LiDAR intensity |
| | |
| | IMPORTANT: |
| | By convention, we assume `y ∈ [0, self.num_classes-1]` ARE ALL |
| | VALID LABELS (i.e. not 'ignored', 'void', 'unknown', etc), |
| | while `y < 0` AND `y >= self.num_classes` ARE VOID LABELS. |
| | This applies to both `Data.y` and `Data.obj.y`. |
| | """ |
| | raise NotImplementedError |
| |
|
| | def sanitized_read_single_raw_cloud(self, raw_cloud_path: str) -> 'Data': |
| | """Wrapper around the actual `self.read_single_raw_cloud`. This |
| | function ensures that the semantic and instance segmentation |
| | labels returned by the reader are sanitized. |
| | |
| | More specifically, we assume `[0, self.num_classes-1]` ARE ALL |
| | VALID LABELS (i.e. not 'ignored', 'void', 'unknown', etc), |
| | while `y < 0` AND `y >= self.num_classes` ARE VOID LABELS. |
| | |
| | To this end, this function maps all labels outside |
| | `[0, self.num_classes-1]` to `y = self.num_classes`. |
| | |
| | Hence, we actually have `self.num_classes + 1` labels in the |
| | data. This allows identifying the points to be ignored at metric |
| | computation time. |
| | |
| | Besides, this function ensures that there is at most 1 instance |
| | of each stuff (and void) class in each scene/cloud/tile, as |
| | described in: |
| | - https://arxiv.org/abs/1801.00868 |
| | - https://arxiv.org/abs/1905.01220 |
| | """ |
| | data = self.read_single_raw_cloud(raw_cloud_path) |
| |
|
| | |
| | |
| | if getattr(data, 'y', None) is not None: |
| | data.y[data.y < 0] = self.num_classes |
| | data.y[data.y > self.num_classes] = self.num_classes |
| |
|
| | |
| | |
| | if getattr(data, 'obj', None) is not None: |
| | data.obj.y[data.obj.y < 0] = self.num_classes |
| | data.obj.y[data.obj.y > self.num_classes] = self.num_classes |
| |
|
| | |
| | |
| | for i in self.stuff_classes + self.void_classes: |
| | idx = torch.where(data.obj.y == i)[0] |
| | if idx.numel() == 0: |
| | continue |
| | data.obj.obj[idx] = data.obj.obj[idx].min() |
| |
|
| | return data |
| |
|
| | def debug_instance_data(self, level: int = 1) -> None: |
| | """Sanity check to make sure at most 1 instance of each stuff |
| | class per scene/cloud. |
| | |
| | :param level: int |
| | NAG level which to inspect |
| | """ |
| | problematic_clouds = [] |
| | for i_cloud, nag in tqdm(enumerate(self)): |
| | _, perm = consecutive_cluster(nag[level].obj.obj) |
| | y = nag[level].obj.y[perm] |
| | y_count = torch.bincount(y, minlength=self.num_classes + 1) |
| | for c in self.stuff_classes + self.void_classes: |
| | if y_count[c] > 1: |
| | problematic_clouds.append(i_cloud) |
| | break |
| |
|
| | assert len(problematic_clouds) == 0, \ |
| | f"The following clouds have more than 1 instance of for a stuff " \ |
| | f"or void class:\n{problematic_clouds}" |
| |
|
| | def get_class_weight(self, smooth: str='sqrt') -> torch.Tensor: |
| | """Compute class weights based on the labels distribution in the |
| | dataset. Optionally a 'smooth' function may be passed to |
| | smoothen the weights' statistics. |
| | """ |
| | assert smooth in [None, 'sqrt', 'log'] |
| |
|
| | |
| | |
| | nag = self[0] |
| | low = nag.num_levels - 1 |
| |
|
| | |
| | if nag[low].y is None: |
| | return None |
| | del nag |
| |
|
| | |
| | |
| | counts = torch.zeros(self.num_classes) |
| | for i in range(len(self)): |
| | if self.in_memory: |
| | y = self.in_memory_data[i][low].y |
| | else: |
| | y = NAG.load( |
| | self.processed_paths[i], low=low, keys_low=['y'])[0].y |
| | counts += y.sum(dim=0)[:self.num_classes] |
| |
|
| | |
| | |
| | if smooth == 'sqrt': |
| | counts = counts.sqrt() |
| | if smooth == 'log': |
| | counts = counts.log() |
| |
|
| | weights = 1 / (counts + 1) |
| | weights /= weights.sum() |
| |
|
| | return weights |
| |
|
| | def __len__(self) -> int: |
| | """Number of clouds in the dataset.""" |
| | return len(self.cloud_ids) |
| |
|
| | def __getitem__(self, idx: int) -> Union['NAG', 'Data']: |
| | """Load a preprocessed NAG from disk and apply `self.transform` |
| | if any. Optionally, one may pass a tuple (idx, bool) where the |
| | boolean indicates whether the data should be loaded from disk, if |
| | `self.in_memory=True`. |
| | """ |
| | |
| | from_hdd = False |
| | if isinstance(idx, tuple): |
| | assert len(idx) == 2 and isinstance(idx[1], bool), \ |
| | "Only supports indexing with `int` or `(int, bool)` where the" \ |
| | " boolean indicates whether the data should be loaded from " \ |
| | "disk, when `self.in_memory=True`." |
| | idx, from_hdd = idx |
| |
|
| | |
| | if self.in_memory and not from_hdd: |
| | |
| | |
| | |
| | return self.in_memory_data[idx] |
| |
|
| | |
| | nag = NAG.load( |
| | self.processed_paths[idx], |
| | keys_low=self.point_load_keys, |
| | keys=self.segment_load_keys) |
| |
|
| | |
| | nag = nag if self.transform is None else self.transform(nag) |
| |
|
| | return nag |
| |
|
| | def make_submission( |
| | self, |
| | idx: int, |
| | pred: torch.Tensor, |
| | pos: torch.Tensor, |
| | submission_dir: str = None |
| | ) -> None: |
| | """Implement this if your dataset needs to produce data in a |
| | given format for submission. This is typically needed for |
| | datasets with held-out test sets. |
| | """ |
| | raise NotImplementedError |
| |
|
| | def finalize_submission(self, submission_dir: str) -> None: |
| | """Implement this if your dataset needs to produce data in a |
| | given format for submission. This is typically needed for |
| | datasets with held-out test sets. |
| | """ |
| | raise NotImplementedError |
| |
|
| | def show_examples( |
| | self, |
| | label: int, |
| | radius: float = 4, |
| | max_examples: int = 5, |
| | shuffle: bool = True, |
| | **kwargs |
| | ) -> None: |
| | """Interactive plots of some examples centered on points of the |
| | provided `label`. At most one example per cloud/tile/scene in |
| | the dataset will be shown. |
| | |
| | :param label: int or str |
| | Label of the class of interest, may be provided as an int or |
| | a string corresponding to the class name |
| | :param radius: float |
| | Radius of the spherical sampling to draw around the point of |
| | interest |
| | :param max_examples: int |
| | Maximum number of samples to draw |
| | :param shuffle: bool |
| | If True, the candidate samples will be shuffled every time |
| | :param kwargs: |
| | Kwargs to be passed to the visualization `show()` function |
| | :return: |
| | """ |
| | if isinstance(label, str): |
| | assert label in self.class_names, \ |
| | f"Label must be within {self.class_names}]" |
| | label = self.class_names.index(label) |
| | |
| | assert label >= 0 and label <= self.num_classes, \ |
| | f"Label must be within [0, {self.num_classes + 1}]" |
| |
|
| | |
| | cloud_list = [] |
| | iterator = list(range(len(self))) |
| | if shuffle: |
| | random.shuffle(iterator) |
| | for i_cloud in iterator: |
| | if len(cloud_list) >= max_examples: |
| | break |
| | if (self[i_cloud][1].y.argmax(dim=1) == label).any(): |
| | cloud_list.append(i_cloud) |
| |
|
| | |
| | if len(cloud_list) == 0: |
| | print( |
| | f"Could not find any cloud with points of label={label} in the " |
| | f"dataset.") |
| | return |
| |
|
| | |
| | for i, i_cloud in enumerate(cloud_list): |
| | if i >= max_examples: |
| | break |
| |
|
| | |
| | nag = self[i_cloud] |
| |
|
| | |
| | point_idx = torch.where(nag[0].y.argmax(dim=1) == label)[0].tolist() |
| |
|
| | |
| | |
| | if shuffle: |
| | random.shuffle(point_idx) |
| | i_point = point_idx[0] |
| |
|
| | |
| | center = nag[0].pos[i_point].cpu().tolist() |
| | title = f"Label={label} - Cloud={i_cloud} - Center={center}" |
| | print(f"\n{title}") |
| | show( |
| | nag, |
| | center=center, |
| | radius=radius, |
| | title=title, |
| | class_names=self.class_names, |
| | class_colors=self.class_colors, |
| | stuff_classes=self.stuff_classes, |
| | num_classes=self.num_classes, |
| | **kwargs) |
| |
|