Spaces:

scfive
/

samewind

Build error

samewind/mmdet/datasets/samplers/multi_source_sampler.py

scfive

Resolve README.md conflict and continue rebase

e8f2571 10 months ago

8.58 kB

	# Copyright (c) OpenMMLab. All rights reserved.
	import itertools
	from typing import Iterator, List, Optional, Sized, Union

	import numpy as np
	import torch
	from mmengine.dataset import BaseDataset
	from mmengine.dist import get_dist_info, sync_random_seed
	from torch.utils.data import Sampler

	from mmdet.registry import DATA_SAMPLERS


	@DATA_SAMPLERS.register_module()
	class MultiSourceSampler(Sampler):
	    r"""Multi-Source Infinite Sampler.

	    According to the sampling ratio, sample data from different
	    datasets to form batches.

	    Each pass of the endless loop in ``__iter__`` emits exactly
	    ``batch_size`` indices: ``num_per_source[i]`` of them come from the
	    i-th sub-dataset, in source order, and every index is shifted by that
	    sub-dataset's offset inside the concatenated dataset.

	    Args:
	        dataset (Sized): The dataset. It must expose ``cumulative_sizes``
	            and ``datasets`` (i.e. behave like a ``ConcatDataset``).
	        batch_size (int): Size of mini-batch.
	        source_ratio (list[int | float]): The sampling ratio of different
	            source datasets in a mini-batch.
	        shuffle (bool): Whether shuffle the dataset or not. Defaults to True.
	        seed (int, optional): Random seed. If None, the seed returned by
	            ``sync_random_seed()`` is used. Defaults to None.

	    Examples:
	        >>> dataset_type = 'ConcatDataset'
	        >>> sub_dataset_type = 'CocoDataset'
	        >>> data_root = 'data/coco/'
	        >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json'
	        >>> unsup_ann = '../coco_semi_annos/' \
	        >>>             'instances_train2017.1@10-unlabeled.json'
	        >>> dataset = dict(type=dataset_type,
	        >>>     datasets=[
	        >>>         dict(
	        >>>             type=sub_dataset_type,
	        >>>             data_root=data_root,
	        >>>             ann_file=sup_ann,
	        >>>             data_prefix=dict(img='train2017/'),
	        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
	        >>>             pipeline=sup_pipeline),
	        >>>         dict(
	        >>>             type=sub_dataset_type,
	        >>>             data_root=data_root,
	        >>>             ann_file=unsup_ann,
	        >>>             data_prefix=dict(img='train2017/'),
	        >>>             filter_cfg=dict(filter_empty_gt=True, min_size=32),
	        >>>             pipeline=unsup_pipeline),
	        >>>         ])
	        >>>     train_dataloader = dict(
	        >>>         batch_size=5,
	        >>>         num_workers=5,
	        >>>         persistent_workers=True,
	        >>>         sampler=dict(type='MultiSourceSampler',
	        >>>             batch_size=5, source_ratio=[1, 4]),
	        >>>         batch_sampler=None,
	        >>>         dataset=dataset)
	    """

	    def __init__(self,
	                 dataset: Sized,
	                 batch_size: int,
	                 source_ratio: List[Union[int, float]],
	                 shuffle: bool = True,
	                 seed: Optional[int] = None) -> None:

	        assert hasattr(dataset, 'cumulative_sizes'),\
	            f'The dataset must be ConcatDataset, but get {dataset}'
	        assert isinstance(batch_size, int) and batch_size > 0, \
	            'batch_size must be a positive integer value, ' \
	            f'but got batch_size={batch_size}'
	        assert isinstance(source_ratio, list), \
	            f'source_ratio must be a list, but got source_ratio={source_ratio}'
	        assert len(source_ratio) == len(dataset.cumulative_sizes), \
	            'The length of source_ratio must be equal to ' \
	            f'the number of datasets, but got source_ratio={source_ratio}'

	        rank, world_size = get_dist_info()
	        self.rank = rank
	        self.world_size = world_size

	        self.dataset = dataset
	        # A leading 0 makes cumulative_sizes[i] the offset of source i
	        # inside the concatenated dataset.
	        self.cumulative_sizes = [0] + dataset.cumulative_sizes
	        self.batch_size = batch_size
	        self.source_ratio = source_ratio

	        # int() truncates every share; source 0 then absorbs the remainder
	        # so that the per-source counts always add up to batch_size.
	        self.num_per_source = [
	            int(batch_size * sr / sum(source_ratio)) for sr in source_ratio
	        ]
	        self.num_per_source[0] = batch_size - sum(self.num_per_source[1:])

	        assert sum(self.num_per_source) == batch_size, \
	            'The sum of num_per_source must be equal to ' \
	            f'batch_size, but get {self.num_per_source}'

	        self.seed = sync_random_seed() if seed is None else seed
	        self.shuffle = shuffle
	        # One endless, rank-sliced stream of local indices per source.
	        self.source2inds = {
	            source: self._indices_of_rank(len(ds))
	            for source, ds in enumerate(dataset.datasets)
	        }

	    def _infinite_indices(self, sample_size: int) -> Iterator[int]:
	        """Infinitely yield a sequence of indices.

	        Args:
	            sample_size (int): Number of samples to draw indices from.

	        Yields:
	            int: Indices in ``[0, sample_size)``; a fresh permutation per
	            pass when ``self.shuffle`` is set, ascending order otherwise.

	        Raises:
	            ValueError: On first consumption if ``sample_size`` is 0.
	        """
	        if sample_size == 0:
	            # An empty permutation yields nothing, so the loop below would
	            # spin forever without ever producing an index.
	            raise ValueError(
	                'Cannot sample indices from an empty source '
	                f'(sample_size={sample_size})')
	        g = torch.Generator()
	        g.manual_seed(self.seed)
	        while True:
	            if self.shuffle:
	                yield from torch.randperm(sample_size, generator=g).tolist()
	            else:
	                yield from torch.arange(sample_size).tolist()

	    def _indices_of_rank(self, sample_size: int) -> Iterator[int]:
	        """Slice the infinite indices by rank.

	        Only elements ``rank, rank + world_size, ...`` of the endless
	        stream are yielded.
	        """
	        yield from itertools.islice(
	            self._infinite_indices(sample_size), self.rank, None,
	            self.world_size)

	    def __iter__(self) -> Iterator[int]:
	        """Endlessly yield dataset indices, one full batch at a time."""
	        while True:
	            batch_buffer = []
	            for source, num in enumerate(self.num_per_source):
	                offset = self.cumulative_sizes[source]
	                # islice takes exactly ``num`` indices, so a source whose
	                # share was truncated to 0 is skipped instead of being
	                # drained forever.
	                batch_buffer.extend(
	                    idx + offset for idx in itertools.islice(
	                        self.source2inds[source], num))
	            yield from batch_buffer

	    def __len__(self) -> int:
	        """Return the length of the wrapped dataset."""
	        return len(self.dataset)

	    def set_epoch(self, epoch: int) -> None:
	        """Not supported in `epoch-based runner`."""
	        pass


	@DATA_SAMPLERS.register_module()
	class GroupMultiSourceSampler(MultiSourceSampler):
	    r"""Group Multi-Source Infinite Sampler.

	    According to the sampling ratio, sample data from different
	    datasets but the same group to form batches.

	    Samples are split into two groups by image shape: group 0 when
	    ``width < height`` and group 1 otherwise. For every batch one group is
	    drawn with probability proportional to its total size over all
	    sources; the draw uses NumPy's global random state, not ``seed``.

	    Args:
	        dataset (Sized): The dataset.
	        batch_size (int): Size of mini-batch.
	        source_ratio (list[int | float]): The sampling ratio of different
	            source datasets in a mini-batch.
	        shuffle (bool): Whether shuffle the dataset or not. Defaults to True.
	        seed (int, optional): Random seed. If None, set a random seed.
	            Defaults to None.
	    """

	    def __init__(self,
	                 dataset: BaseDataset,
	                 batch_size: int,
	                 source_ratio: List[Union[int, float]],
	                 shuffle: bool = True,
	                 seed: Optional[int] = None) -> None:
	        super().__init__(
	            dataset=dataset,
	            batch_size=batch_size,
	            source_ratio=source_ratio,
	            shuffle=shuffle,
	            seed=seed)

	        self._get_source_group_info()
	        # group_source2inds[group][source] is an endless, rank-sliced stream
	        # of positions into group2inds_per_source[source][group].
	        self.group_source2inds = [{
	            source:
	            self._indices_of_rank(self.group2size_per_source[source][group])
	            for source in range(len(dataset.datasets))
	        } for group in range(len(self.group_ratio))]

	    def _get_source_group_info(self) -> None:
	        """Collect per-source group membership and the group ratio.

	        Sets ``group2size_per_source`` and ``group2inds_per_source`` (both
	        indexed ``[source][group]``), ``group_sizes`` (total samples per
	        group) and ``group_ratio`` (``group_sizes`` normalised to sum 1).
	        """
	        # Size the bookkeeping from the actual number of sub-datasets so
	        # that more than two sources can be handled.
	        num_sources = len(self.dataset.datasets)
	        self.group2size_per_source = [{
	            0: 0,
	            1: 0
	        } for _ in range(num_sources)]
	        self.group2inds_per_source = [{
	            0: [],
	            1: []
	        } for _ in range(num_sources)]
	        for source, dataset in enumerate(self.dataset.datasets):
	            for idx in range(len(dataset)):
	                data_info = dataset.get_data_info(idx)
	                width, height = data_info['width'], data_info['height']
	                group = 0 if width < height else 1
	                self.group2size_per_source[source][group] += 1
	                self.group2inds_per_source[source][group].append(idx)

	        self.group_sizes = np.zeros(2, dtype=np.int64)
	        for group2size in self.group2size_per_source:
	            for group, size in group2size.items():
	                self.group_sizes[group] += size
	        self.group_ratio = self.group_sizes / sum(self.group_sizes)

	    def __iter__(self) -> Iterator[int]:
	        """Endlessly yield dataset indices, one single-group batch at a time."""
	        groups = list(range(len(self.group_ratio)))
	        while True:
	            group = np.random.choice(groups, p=self.group_ratio)
	            batch_buffer = []
	            for source, num in enumerate(self.num_per_source):
	                members = self.group2inds_per_source[source][group]
	                offset = self.cumulative_sizes[source]
	                # The stream yields positions into ``members``; map each one
	                # to its index in the concatenated dataset. islice takes
	                # exactly ``num`` of them, so a source whose share is 0 is
	                # skipped instead of being drained forever.
	                batch_buffer.extend(
	                    members[pos] + offset for pos in itertools.islice(
	                        self.group_source2inds[group][source], num))
	            yield from batch_buffer