# Provenance: uploaded by zhouyik via huggingface_hub ("Upload folder using
# huggingface_hub", commit 032e687, verified). Original upload-page text kept
# here as a comment so the file remains valid Python.
import torch
from torch.utils.data import Sampler, BatchSampler
from typing import Iterator, Optional, Sized
from mmengine.dist import get_dist_info
class MultiDataPseudoSampler(Sampler):
    """Pseudo (non-shuffling) sampler that yields every index of ``dataset``
    in sequential order.

    The original implementation was a broken stub: ``__iter__`` returned
    ``None`` (making the sampler non-iterable) and ``__len__`` read
    ``self.num_samples`` which was never assigned (AttributeError).

    Args:
        dataset: Any sized dataset (``len(dataset)`` must work).
        seed: Accepted for interface compatibility; unused because no
            shuffling is performed.
        round_up: Accepted for interface compatibility; unused because no
            padding/truncation is performed.
    """

    def __init__(self, dataset, seed=None, round_up=True):
        self.dataset = dataset
        # Kept so the constructor signature stays backward-compatible.
        self.seed = seed
        self.round_up = round_up
        # Fixes the AttributeError previously raised by __len__.
        self.num_samples = len(dataset)
        self.epoch = 0

    def __iter__(self) -> Iterator[int]:
        # Deterministic sequential order; a pseudo sampler does not shuffle.
        return iter(range(self.num_samples))

    def __len__(self) -> int:
        return self.num_samples

    def set_epoch(self, epoch: int) -> None:
        # No-op for sampling order (no shuffling), but recorded so the
        # training loop's set_epoch protocol is honored.
        self.epoch = epoch
class MultiDataSameBatchSampler(BatchSampler):
    """Batch sampler over a concatenated multi-dataset that guarantees each
    batch is drawn from exactly one constituent sub-dataset.

    Every sub-dataset's global index range is shuffled independently, chopped
    into full batches, and the resulting batches are shuffled globally, then
    split round-robin across distributed ranks.

    Args:
        sampler: A sampler exposing ``.dataset``; that dataset must provide
            ``cumulative_sizes`` (ConcatDataset-style boundaries).
        batch_size (int): Number of samples per batch.
        drop_last (bool): Kept for the ``BatchSampler`` interface.
            NOTE(review): trailing partial batches are dropped unconditionally
            in ``_shuffle``, so this flag has no effect — confirm intended.
    """

    def __init__(self, sampler, batch_size: int, drop_last: bool = True):
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last
        # Distributed layout: every rank builds the same global batch list and
        # then takes its own slice in __iter__.
        rank, world_size = get_dist_info()
        self.world_size = world_size
        self.rank = rank
        # assumes sampler.dataset is ConcatDataset-like with cumulative_sizes
        # — TODO confirm against the caller.
        self.dataset = sampler.dataset
        total_batches = 0
        # (start, end) are the global index boundaries of each sub-dataset;
        # count only the full batches each one contributes.
        for start, end in zip([0] + self.dataset.cumulative_sizes[:-1], self.dataset.cumulative_sizes):
            total_batches_ = (end - start) // self.batch_size
            total_batches += total_batches_
        # Per-rank sample count, rounded down so all ranks yield the same
        # number of whole batches.
        self.num_samples = total_batches // self.world_size * self.batch_size
        self.total_size = self.num_samples * self.world_size
        self.epoch = 0

    def __iter__(self):
        # All ranks compute the identical shuffled batch list (same seed in
        # _shuffle), then this rank keeps every world_size-th batch starting
        # at its own rank; the slice stop trims batches that don't divide
        # evenly across ranks.
        indices = self._shuffle()
        indices = indices[self.rank:self.total_size // self.batch_size:self.world_size]
        assert len(indices) * self.batch_size == self.num_samples
        return iter(indices)

    def _shuffle(self):
        """Return the epoch's batches as a list of index lists, each batch
        containing indices from a single sub-dataset only."""
        g = torch.Generator()
        # Seeding by epoch (not rank) keeps the order identical on every rank
        # so the per-rank slice in __iter__ partitions without overlap.
        # NOTE(review): base seed 42 is hard-coded; the constructor takes no
        # seed — confirm this is intended.
        g.manual_seed(42 + self.epoch)
        indices = []
        for start, end in zip([0] + self.dataset.cumulative_sizes[:-1], self.dataset.cumulative_sizes):
            # Shuffle within this sub-dataset's global range [start, end).
            indices_ = torch.randperm(end-start, generator=g) + start
            if len(indices_) % self.batch_size:
                # Drop the trailing partial batch (happens regardless of
                # drop_last — see class docstring).
                indices_ = indices_[:-(len(indices_) % self.batch_size)]
            indices_ = indices_.view(-1, self.batch_size)
            # Extends the list with one 1-D tensor (= one batch) per row.
            indices += indices_
        indices = torch.stack(indices)
        # Shuffle batch order globally so sub-datasets interleave across steps.
        indices = indices[torch.randperm(len(indices), generator=g)]
        return indices.tolist()

    def __len__(self) -> int:
        # Number of batches this rank yields per epoch.
        return self.num_samples // self.batch_size

    def set_epoch(self, epoch):
        # Called by the training loop each epoch so _shuffle reseeds and
        # produces a new deterministic order.
        self.epoch = epoch