| from abc import ABC, abstractmethod
|
|
|
| import numpy as np
|
| import torch as th
|
| import torch.distributed as dist
|
|
|
|
|
| def create_named_schedule_sampler(name, diffusion, maxt):
|
| """
|
| Create a ScheduleSampler from a library of pre-defined samplers.
|
|
|
| :param name: the name of the sampler.
|
| :param diffusion: the diffusion object to sample for.
|
| """
|
| if name == "uniform":
|
| return UniformSampler(diffusion, maxt)
|
| elif name == "loss-second-moment":
|
| return LossSecondMomentResampler(diffusion)
|
| else:
|
| raise NotImplementedError(f"unknown schedule sampler: {name}")
|
|
|
|
|
| class ScheduleSampler(ABC):
|
| """
|
| A distribution over timesteps in the diffusion process, intended to reduce
|
| variance of the objective.
|
|
|
| By default, samplers perform unbiased importance sampling, in which the
|
| objective's mean is unchanged.
|
| However, subclasses may override sample() to change how the resampled
|
| terms are reweighted, allowing for actual changes in the objective.
|
| """
|
|
|
| @abstractmethod
|
| def weights(self):
|
| """
|
| Get a numpy array of weights, one per diffusion step.
|
|
|
| The weights needn't be normalized, but must be positive.
|
| """
|
|
|
| def sample(self, batch_size, device):
|
| """
|
| Importance-sample timesteps for a batch.
|
|
|
| :param batch_size: the number of timesteps.
|
| :param device: the torch device to save to.
|
| :return: a tuple (timesteps, weights):
|
| - timesteps: a tensor of timestep indices.
|
| - weights: a tensor of weights to scale the resulting losses.
|
| """
|
| w = self.weights()
|
| p = w / np.sum(w)
|
| indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
|
| indices = th.from_numpy(indices_np).long().to(device)
|
| weights_np = 1 / (len(p) * p[indices_np])
|
| weights = th.from_numpy(weights_np).float().to(device)
|
| return indices, weights
|
|
|
|
|
| class UniformSampler(ScheduleSampler):
|
| def __init__(self, diffusion, maxt):
|
| self.diffusion = diffusion
|
| self._weights = np.ones([maxt])
|
|
|
| def weights(self):
|
| return self._weights
|
|
|
|
|
| class LossAwareSampler(ScheduleSampler):
|
| def update_with_local_losses(self, local_ts, local_losses):
|
| """
|
| Update the reweighting using losses from a model.
|
|
|
| Call this method from each rank with a batch of timesteps and the
|
| corresponding losses for each of those timesteps.
|
| This method will perform synchronization to make sure all of the ranks
|
| maintain the exact same reweighting.
|
|
|
| :param local_ts: an integer Tensor of timesteps.
|
| :param local_losses: a 1D Tensor of losses.
|
| """
|
| batch_sizes = [
|
| th.tensor([0], dtype=th.int32, device=local_ts.device)
|
| for _ in range(dist.get_world_size())
|
| ]
|
| dist.all_gather(
|
| batch_sizes,
|
| th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
|
| )
|
|
|
|
|
| batch_sizes = [x.item() for x in batch_sizes]
|
| max_bs = max(batch_sizes)
|
|
|
| timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
|
| loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
|
| dist.all_gather(timestep_batches, local_ts)
|
| dist.all_gather(loss_batches, local_losses)
|
| timesteps = [
|
| x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
|
| ]
|
| losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
|
| self.update_with_all_losses(timesteps, losses)
|
|
|
| @abstractmethod
|
| def update_with_all_losses(self, ts, losses):
|
| """
|
| Update the reweighting using losses from a model.
|
|
|
| Sub-classes should override this method to update the reweighting
|
| using losses from the model.
|
|
|
| This method directly updates the reweighting without synchronizing
|
| between workers. It is called by update_with_local_losses from all
|
| ranks with identical arguments. Thus, it should have deterministic
|
| behavior to maintain state across workers.
|
|
|
| :param ts: a list of int timesteps.
|
| :param losses: a list of float losses, one per timestep.
|
| """
|
|
|
|
|
| class LossSecondMomentResampler(LossAwareSampler):
|
| def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
|
| self.diffusion = diffusion
|
| self.history_per_term = history_per_term
|
| self.uniform_prob = uniform_prob
|
| self._loss_history = np.zeros(
|
| [diffusion.num_timesteps, history_per_term], dtype=np.float64
|
| )
|
| self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)
|
|
|
| def weights(self):
|
| if not self._warmed_up():
|
| return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
|
| weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
|
| weights /= np.sum(weights)
|
| weights *= 1 - self.uniform_prob
|
| weights += self.uniform_prob / len(weights)
|
| return weights
|
|
|
| def update_with_all_losses(self, ts, losses):
|
| for t, loss in zip(ts, losses):
|
| if self._loss_counts[t] == self.history_per_term:
|
|
|
| self._loss_history[t, :-1] = self._loss_history[t, 1:]
|
| self._loss_history[t, -1] = loss
|
| else:
|
| self._loss_history[t, self._loss_counts[t]] = loss
|
| self._loss_counts[t] += 1
|
|
|
| def _warmed_up(self):
|
| return (self._loss_counts == self.history_per_term).all()
|
|
|