Spaces:
Running on Zero
Running on Zero
| from abc import abstractmethod, ABC | |
| import torch | |
class SchedulerInterface(ABC):
    """
    Base class for diffusion noise schedules.

    Subclasses must implement :meth:`add_noise`. The ``convert_*`` methods
    translate between the x0 / noise / velocity parameterizations of a
    network prediction. They read nothing from ``self`` except
    ``alphas_cumprod`` and do not call any other method.

    Notation used in the docstrings below:
        alpha_t = alphas_cumprod[timestep]
        beta_t  = 1 - alpha_t
        x_t     = sqrt(alpha_t) * x0 + sqrt(beta_t) * noise
    """
    alphas_cumprod: torch.Tensor  # [T], alphas for defining the noise schedule

    @abstractmethod
    def add_noise(
        self, clean_latent: torch.Tensor,
        noise: torch.Tensor, timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Diffusion forward corruption process.
        Input:
            - clean_latent: the clean latent with shape [B, C, H, W]
            - noise: the noise with shape [B, C, H, W]
            - timestep: the timestep with shape [B]
        Output: the corrupted latent with shape [B, C, H, W]
        Raises:
            NotImplementedError: if the base implementation is reached
                (e.g. through ``super().add_noise(...)``).
        """
        # Fail loudly instead of silently handing back ``None`` as a latent.
        raise NotImplementedError(
            "add_noise must be implemented by the concrete scheduler"
        )

    def convert_x0_to_noise(
        self, x0: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's x0 prediction to noise prediction.
        x0: the predicted clean data with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B], used to index alphas_cumprod
        noise = (xt-sqrt(alpha_t)*x0) / sqrt(beta_t) (eq 11 in https://arxiv.org/abs/2311.18828)
        Returns the noise prediction, cast back to the dtype x0 came in with.
        """
        # use higher precision for calculations; everything is gathered on
        # the device x0 lives on
        original_dtype = x0.dtype
        device = x0.device
        x0 = x0.double()
        xt = xt.double().to(device)
        alphas_cumprod = self.alphas_cumprod.double().to(device)

        # one coefficient per sample, broadcast over the [C, H, W] axes
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t

        noise_pred = (xt - alpha_prod_t ** 0.5 * x0) / beta_prod_t ** 0.5
        return noise_pred.to(original_dtype)

    def convert_noise_to_x0(
        self, noise: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's noise prediction to x0 prediction.
        noise: the predicted noise with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B], used to index alphas_cumprod
        x0 = (x_t - sqrt(beta_t) * noise) / sqrt(alpha_t) (eq 11 in https://arxiv.org/abs/2311.18828)
        Returns the x0 prediction, cast back to the dtype noise came in with.
        """
        # use higher precision for calculations; everything is gathered on
        # the device noise lives on
        original_dtype = noise.dtype
        device = noise.device
        noise = noise.double()
        xt = xt.double().to(device)
        alphas_cumprod = self.alphas_cumprod.double().to(device)

        # one coefficient per sample, broadcast over the [C, H, W] axes
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t

        x0_pred = (xt - beta_prod_t ** 0.5 * noise) / alpha_prod_t ** 0.5
        return x0_pred.to(original_dtype)

    def convert_velocity_to_x0(
        self, velocity: torch.Tensor, xt: torch.Tensor,
        timestep: torch.Tensor
    ) -> torch.Tensor:
        """
        Convert the diffusion network's velocity prediction to x0 prediction.
        velocity: the predicted velocity with shape [B, C, H, W]
        xt: the input noisy data with shape [B, C, H, W]
        timestep: the timestep with shape [B], used to index alphas_cumprod
        v = sqrt(alpha_t) * noise - sqrt(beta_t) x0
        noise = (xt-sqrt(alpha_t)*x0) / sqrt(beta_t)
        given v, x_t, we have
        x0 = sqrt(alpha_t) * x_t - sqrt(beta_t) * v
        Derivation: substituting x_t = sqrt(alpha_t) * x0 + sqrt(beta_t) * noise,
        sqrt(alpha_t) * x_t - sqrt(beta_t) * v
            = alpha_t * x0 + beta_t * x0 = x0      (since alpha_t + beta_t = 1)
        see derivations https://chatgpt.com/share/679fb6c8-3a30-8008-9b0e-d1ae892dac56
        Returns the x0 prediction, cast back to the dtype velocity came in with.
        """
        # use higher precision for calculations; everything is gathered on
        # the device velocity lives on
        original_dtype = velocity.dtype
        device = velocity.device
        velocity = velocity.double()
        xt = xt.double().to(device)
        alphas_cumprod = self.alphas_cumprod.double().to(device)

        # one coefficient per sample, broadcast over the [C, H, W] axes
        alpha_prod_t = alphas_cumprod[timestep].reshape(-1, 1, 1, 1)
        beta_prod_t = 1 - alpha_prod_t

        x0_pred = (alpha_prod_t ** 0.5) * xt - (beta_prod_t ** 0.5) * velocity
        return x0_pred.to(original_dtype)