| """Audio diffusion model classes.""" | |
| import torch | |
| from torch import Tensor, nn | |
| from .utils import groupby | |
| from .sampler import UniformDistribution | |
| class LinearSchedule(nn.Module): | |
| def forward(self, num_steps: int, device) -> Tensor: | |
| sigmas = torch.linspace(1, 0, num_steps + 1)[:-1] | |
| return sigmas | |
| class VSampler(nn.Module): | |
| pass | |
| class Model1d(nn.Module): | |
| def __init__(self, unet_type: str = "base", **kwargs): | |
| super().__init__() | |
| diffusion_kwargs, kwargs = groupby("diffusion_", kwargs) | |
| self.unet = None | |
| self.diffusion = None | |
| def forward(self, x: Tensor, **kwargs) -> Tensor: | |
| return self.diffusion(x, **kwargs) | |
| def sample(self, *args, **kwargs) -> Tensor: | |
| return self.diffusion.sample(*args, **kwargs) | |
| def get_default_model_kwargs(): | |
| return dict( | |
| channels=128, | |
| patch_size=16, | |
| multipliers=[1, 2, 4, 4, 4, 4, 4], | |
| factors=[4, 4, 4, 2, 2, 2], | |
| num_blocks=[2, 2, 2, 2, 2, 2], | |
| attentions=[0, 0, 0, 1, 1, 1, 1], | |
| attention_heads=8, | |
| attention_features=64, | |
| attention_multiplier=2, | |
| attention_use_rel_pos=False, | |
| diffusion_type="v", | |
| diffusion_sigma_distribution=UniformDistribution(), | |
| ) | |
| def get_default_sampling_kwargs(): | |
| return dict(sigma_schedule=LinearSchedule(), sampler=VSampler(), clamp=True) | |
| class AudioDiffusionConditional(Model1d): | |
| def __init__(self, embedding_features: int, embedding_max_length: int, embedding_mask_proba: float = 0.1, **kwargs): | |
| self.embedding_mask_proba = embedding_mask_proba | |
| default_kwargs = dict( | |
| **get_default_model_kwargs(), | |
| unet_type="cfg", | |
| context_embedding_features=embedding_features, | |
| context_embedding_max_length=embedding_max_length, | |
| ) | |
| super().__init__(**{**default_kwargs, **kwargs}) | |
| def forward(self, *args, **kwargs): | |
| default_kwargs = dict(embedding_mask_proba=self.embedding_mask_proba) | |
| return super().forward(*args, **{**default_kwargs, **kwargs}) | |
| def sample(self, *args, **kwargs): | |
| default_kwargs = dict(**get_default_sampling_kwargs(), embedding_scale=5.0) | |
| return super().sample(*args, **{**default_kwargs, **kwargs}) | |