Spaces:
Runtime error
Runtime error
| import numpy as np | |
| import torch | |
| from diffusers import ConfigMixin, Mel, ModelMixin | |
| from torch import nn | |
| class SeparableConv2d(nn.Module): | |
| def __init__(self, in_channels, out_channels, kernel_size): | |
| super(SeparableConv2d, self).__init__() | |
| self.depthwise = nn.Conv2d( | |
| in_channels, | |
| in_channels, | |
| kernel_size=kernel_size, | |
| groups=in_channels, | |
| bias=False, | |
| padding=1, | |
| ) | |
| self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=True) | |
| def forward(self, x): | |
| out = self.depthwise(x) | |
| out = self.pointwise(out) | |
| return out | |
| class ConvBlock(nn.Module): | |
| def __init__(self, in_channels, out_channels, dropout_rate): | |
| super(ConvBlock, self).__init__() | |
| self.sep_conv = SeparableConv2d(in_channels, out_channels, (3, 3)) | |
| self.leaky_relu = nn.LeakyReLU(0.2) | |
| self.batch_norm = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.01) | |
| self.max_pool = nn.MaxPool2d((2, 2)) | |
| self.dropout = nn.Dropout(dropout_rate) | |
| def forward(self, x): | |
| x = self.sep_conv(x) | |
| x = self.leaky_relu(x) | |
| x = self.batch_norm(x) | |
| x = self.max_pool(x) | |
| x = self.dropout(x) | |
| return x | |
| class DenseBlock(nn.Module): | |
| def __init__(self, in_features, out_features, dropout_rate): | |
| super(DenseBlock, self).__init__() | |
| self.flatten = nn.Flatten() | |
| self.dense = nn.Linear(in_features, out_features) | |
| self.leaky_relu = nn.LeakyReLU(0.2) | |
| self.batch_norm = nn.BatchNorm1d(out_features, eps=0.001, momentum=0.01) | |
| self.dropout = nn.Dropout(dropout_rate) | |
| def forward(self, x): | |
| x = self.flatten(x.permute(0, 2, 3, 1)) | |
| x = self.dense(x) | |
| x = self.leaky_relu(x) | |
| x = self.batch_norm(x) | |
| x = self.dropout(x) | |
| return x | |
| class AudioEncoder(ModelMixin, ConfigMixin): | |
| def __init__(self): | |
| super().__init__() | |
| self.mel = Mel( | |
| x_res=216, | |
| y_res=96, | |
| sample_rate=22050, | |
| n_fft=2048, | |
| hop_length=512, | |
| top_db=80, | |
| ) | |
| self.conv_blocks = nn.ModuleList([ConvBlock(1, 32, 0.2), ConvBlock(32, 64, 0.3), ConvBlock(64, 128, 0.4)]) | |
| self.dense_block = DenseBlock(41472, 1024, 0.5) | |
| self.embedding = nn.Linear(1024, 100) | |
| def forward(self, x): | |
| for conv_block in self.conv_blocks: | |
| x = conv_block(x) | |
| x = self.dense_block(x) | |
| x = self.embedding(x) | |
| return x | |
| def encode(self, audio_files): | |
| self.eval() | |
| y = [] | |
| for audio_file in audio_files: | |
| self.mel.load_audio(audio_file) | |
| x = [ | |
| np.expand_dims( | |
| np.frombuffer(self.mel.audio_slice_to_image(slice).tobytes(), dtype="uint8").reshape( | |
| (self.mel.y_res, self.mel.x_res) | |
| ) | |
| / 255, | |
| axis=0, | |
| ) | |
| for slice in range(self.mel.get_number_of_slices()) | |
| ] | |
| y += [torch.mean(self(torch.Tensor(x)), dim=0)] | |
| return torch.stack(y) | |
| # from diffusers import Mel | |
| # from audiodiffusion.audio_encoder import AudioEncoder | |
| # audio_encoder = AudioEncoder.from_pretrained("teticio/audio-encoder") | |
| # audio_encoder.encode(['/home/teticio/Music/liked/Agua Re - Holy Dance - Large Sound Mix.mp3']) | |