tky823
/

MusicFM

+# MIT License
+#
+# Copyright 2023 ByteDance Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
+# to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+import random
+import torch
+import torchaudio
+from einops import rearrange
+from torch import einsum, nn
+from torch.nn.common_types import _size_2_t
+from transformers import PreTrainedModel
+from .configuration_musicfm import MusicFMConfig
+class MusicFM25Hz(PreTrainedModel):
+    config_class = MusicFMConfig
+    def __init__(self, config: MusicFMConfig) -> None:
+        super().__init__(config)
+        # global variables
+        self.num_codebooks = config.num_codebooks
+        self.codebook_dim = config.codebook_dim
+        self.codebook_size = config.codebook_size
+        self.features = config.features
+        self.hop_length = config.hop_length
+        self.n_mels = config.n_mels
+        self.conv_dim = config.conv_dim
+        self.encoder_dim = config.encoder_dim
+        self.encoder_depth = config.encoder_depth
+        self.mask_hop = config.mask_hop
+        self.mask_prob = config.mask_prob
+        self.is_flash = config.is_flash
+        self.stat = config.stat
+        # feature extractor
+        self.preprocessor_melspec_2048 = MelSTFT(
+            n_fft=2048, hop_length=self.hop_length, is_db=True
+        )
+        # random quantizer
+        seed = 142
+        for feature in self.features:
+            for i in range(self.num_codebooks):
+                setattr(
+                    self,
+                    f"quantizer_{feature}_{i}",
+                    RandomProjectionQuantizer(
+                        self.n_mels * 4,
+                        self.codebook_dim,
+                        self.codebook_size,
+                        seed=seed + i,
+                    ),
+                )
+        # two residual convolution layers + one projection layer
+        self.conv = Conv2dSubsampling(
+            1, self.conv_dim, self.encoder_dim, strides=[2, 2], n_bands=self.n_mels
+        )
+        # Conformer
+        if config.is_flash:
+            from .flash_conformer import (
+                Wav2Vec2ConformerConfig,
+                Wav2Vec2ConformerEncoder,
+            )
+        else:
+            from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
+                Wav2Vec2ConformerConfig,
+                Wav2Vec2ConformerEncoder,
+            )
+        conformer_config = Wav2Vec2ConformerConfig.from_pretrained(
+            "facebook/wav2vec2-conformer-rope-large-960h-ft"
+        )
+        conformer_config.num_hidden_layers = self.encoder_depth
+        conformer_config.hidden_size = self.encoder_dim
+        self.conformer = Wav2Vec2ConformerEncoder(conformer_config)
+        # projection
+        self.linear = nn.Linear(self.encoder_dim, self.codebook_size)
+        # loss function
+        self.loss = nn.CrossEntropyLoss()
+        # cls token (used for sequence classification)
+        random.seed(seed)
+        self.cls_token = nn.Parameter(torch.randn(self.encoder_dim))
+    def masking(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.LongTensor]:
+        """random masking of 400ms with given probability"""
+        mx = x.clone()
+        b, t = mx.shape
+        len_masking_raw = int(24000 * self.mask_hop)
+        len_masking_token = int(24000 / self.hop_length / 2 / 2 * self.mask_hop)
+        # get random mask indices
+        start_indices = torch.rand(b, t // len_masking_raw) < self.mask_prob
+        time_domain_masked_indices = torch.nonzero(
+            start_indices.repeat_interleave(len_masking_raw, dim=1)
+        )
+        token_domain_masked_indices = torch.nonzero(
+            start_indices.repeat_interleave(len_masking_token, dim=1)
+        )
+        # mask with random values
+        masking_noise = (
+            torch.randn(time_domain_masked_indices.shape[0], dtype=x.dtype) * 0.1
+        )  # 0 mean 0.1 std
+        mx[tuple(time_domain_masked_indices.t())] = masking_noise.to(x.device)
+        return mx, token_domain_masked_indices
+    @torch.no_grad()
+    def preprocessing(
+        self, x: torch.Tensor, features: dict[str, torch.Tensor]
+    ) -> dict[str, torch.Tensor]:
+        """extract classic audio features"""
+        # check precision
+        if x.dtype == torch.float16:
+            precision = 16
+        else:
+            precision = 32
+        out = {}
+        for key in features:
+            layer = getattr(self, "preprocessor_%s" % key)
+            out[key] = layer.float()(x.float())[..., :-1]
+            if precision == 16:
+                out[key] = out[key].half()
+        return out
+    def encoder(self, x: torch.Tensor) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+        """2-layer conv + w2v-conformer"""
+        x = self.conv(x)
+        out = self.conformer(x, output_hidden_states=True)
+        hidden_emb = out["hidden_states"]
+        last_emb = out["last_hidden_state"]
+        logits = self.linear(last_emb)
+        logits = {
+            key: logits[:, :, i * self.codebook_size : (i + 1) * self.codebook_size]
+            for i, key in enumerate(self.features)
+        }
+        return logits, hidden_emb
+    @torch.no_grad()
+    def normalize(self, x: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+        """normalize the input audio to have zero mean unit variance"""
+        for key in x.keys():
+            x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key]
+        return x
+    @torch.no_grad()
+    def rearrange(self, x: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+        """rearrange the batch to flatten every 4 steps"""
+        for key in x.keys():
+            if key == "chromagram":
+                x[key] = rearrange(x[key], "b f t -> b t f")
+            else:
+                x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=4)
+        return x
+    @torch.no_grad()
+    def tokenize(self, x: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+        out = {}
+        for key in x.keys():
+            layer = getattr(self, "quantizer_%s" % key)
+            out[key] = layer(x[key])
+        return out
+    def get_targets(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
+        x = self.preprocessing(x, features=self.features)
+        x = self.normalize(x)
+        x = self.rearrange(x)
+        target_tokens = self.tokenize(x)
+        return target_tokens
+    def get_predictions(
+        self, x: torch.Tensor
+    ) -> tuple[dict[str, torch.Tensor], torch.Tensor]:
+        # preprocessing
+        x = self.preprocessing(x, features=["melspec_2048"])
+        x = self.normalize(x)
+        # encoding
+        logits, hidden_emb = self.encoder(x["melspec_2048"])
+        return logits, hidden_emb
+    def get_latent(self, x: torch.Tensor, layer_ix: int = 12) -> torch.Tensor:
+        _, hidden_states = self.get_predictions(x)
+        emb = hidden_states[layer_ix]
+        return emb
+    def get_loss(
+        self,
+        logits: dict[str, torch.Tensor],
+        target_tokens: dict[str, torch.Tensor],
+        masked_indices: torch.LongTensor,
+    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+        losses = {}
+        accuracies = {}
+        for key in logits.keys():
+            masked_logits = logits[key][tuple(masked_indices.t())]
+            masked_tokens = target_tokens[key][tuple(masked_indices.t())]
+            losses[key] = self.loss(masked_logits, masked_tokens)
+            accuracies[key] = (
+                torch.sum(masked_logits.argmax(-1) == masked_tokens)
+                / masked_tokens.numel()
+            )
+        return losses, accuracies
+    def forward(
+        self, x: torch.Tensor
+    ) -> tuple[
+        dict[str, torch.Tensor],
+        torch.Tensor,
+        dict[str, torch.Tensor],
+        dict[str, torch.Tensor],
+    ]:
+        # get target feature tokens
+        target_tokens = self.get_targets(x)
+        # masking
+        x, masked_indices = self.masking(x)
+        # forward
+        logits, hidden_emb = self.get_predictions(x)
+        # get loss
+        losses, accuracies = self.get_loss(logits, target_tokens, masked_indices)
+        return logits, hidden_emb, losses, accuracies
+class MelSTFT(nn.Module):
+    def __init__(
+        self,
+        sample_rate: int = 24000,
+        n_fft: int = 2048,
+        hop_length: int = 240,
+        n_mels: int = 128,
+        is_db: bool = False,
+    ) -> None:
+        super().__init__()
+        # spectrogram
+        self.mel_stft = torchaudio.transforms.MelSpectrogram(
+            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
+        )
+        # amplitude to decibel
+        self.is_db = is_db
+        if is_db:
+            self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        if self.is_db:
+            return self.amplitude_to_db(self.mel_stft(waveform))
+        else:
+            return self.mel_stft(waveform)
+class RandomProjectionQuantizer(nn.Module):
+    """
+    Random projection and codebook lookup module
+    Some code is borrowed from:
+     https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/random_projection_quantizer.py
+    But I did normalization using pre-computed global mean & variance instead of using layer norm.
+    """
+    def __init__(
+        self,
+        input_dim: int,
+        codebook_dim: int,
+        codebook_size: int,
+        seed: int = 142,
+    ) -> None:
+        super().__init__()
+        # random seed
+        torch.manual_seed(seed)
+        # randomly initialized projection
+        random_projection = torch.empty(input_dim, codebook_dim)
+        nn.init.xavier_normal_(random_projection)
+        self.register_buffer("random_projection", random_projection)
+        # randomly initialized codebook
+        codebook = torch.empty(codebook_size, codebook_dim)
+        nn.init.normal_(codebook)
+        self.register_buffer("codebook", codebook)
+    def codebook_lookup(self, x: torch.Tensor) -> torch.Tensor:
+        # reshape
+        b = x.shape[0]
+        x = rearrange(x, "b n e -> (b n) e")
+        # L2 normalization
+        normalized_x = nn.functional.normalize(x, dim=1, p=2)
+        normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
+        # compute distances
+        distances = torch.cdist(normalized_codebook, normalized_x)
+        # get nearest
+        nearest_indices = torch.argmin(distances, dim=0)
+        # reshape
+        xq = rearrange(nearest_indices, "(b n) -> b n", b=b)
+        return xq
+    @torch.no_grad()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # always eval
+        self.eval()
+        # random projection [batch, length, input_dim] -> [batch, length, codebook_dim]
+        x = einsum("b n d, d e -> b n e", x, self.random_projection)
+        # codebook lookup
+        xq = self.codebook_lookup(x)
+        return xq
+class Res2dModule(nn.Module):
+    def __init__(self, idim: int, odim: int, stride: _size_2_t = (2, 2)) -> None:
+        super().__init__()
+        self.conv1 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
+        self.bn1 = nn.BatchNorm2d(odim)
+        self.conv2 = nn.Conv2d(odim, odim, 3, padding=1)
+        self.bn2 = nn.BatchNorm2d(odim)
+        self.relu = nn.ReLU()
+        # residual
+        self.diff = False
+        if (idim != odim) or (stride[0] > 1):
+            self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
+            self.bn3 = nn.BatchNorm2d(odim)
+            self.diff = True
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
+        if self.diff:
+            x = self.bn3(self.conv3(x))
+        out = x + out
+        out = self.relu(out)
+        return out
+class Conv2dSubsampling(nn.Module):
+    """Convolutional 2D subsampling (to 1/4 length).
+    Args:
+        idim (int): Input dimension.
+        hdim (int): Hidden dimension.
+        odim (int): Output dimension.
+        strides (list): Sizes of strides.
+        n_bands (int): Number of frequency bands.
+    """
+    def __init__(
+        self,
+        idim: int,
+        hdim: int,
+        odim: int,
+        strides: list[int] = [2, 2],
+        n_bands: int = 64,
+    ) -> None:
+        """Construct an Conv2dSubsampling object."""
+        super().__init__()
+        self.conv = nn.Sequential(
+            Res2dModule(idim, hdim, (2, strides[0])),
+            Res2dModule(hdim, hdim, (2, strides[1])),
+        )
+        self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Subsample x.
+        Args:
+            x (torch.Tensor): Input tensor (#batch, idim, time).
+        Returns:
+            torch.Tensor: Subsampled tensor (#batch, time', odim),
+                where time' = time // 4.
+        """
+        if x.dim() == 3:
+            x = x.unsqueeze(1)  # (b, c, f, t)
+        x = self.conv(x)
+        x = rearrange(x, "b c f t -> b t (c f)")
+        x = self.linear(x)
+        return x