viewfinder-annn committed
Commit 85651ad · verified · 1 Parent(s): 50ec95a

Upload inference related files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ example/gradio/example1.mp3 filter=lfs diff=lfs merge=lfs -text
+ example/gradio/example2.wav filter=lfs diff=lfs merge=lfs -text
+ example/gradio/example3.wav filter=lfs diff=lfs merge=lfs -text
anyaccomp/fmt_model.py ADDED
@@ -0,0 +1,367 @@
+ import torch
+ import numpy as np
+ import torch.nn as nn
+ import math
+ from einops import rearrange
+ from anyaccomp.llama_nar import DiffLlamaConcat
+ import torch.nn.functional as F
+ from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+ from typing import List, Optional, Tuple, Union
+ from transformers.models.llama.modeling_llama import BaseModelOutputWithPast
+
+
+ class FlowMatchingTransformerConcat(nn.Module):
+     def __init__(
+         self,
+         vocab_size=1024,
+         mel_dim=100,
+         hidden_size=1024,
+         num_layers=12,
+         num_heads=16,
+         cfg_scale=0.2,
+         use_cond_code=False,
+         cond_codebook_size=1024,
+         cond_dim=1024,
+         cond_scale_factor=1,
+         sigma=1e-5,
+         time_scheduler="linear",
+         cfg=None,
+     ):
+         super().__init__()
+         self.cfg = cfg
+
+         mel_dim = (
+             cfg.mel_dim if cfg is not None and hasattr(cfg, "mel_dim") else mel_dim
+         )
+         hidden_size = (
+             cfg.hidden_size
+             if cfg is not None and hasattr(cfg, "hidden_size")
+             else hidden_size
+         )
+         num_layers = (
+             cfg.num_layers
+             if cfg is not None and hasattr(cfg, "num_layers")
+             else num_layers
+         )
+         num_heads = (
+             cfg.num_heads
+             if cfg is not None and hasattr(cfg, "num_heads")
+             else num_heads
+         )
+         cfg_scale = (
+             cfg.cfg_scale
+             if cfg is not None and hasattr(cfg, "cfg_scale")
+             else cfg_scale
+         )
+         use_cond_code = (
+             cfg.use_cond_code
+             if cfg is not None and hasattr(cfg, "use_cond_code")
+             else use_cond_code
+         )
+         cond_codebook_size = (
+             cfg.cond_codebook_size
+             if cfg is not None and hasattr(cfg, "cond_codebook_size")
+             else cond_codebook_size
+         )
+         cond_dim = (
+             cfg.cond_dim if cfg is not None and hasattr(cfg, "cond_dim") else cond_dim
+         )
+         time_scheduler = (
+             cfg.time_scheduler
+             if cfg is not None and hasattr(cfg, "time_scheduler")
+             else time_scheduler
+         )
+         sigma = cfg.sigma if cfg is not None and hasattr(cfg, "sigma") else sigma
+         cond_scale_factor = (
+             cfg.cond_scale_factor
+             if cfg is not None and hasattr(cfg, "cond_scale_factor")
+             else cond_scale_factor
+         )
+
+         self.mel_dim = mel_dim
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.cfg_scale = cfg_scale
+         self.use_cond_code = use_cond_code
+         self.cond_codebook_size = cond_codebook_size
+         self.cond_dim = cond_dim
+         self.time_scheduler = time_scheduler
+         self.sigma = sigma
+         self.cond_scale_factor = cond_scale_factor
+
+         self.vocab_size = (
+             cfg.vocab_size
+             if cfg is not None and hasattr(cfg, "vocab_size")
+             else vocab_size
+         )
+         self.vocal_mel_proj = (
+             nn.Linear(self.cfg.cond_code_dim, self.hidden_size)
+             if not self.use_cond_code
+             else nn.Sequential(
+                 nn.Embedding(
+                     self.vocab_size, self.mel_dim
+                 ),  # [batch] -> [batch, mel_dim]
+                 nn.Linear(
+                     self.mel_dim, self.hidden_size
+                 ),  # [batch, mel_dim] -> [batch, hidden_size]
+             )
+         )
+
+         self.diff_estimator = DiffLlamaConcat(
+             mel_dim=self.mel_dim,
+             hidden_size=self.hidden_size,
+             num_heads=self.num_heads,
+             num_layers=self.num_layers,
+             flash_attention=hasattr(cfg, "flash_attention") and cfg.flash_attention,
+         )
+
+         if hasattr(cfg, "repa_loss") and cfg.repa_loss.enable:
+             repa_dim = (
+                 cfg.repa_loss.repa_dim
+                 if hasattr(cfg.repa_loss, "repa_dim")
+                 else self.hidden_size
+             )
+             self.repa_proj = nn.Sequential(
+                 nn.Linear(self.hidden_size, self.hidden_size),
+                 nn.SiLU(),
+                 nn.Linear(self.hidden_size, self.hidden_size),
+                 nn.SiLU(),
+                 nn.Linear(self.hidden_size, repa_dim),
+             )
+
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         def _reset_parameters(m):
+             if isinstance(m, nn.MultiheadAttention):
+                 if m._qkv_same_embed_dim:
+                     nn.init.normal_(m.in_proj_weight, std=0.02)
+                 else:
+                     nn.init.normal_(m.q_proj_weight, std=0.02)
+                     nn.init.normal_(m.k_proj_weight, std=0.02)
+                     nn.init.normal_(m.v_proj_weight, std=0.02)
+
+                 if m.in_proj_bias is not None:
+                     nn.init.constant_(m.in_proj_bias, 0.0)
+                     nn.init.constant_(m.out_proj.bias, 0.0)
+                 if m.bias_k is not None:
+                     nn.init.xavier_normal_(m.bias_k)
+                 if m.bias_v is not None:
+                     nn.init.xavier_normal_(m.bias_v)
+
+             elif (
+                 isinstance(m, nn.Conv1d)
+                 or isinstance(m, nn.ConvTranspose1d)
+                 or isinstance(m, nn.Conv2d)
+                 or isinstance(m, nn.ConvTranspose2d)
+             ):
+                 m.weight.data.normal_(0.0, 0.02)
+
+             elif isinstance(m, nn.Linear):
+                 m.weight.data.normal_(mean=0.0, std=0.02)
+                 if m.bias is not None:
+                     m.bias.data.zero_()
+
+             elif isinstance(m, nn.Embedding):
+                 m.weight.data.normal_(mean=0.0, std=0.02)
+                 if m.padding_idx is not None:
+                     m.weight.data[m.padding_idx].zero_()
+
+         self.apply(_reset_parameters)
+
+     @torch.no_grad()
+     def forward_diffusion(self, x, t):
+         """
+         x: (B, T, mel_dim)
+         t: (B,)
+         """
+
+         new_t = t
+         t = t.unsqueeze(-1).unsqueeze(-1)
+         z = torch.randn(
+             x.shape, dtype=x.dtype, device=x.device, requires_grad=False
+         )  # (B, T, mel_dim)
+
+         cfg_scale = self.cfg_scale
+
+         # get prompt len
+         if torch.rand(1) > 0.7:
+             prompt_len = torch.randint(
+                 min(x.shape[1] // 4, 5), int(x.shape[1] * 0.4), (x.shape[0],)
+             ).to(
+                 x.device
+             )  # (B,)
+         else:
+             prompt_len = torch.zeros(x.shape[0]).to(x.device)
+
+         split_ratio = torch.rand(prompt_len.shape, device=prompt_len.device)  # (B,)
+
+         left_len = (split_ratio * (prompt_len + 1).float()).long()  # (B,)
+         right_len = prompt_len - left_len  # (B,)
+
+         T = x.shape[1]
+         is_prompt = torch.zeros_like(x[:, :, 0])  # (B, T)
+         col_indices = torch.arange(T, device=x.device).repeat(x.shape[0], 1)  # (B, T)
+         left_mask = col_indices < left_len.unsqueeze(1)
+         right_mask = col_indices >= (T - right_len.unsqueeze(1))
+         is_prompt[left_mask | right_mask] = 1
+
+         mask = torch.ones_like(x[:, :, 0])  # mask if 1, not mask if 0
+         mask[is_prompt.bool()] = 0
+         mask = mask[:, :, None]
+
+         # flow matching: xt = (1 - (1 - sigma) * t) * x0 + t * x; where x0 ~ N(0, 1), x is a sample
+         # flow gt: x - (1 - sigma) * x0 = x - (1 - sigma) * noise
+         xt = ((1 - (1 - self.sigma) * t) * z + t * x) * mask + x * (1 - mask)
+
+         return xt, z, new_t, prompt_len, mask
+
+     def loss_t(
+         self,
+         x,
+         x_mask,
+         t,
+         lyric=None,
+         output_hidden_states=False,
+     ):
+         xt, z, new_t, prompt_len, mask = self.forward_diffusion(x, t)
+
+         noise = z
+
+         prompt_len = prompt_len.float()
+
+         # drop condition using cfg_scale
+         if lyric is not None:
+             cfg_mask = torch.where(
+                 torch.rand_like(prompt_len) > self.cfg_scale,
+                 torch.ones_like(prompt_len),  # keep cond
+                 torch.zeros_like(prompt_len),  # drop cond
+             ).to(lyric.device)
+
+             cond_mask = cfg_mask[:, None, None]  # [b, 1, 1]
+
+             lyric = lyric * cond_mask
+
+         final_mask = mask * x_mask[..., None]  # (B, T, 1)
+
+         output = self.diff_estimator(
+             xt, new_t, x_mask, lyric, output_hidden_states=output_hidden_states
+         )
+         if output_hidden_states:
+             return_list = [noise, x, output["hidden_states"], final_mask, prompt_len]
+             return_list.append(output["all_hidden_states"])
+         else:
+             return_list = [noise, x, output, final_mask, prompt_len]
+
+         return return_list
+
+     def compute_loss(self, x, x_mask, lyric=None, output_hidden_states=False):
+         # x0: (B, T, num_quantizer)
+         # x_mask: (B, T) mask is 0 for padding
+         t = torch.rand(x.shape[0], device=x.device, requires_grad=False)
+         t = torch.clamp(t, 1e-5, 1.0)
+         # from CosyVoice: since generation is harder at the beginning than later on, we use a cosine scheduler for the timestep t
+         if self.time_scheduler == "cos":
+             t = 1 - torch.cos(t * math.pi * 0.5)
+         else:
+             pass
+         return self.loss_t(
+             x, x_mask, t, lyric, output_hidden_states=output_hidden_states
+         )
+
+     def forward(self, x, x_mask, vocal_mel, output_hidden_states=False):
+         cond = self.vocal_mel_proj(vocal_mel)
+         return self.compute_loss(x, x_mask, cond, output_hidden_states)
+
+     @torch.no_grad()
+     def reverse_diffusion(
+         self,
+         vocal_mel=None,
+         prompt=None,
+         right_prompt=None,
+         x_mask=None,
+         prompt_mask=None,
+         right_prompt_mask=None,
+         target_len=None,
+         n_timesteps=10,
+         cfg=1.0,
+         rescale_cfg=0.75,
+     ):
+         h = 1.0 / n_timesteps
+         prompt_len = prompt.shape[1] if prompt is not None else 0
+         right_prompt_len = right_prompt.shape[1] if right_prompt is not None else 0
+         # print(prompt_len, right_prompt_len)
+         if vocal_mel is not None:
+             target_len = vocal_mel.shape[1]
+         elif target_len is None:
+             target_len = 1000  # hardcode 50Hz 20s
+         else:
+             raise ValueError
+         full_len = target_len
+         target_len = target_len - prompt_len - right_prompt_len
+
+         cond = self.vocal_mel_proj(vocal_mel)
+
+         if x_mask is None:
+             x_mask = torch.ones(cond.shape[0], target_len).to(cond.device)
+         if prompt_mask is None and prompt is not None:
+             prompt_mask = torch.ones(cond.shape[0], prompt_len).to(cond.device)
+         if right_prompt_mask is None and right_prompt is not None:
+             right_prompt_mask = torch.ones(cond.shape[0], right_prompt_len).to(
+                 cond.device
+             )
+
+         if prompt is not None and right_prompt is not None:
+             xt_mask = torch.cat([prompt_mask, x_mask, right_prompt_mask], dim=1)
+         elif prompt is not None and right_prompt is None:
+             xt_mask = torch.cat([prompt_mask, x_mask], dim=1)
+         elif prompt is None and right_prompt is not None:
+             xt_mask = torch.cat([x_mask, right_prompt_mask], dim=1)
+         else:
+             xt_mask = x_mask
+
+         z = torch.randn(
+             (cond.shape[0], target_len, self.mel_dim),
+             dtype=cond.dtype,
+             device=cond.device,
+             requires_grad=False,
+         )
+         xt = z
+         # t from 0 to 1: x0 = z ~ N(0, 1)
+         for i in range(n_timesteps):
+             if prompt is not None and right_prompt is not None:
+                 xt_input = torch.cat([prompt, xt, right_prompt], dim=1)
+             elif prompt is not None and right_prompt is None:
+                 xt_input = torch.cat([prompt, xt], dim=1)
+             elif prompt is None and right_prompt is not None:
+                 xt_input = torch.cat([xt, right_prompt], dim=1)
+             else:
+                 xt_input = xt
+             t = (0 + (i + 0.5) * h) * torch.ones(
+                 z.shape[0], dtype=z.dtype, device=z.device
+             )
+             flow_pred = self.diff_estimator(xt_input, t, xt_mask, cond)
+             flow_pred = flow_pred[:, prompt_len : prompt_len + target_len, :]
+             # cfg
+
+             if cfg > 0:
+                 uncond_flow_pred = self.diff_estimator(
+                     xt_input, t, xt_mask, torch.zeros_like(cond)
+                 )
+                 uncond_flow_pred = uncond_flow_pred[
+                     :, prompt_len : prompt_len + target_len, :
+                 ]
+                 pos_flow_pred_std = flow_pred.std()
+                 flow_pred_cfg = flow_pred + cfg * (flow_pred - uncond_flow_pred)
+                 rescale_flow_pred = (
+                     flow_pred_cfg * pos_flow_pred_std / flow_pred_cfg.std()
+                 )
+                 flow_pred = (
+                     rescale_cfg * rescale_flow_pred + (1 - rescale_cfg) * flow_pred_cfg
+                 )
+
+             dxt = flow_pred * h
+             xt = xt + dxt
+
+         return xt
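
For orientation, a minimal sketch of driving the sampler above; it is illustrative only (untrained weights and random conditioning codes instead of real CocoStyle tokens), with sizes borrowed from config/flow_matching.json and the 50 Hz frame rate noted in the code:

import torch
from anyaccomp.fmt_model import FlowMatchingTransformerConcat

# Hypothetical smoke test, not part of the commit.
model = FlowMatchingTransformerConcat(use_cond_code=True, vocab_size=512, mel_dim=128)
model.eval()

vocal_codes = torch.randint(0, 512, (1, 500))  # (B, T): ~10 s at 50 Hz
mel = model.reverse_diffusion(vocal_mel=vocal_codes, n_timesteps=32, cfg=2.0)
print(mel.shape)  # torch.Size([1, 500, 128]): one mel frame per conditioning frame
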
anyaccomp/inference_utils.py ADDED
@@ -0,0 +1,124 @@
+ import math
+ import json
+ import librosa
+ import torch
+ import torchaudio
+ import accelerate
+ import safetensors
+ import numpy as np
+ import os
+ import yaml
+
+ import torchvision
+ from librosa.feature import chroma_stft
+
+ import random
+
+ import sys
+
+
+ from anyaccomp.fmt_model import FlowMatchingTransformerConcat
+ from models.codec.amphion_codec.vocos import Vocos
+ from models.codec.melvqgan.melspec import MelSpectrogram
+ from models.codec.coco.rep_coco_model import CocoContentStyle, CocoContent, CocoStyle
+
+ from tqdm import tqdm
+
+ from utils.util import load_config
+
+ import io
+
+ from transformers import T5Tokenizer, T5EncoderModel
+
+ import warnings
+
+
+ class Sing2SongInferencePipeline:
+     def __init__(
+         self,
+         checkpoint_path,
+         cfg_path,
+         vocoder_checkpoint_path,
+         vocoder_cfg_path,
+         device="cuda",
+     ):
+         self.cfg = load_config(cfg_path)
+         self.device = device
+
+         self.checkpoint_path = checkpoint_path
+         self._load_model(checkpoint_path)
+
+         self._build_input_model()
+         self.vocoder_checkpoint_path = vocoder_checkpoint_path
+         self.vocoder_cfg = load_config(vocoder_cfg_path)
+         self._build_output_model()
+         print("Output model built")
+
+     def _load_model(self, checkpoint_path):
+         self.model = FlowMatchingTransformerConcat(
+             cfg=self.cfg.model.flow_matching_transformer
+         )
+
+         accelerate.load_checkpoint_and_dispatch(self.model, checkpoint_path)
+         self.model.eval().to(self.device)
+         print(
+             f"model Params: {round(sum(p.numel() for p in self.model.parameters() if p.requires_grad)/1e6, 2)}M"
+         )
+         print(f"Loaded model from {checkpoint_path}")
+
+     def _build_input_model(self):
+         self.coco_model = CocoStyle(
+             cfg=self.cfg.model.coco, construct_only_for_quantizer=True
+         )
+         self.coco_model.eval()
+         self.coco_model.to(self.device)
+         accelerate.load_checkpoint_and_dispatch(
+             self.coco_model, self.cfg.model.coco.pretrained_path
+         )
+
+     def _build_output_model(self):
+         # print(vocoder_checkpoint_path)
+         self.vocoder = Vocos(cfg=self.vocoder_cfg.model.vocos)
+         accelerate.load_checkpoint_and_dispatch(
+             self.vocoder, self.vocoder_checkpoint_path
+         )
+         self.vocoder = self.vocoder.eval().to(self.device)
+
+     @torch.no_grad()
+     @torch.cuda.amp.autocast(dtype=torch.bfloat16)
+     def _extract_coco_codec(self, speech):
+         """
+         Args:
+             speech: [B, T]
+         Returns:
+             codecs: [B, T]. Note that codecs might not be at 50Hz!
+         """
+         target_chroma_dim = self.cfg.model.coco.chromagram_dim
+
+         speech = speech.cpu().numpy().squeeze()
+
+         chromagram = chroma_stft(
+             y=speech,
+             sr=self.cfg.preprocess.chromagram.sample_rate,
+             n_fft=self.cfg.preprocess.chromagram.n_fft,
+             hop_length=self.cfg.preprocess.chromagram.hop_size,
+             win_length=self.cfg.preprocess.chromagram.win_size,
+             n_chroma=target_chroma_dim,
+         ).T  # [D, T] -> [T, D]
+         chromagram_feats = torch.tensor(chromagram).unsqueeze(0).to(self.device)
+         codecs, _ = self.coco_model.quantize(chromagram_feats)
+         return codecs
+
+     @torch.no_grad()
+     def encode_vocal(self, speech):  # (B, T)
+         speech = speech.to(self.device)
+         codecs = self._extract_coco_codec(speech)
+         return codecs
+
+     @torch.no_grad()
+     def _generate_audio(self, mel):
+         synthesized_audio = (self.vocoder(mel.transpose(1, 2)).detach().cpu())[0]
+
+         return synthesized_audio
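
A hedged sketch of wiring the pipeline together; the checkpoint paths below are placeholders (only ./pretrained/vq appears in the shipped config), while the two config files and the example audio are the ones added in this commit:

import librosa
import torch
from anyaccomp.inference_utils import Sing2SongInferencePipeline

# Paths are assumptions for illustration, not part of the commit.
pipeline = Sing2SongInferencePipeline(
    checkpoint_path="./pretrained/fmt/model.safetensors",
    cfg_path="./config/flow_matching.json",
    vocoder_checkpoint_path="./pretrained/vocoder/model.safetensors",
    vocoder_cfg_path="./config/vocoder.json",
)

wav, _ = librosa.load("example/gradio/example2.wav", sr=24000)
codes = pipeline.encode_vocal(torch.tensor(wav).unsqueeze(0))  # (1, T') style codes
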
anyaccomp/llama_nar.py ADDED
@@ -0,0 +1,667 @@
+ from transformers import LlamaConfig, LlamaForCausalLM, LlamaModel
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ import os
+ import torch.nn as nn
+ from typing import List, Optional, Tuple, Union
+ import math
+
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+ from transformers.models.llama.modeling_llama import BaseModelOutputWithPast
+ from transformers.models.llama.modeling_llama import (
+     LlamaAttention,
+     apply_rotary_pos_emb,
+     Cache,
+     repeat_kv,
+ )
+
+
+ class SinusoidalPosEmb(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+
+     def forward(self, x):
+         device = x.device
+         half_dim = self.dim // 2
+         emb = math.log(10000) / (half_dim - 1)
+         emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+         emb = x[:, None] * emb[None, :] * 1.0
+         emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+         return emb
+
+
+ class LlamaAdaptiveRMSNorm(nn.Module):
+     def __init__(self, hidden_size=1024, eps=1e-6, dim_cond=1024):
+         super().__init__()
+         self.to_weight = nn.Linear(dim_cond, hidden_size)
+         nn.init.zeros_(self.to_weight.weight)
+         nn.init.ones_(self.to_weight.bias)
+         self.variance_epsilon = eps
+         self._is_hf_initialized = True  # disable automatic init
+
+     def forward(self, hidden_states, cond_embedding):
+         input_dtype = hidden_states.dtype
+         variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+         hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+         weight = self.to_weight(cond_embedding)
+         if len(weight.shape) == 2:
+             weight = weight.unsqueeze(1)
+
+         return (weight * hidden_states).to(input_dtype)
+
+
+ class LlamaNARDecoderLayer(LlamaDecoderLayer):
+     def __init__(self, config: LlamaConfig, layer_idx: int):
+         """Override to use adaptive layer norm"""
+         super().__init__(config, layer_idx)  # init attention, mlp, etc.
+         # self.self_attn = LlamaXformersAttention(config=config, layer_idx=layer_idx)
+
+         self.self_attn.is_causal = False  # for flash attn
+
+         self.input_layernorm = LlamaAdaptiveRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
+         )
+         self.post_attention_layernorm = LlamaAdaptiveRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps, dim_cond=config.hidden_size
+         )
+
+     # add `cond` in forward function
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         cond_embedding: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: Optional[bool] = False,
+         use_cache: Optional[bool] = False,
+     ) -> Tuple[
+         torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
+     ]:
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                 `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             use_cache (`bool`, *optional*):
+                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                 (see `past_key_values`).
+             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+         """
+
+         residual = hidden_states
+
+         hidden_states = self.input_layernorm(
+             hidden_states, cond_embedding=cond_embedding
+         )
+
+         # Self Attention
+         hidden_states, self_attn_weights, present_key_value = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_value=past_key_value,
+             output_attentions=output_attentions,
+             use_cache=use_cache,
+         )
+         hidden_states = residual + hidden_states
+
+         # Fully Connected
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(
+             hidden_states, cond_embedding=cond_embedding
+         )
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (self_attn_weights,)
+
+         if use_cache:
+             outputs += (present_key_value,)
+
+         return outputs
+
+
+ class DiffLlamaConcat(LlamaModel):
+     def __init__(
+         self,
+         mel_dim=100,
+         hidden_size=1024,
+         num_heads=16,
+         num_layers=16,
+         dropout=0.1,
+         ffn_dropout=0.1,
+         attention_dropout=0.0,
+         config=LlamaConfig(0, 256, 1024, 1, 1),
+         flash_attention=False,
+     ):
+         super().__init__(config)
+
+         self.flash_attention = flash_attention
+         self.layers = nn.ModuleList(
+             [
+                 LlamaNARDecoderLayer(
+                     LlamaConfig(
+                         hidden_size=hidden_size,
+                         num_attention_heads=num_heads,
+                         max_position_embeddings=4096,
+                         intermediate_size=hidden_size * 4,
+                         attn_implementation=(
+                             "flash_attention_2" if self.flash_attention else "eager"
+                         ),
+                     ),
+                     layer_idx=i,
+                 )
+                 for i in range(num_layers)
+             ]
+         )
+
+         self.norm = LlamaAdaptiveRMSNorm(hidden_size, dim_cond=hidden_size)
+
+         self.diff_step_embedding = SinusoidalPosEmb(hidden_size)
+         self.diff_step_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         self.cond_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         self.mel_mlp = nn.Sequential(
+             nn.Linear(mel_dim, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         self.mel_out_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, mel_dim),
+         )
+
+         for layer in self.layers:
+             layer.input_layernorm = LlamaAdaptiveRMSNorm(
+                 hidden_size, dim_cond=hidden_size
+             )
+             layer.post_attention_layernorm = LlamaAdaptiveRMSNorm(
+                 hidden_size, dim_cond=hidden_size
+             )
+
+         self.embed_tokens = None
+
+         self.post_init()
+
+         # self.reset_parameters()
+
+     def _prepare_decoder_attention_mask(
+         self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+     ):
+         # create noncausal mask
+         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+         combined_attention_mask = None
+
+         def _expand_mask(
+             mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
+         ):
+             """
+             Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+             """
+             bsz, src_len = mask.size()
+             tgt_len = tgt_len if tgt_len is not None else src_len
+
+             expanded_mask = (
+                 mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+             )
+
+             inverted_mask = 1.0 - expanded_mask
+
+             return inverted_mask.masked_fill(
+                 inverted_mask.to(torch.bool), torch.finfo(dtype).min
+             )
+
+         if attention_mask is not None:
+             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+             expanded_attn_mask = _expand_mask(
+                 attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+             ).to(inputs_embeds.device)
+             combined_attention_mask = (
+                 expanded_attn_mask
+                 if combined_attention_mask is None
+                 else expanded_attn_mask + combined_attention_mask
+             )
+
+         return combined_attention_mask
+
+     def forward(
+         self,
+         x,
+         diffusion_step,
+         x_mask,
+         cond,
+         input_ids: torch.LongTensor = None,  # [num_quant, B, T]
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+
+         # retrieve some shape info
+         batch_size, seq_length, _ = x.shape
+
+         # condition mlp
+         cond_embedding = self.cond_mlp(cond)  # (B, T, C)
+
+         # condition mel
+         x = self.mel_mlp(x)
+
+         # diffusion step embedding
+         diffusion_step = self.diff_step_embedding(diffusion_step).to(x.device)
+         diffusion_step = self.diff_step_mlp(diffusion_step)  # (B, C)
+         x = x + cond_embedding
+
+         inputs_embeds = x
+         # if self.flash_attention:
+         #     attention_mask = None
+         # else:
+         attention_mask = x_mask
+
+         # assert x_mask.shape == batch_size, seq_length
+
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         seq_length_with_past = seq_length
+         past_key_values_length = 0
+
+         if past_key_values is not None:
+             past_key_values_length = past_key_values[0][0].shape[2]
+             seq_length_with_past = seq_length_with_past + past_key_values_length
+
+         if position_ids is None:
+             device = input_ids.device if input_ids is not None else inputs_embeds.device
+             position_ids = torch.arange(
+                 past_key_values_length,
+                 seq_length + past_key_values_length,
+                 dtype=torch.long,
+                 device=device,
+             )
+             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+         else:
+             position_ids = position_ids.view(-1, seq_length).long()
+
+         if not self.flash_attention:
+             # embed positions
+             if attention_mask is None:
+                 attention_mask = torch.ones(
+                     (batch_size, seq_length_with_past),
+                     dtype=torch.bool,
+                     device=inputs_embeds.device,
+                 )
+             attention_mask = self._prepare_decoder_attention_mask(
+                 attention_mask,
+                 (batch_size, seq_length),
+                 inputs_embeds,
+                 past_key_values_length,
+             )
+
+         hidden_states = inputs_embeds
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 use_cache = False
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = () if use_cache else None
+
+         for idx, decoder_layer in enumerate(self.layers):
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             past_key_value = (
+                 past_key_values[idx] if past_key_values is not None else None
+             )
+
+             if self.gradient_checkpointing and self.training:
+                 raise NotImplementedError
+
+                 def create_custom_forward(module):
+                     def custom_forward(*inputs):
+                         # None for past_key_value
+                         return module(*inputs, output_attentions, None)
+
+                     return custom_forward
+
+                 layer_outputs = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(decoder_layer),
+                     hidden_states,
+                     attention_mask,
+                     position_ids,
+                     None,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     # attention_mask=attention_mask if not self.flash_attention else None,
+                     attention_mask=attention_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_value,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                     cond_embedding=diffusion_step,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states, cond_embedding=diffusion_step)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+
+         hidden_states = self.mel_out_mlp(hidden_states)
+
+         if not output_hidden_states:
+             return hidden_states
+         else:
+             return {
+                 "hidden_states": hidden_states,
+                 "all_hidden_states": all_hidden_states,
+             }
+
+
+ class DiffLlama(LlamaModel):
+     def __init__(
+         self,
+         mel_dim=100,
+         hidden_size=1024,
+         num_heads=16,
+         num_layers=16,
+         dropout=0.1,
+         ffn_dropout=0.1,
+         attention_dropout=0.0,
+         config=LlamaConfig(0, 256, 1024, 1, 1),
+         flash_attention=False,
+     ):
+         super().__init__(config)
+
+         self.flash_attention = flash_attention
+         self.layers = nn.ModuleList(
+             [
+                 LlamaNARDecoderLayer(
+                     LlamaConfig(
+                         hidden_size=hidden_size,
+                         num_attention_heads=num_heads,
+                         max_position_embeddings=4096,
+                         intermediate_size=hidden_size * 4,
+                         attn_implementation=(
+                             "flash_attention_2" if self.flash_attention else "eager"
+                         ),
+                         is_causal=False,
+                     ),
+                     layer_idx=i,
+                 )
+                 for i in range(num_layers)
+             ]
+         )
+
+         self.norm = LlamaAdaptiveRMSNorm(hidden_size, dim_cond=hidden_size)
+
+         self.diff_step_embedding = SinusoidalPosEmb(hidden_size)
+         self.diff_step_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         # forward() below projects the conditioning sequence with this MLP
+         self.cond_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         self.mel_mlp = nn.Sequential(
+             nn.Linear(mel_dim, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, hidden_size),
+         )
+
+         self.mel_out_mlp = nn.Sequential(
+             nn.Linear(hidden_size, hidden_size * 4),
+             nn.SiLU(),
+             nn.Linear(hidden_size * 4, mel_dim),
+         )
+
+         for layer in self.layers:
+             layer.input_layernorm = LlamaAdaptiveRMSNorm(
+                 hidden_size, dim_cond=hidden_size
+             )
+             layer.post_attention_layernorm = LlamaAdaptiveRMSNorm(
+                 hidden_size, dim_cond=hidden_size
+             )
+
+         self.embed_tokens = None
+
+         self.post_init()
+
+         # self.reset_parameters()
+
+     def _prepare_decoder_attention_mask(
+         self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+     ):
+         # create noncausal mask
+         # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+         combined_attention_mask = None
+
+         def _expand_mask(
+             mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
+         ):
+             """
+             Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+             """
+             bsz, src_len = mask.size()
+             tgt_len = tgt_len if tgt_len is not None else src_len
+
+             expanded_mask = (
+                 mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+             )
+
+             inverted_mask = 1.0 - expanded_mask
+
+             return inverted_mask.masked_fill(
+                 inverted_mask.to(torch.bool), torch.finfo(dtype).min
+             )
+
+         if attention_mask is not None:
+             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+             expanded_attn_mask = _expand_mask(
+                 attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+             ).to(inputs_embeds.device)
+             combined_attention_mask = (
+                 expanded_attn_mask
+                 if combined_attention_mask is None
+                 else expanded_attn_mask + combined_attention_mask
+             )
+
+         return combined_attention_mask
+
+     def forward(
+         self,
+         x,
+         diffusion_step,
+         x_mask,
+         cond,
+         input_ids: torch.LongTensor = None,  # [num_quant, B, T]
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+
+         # retrieve some shape info
+         batch_size, seq_length, _ = x.shape
+
+         # condition mlp
+         cond_embedding = self.cond_mlp(cond)  # (B, T, C)
+
+         # condition mel
+         x = self.mel_mlp(x)
+
+         # diffusion step embedding
+         diffusion_step = self.diff_step_embedding(diffusion_step).to(x.device)
+         diffusion_step = self.diff_step_mlp(diffusion_step)  # (B, C)
+         x = x + cond_embedding
+
+         inputs_embeds = x
+         attention_mask = x_mask
+
+         output_attentions = (
+             output_attentions
+             if output_attentions is not None
+             else self.config.output_attentions
+         )
+         output_hidden_states = (
+             output_hidden_states
+             if output_hidden_states is not None
+             else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+
+         seq_length_with_past = seq_length
+         past_key_values_length = 0
+
+         if past_key_values is not None:
+             past_key_values_length = past_key_values[0][0].shape[2]
+             seq_length_with_past = seq_length_with_past + past_key_values_length
+
+         if position_ids is None:
+             device = input_ids.device if input_ids is not None else inputs_embeds.device
+             position_ids = torch.arange(
+                 past_key_values_length,
+                 seq_length + past_key_values_length,
+                 dtype=torch.long,
+                 device=device,
+             )
+             position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+         else:
+             position_ids = position_ids.view(-1, seq_length).long()
+
+         hidden_states = inputs_embeds
+
+         if self.gradient_checkpointing and self.training:
+             if use_cache:
+                 use_cache = False
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = () if use_cache else None
+
+         for idx, decoder_layer in enumerate(self.layers):
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             past_key_value = (
+                 past_key_values[idx] if past_key_values is not None else None
+             )
+
+             if self.gradient_checkpointing and self.training:
+                 raise NotImplementedError
+
+                 def create_custom_forward(module):
+                     def custom_forward(*inputs):
+                         # None for past_key_value
+                         return module(*inputs, output_attentions, None)
+
+                     return custom_forward
+
+                 layer_outputs = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(decoder_layer),
+                     hidden_states,
+                     attention_mask,
+                     position_ids,
+                     None,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     attention_mask=attention_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_value,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                     cond_embedding=diffusion_step,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states, cond_embedding=diffusion_step)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+
+         hidden_states = self.mel_out_mlp(hidden_states)
+         if not output_hidden_states:
+             return hidden_states
+         else:
+             return {
+                 "hidden_states": hidden_states,
+                 "all_hidden_states": all_hidden_states,
+             }
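
A quick shape check for the two building blocks above, illustrative only: the sinusoidal embedding lifts scalar timesteps to vectors, and the adaptive RMSNorm turns that vector into per-channel gains through a zero-initialized linear (so it starts out as a plain RMSNorm):

import torch
from anyaccomp.llama_nar import SinusoidalPosEmb, LlamaAdaptiveRMSNorm

t = torch.rand(4)  # (B,) diffusion timesteps in [0, 1)
emb = SinusoidalPosEmb(1024)(t)
print(emb.shape)  # torch.Size([4, 1024])

norm = LlamaAdaptiveRMSNorm(hidden_size=1024, dim_cond=1024)
h = torch.randn(4, 50, 1024)  # (B, T, C) hidden states
print(norm(h, cond_embedding=emb).shape)  # torch.Size([4, 50, 1024])
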
config/flow_matching.json ADDED
@@ -0,0 +1,74 @@
+ {
+     "model_type": "Sing2SongNoText",
+     "preprocess": {
+         "use_mel": true,
+         "sample_rate": 24000,
+         "n_fft": 1920,
+         "num_mels": 128,
+         "sampling_rate": 24000,
+         "hop_size": 480,
+         "hop_size_vocal": 480,
+         "hop_size_accompaniment": 480,
+         "win_size": 1920,
+         "fmin": 0,
+         "fmax": 12000,
+         "mel_var": 8.14,
+         "mel_mean": -4.92,
+
+         "chromagram": {
+             "naive": true,
+             "hop_size": 480,
+             "sample_rate": 24000,
+             "n_fft": 1920,
+             "num_mels": 128,
+             "win_size": 1920,
+             "fmin": 0,
+             "fmax": 12000,
+             "mel_var": 8.14,
+             "mel_mean": -4.92,
+             "f0_fmin": 50.0,
+             "f0_fmax": 1100.0
+         }
+     },
+     "model": {
+         "flow_matching_transformer": {
+             "vocab_size": 512,
+             "use_cond_code": true,
+             "mel_dim": 128,
+             "cond_dim": 768,
+             "hidden_size": 1024,
+             "num_layers": 10,
+             "num_heads": 16,
+             "cfg_scale": 0.2,
+             "prompt_prob": 0.,
+             "use_pretrained_model": false,
+             "sigma": 1e-5,
+             "time_scheduler": "cos",
+             "repa_loss": {
+                 "enable": true,
+                 "weight": 0.5,
+                 "repa_layer": 4,
+             },
+             "flash_attention": false,
+         },
+         "coco": {
+             "coco_type": "style", // content, style, or content_style
+             "downsample_rate": 1, // The original frame rate is 50 Hz, downsample to 6.25 Hz
+             "codebook_size": 512,
+             "hidden_size": 1024, // Representations Dim
+             "codebook_dim": 8,
+             "encoder": {
+                 "vocos_dim": 384,
+                 "vocos_intermediate_dim": 2048,
+                 "vocos_num_layers": 12,
+             },
+             "decoder": {
+                 "vocos_dim": 384,
+                 "vocos_intermediate_dim": 2048,
+                 "vocos_num_layers": 12,
+             },
+             "chromagram_dim": 24,
+             "pretrained_path": "./pretrained/vq"
+         },
+     },
+ }
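
Note that this file uses // comments and trailing commas, so it is JSON5 rather than strict JSON; the repo's load_config is presumed to tolerate that, but a plain json.load would not. A reading sketch, assuming the json5 package is available:

import json5

with open("config/flow_matching.json") as f:
    cfg = json5.load(f)

# sample_rate / hop_size = 24000 / 480 = 50 mel frames per second,
# matching the "hardcode 50Hz 20s" default in fmt_model.reverse_diffusion.
print(cfg["preprocess"]["sample_rate"] / cfg["preprocess"]["hop_size"])  # 50.0
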
config/vocoder.json ADDED
@@ -0,0 +1,52 @@
+ {
+     "model_type": "Vocoder",
+     "preprocess": {
+         "hop_size": 480,
+         "sample_rate": 24000,
+         "max_length": 36000,
+         "n_fft": 1920,
+         "num_mels": 128,
+         "win_size": 1920,
+         "fmin": 0,
+         "fmax": 12000,
+         "mel_var": 8.14,
+         "mel_mean": -4.92,
+         "processed_dir": "",
+         "valid_file": "valid.json",
+         "train_file": "train.json",
+         "use_phone_cond": false,
+         "use_emilia_101k": false
+     },
+     "model": {
+         "vocos": {
+             "input_channels": 128,
+             "dim": 1024,
+             "intermediate_dim": 4096,
+             "num_layers": 30,
+             "n_fft": 1920,
+             "hop_size": 480,
+             "padding": "same"
+         },
+         "period_gan": {
+             "max_downsample_channels": 1024,
+             "channels": 64,
+             "channel_increasing_factor": 2
+         },
+         "spec_gan": {
+             "stft_params": {
+                 "fft_sizes": [128, 256, 512, 1024, 2048],
+                 "hop_sizes": [32, 64, 128, 256, 512],
+                 "win_lengths": [128, 256, 512, 1024, 2048],
+                 "window": "hann_window"
+             },
+             "in_channels": 1,
+             "out_channels": 1,
+             "channels": 64,
+             "kernel_sizes": [5, 3],
+             "max_downsample_channels": 1024,
+             "down_scales": [2, 2, 2],
+             "use_weight_norm": true,
+             "use_complex": false
+         }
+     },
+ }
example/gradio/example1.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2860cf5b49b861f0770805c8cdda5b61276abc3931bb11140a5d6fa451418130
+ size 384580
example/gradio/example2.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a16962f991ca69af95c79f39050017bd14e6dfd2c11b7547a10cd2b123b5ea6
+ size 2646044
example/gradio/example3.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a96d209d00b50cb0b2ea0d985d4d94a5ea29b20874e9f91f4ed15235d4018ec
+ size 2646044
models/__init__.py ADDED
File without changes
models/codec/__init__.py ADDED
File without changes
models/codec/amphion_codec/.DS_Store ADDED
Binary file (6.15 kB).
 
models/codec/amphion_codec/quantize/__init__.py ADDED
@@ -0,0 +1,11 @@
+ # Copyright (c) 2024 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from models.codec.amphion_codec.quantize.factorized_vector_quantize import (
+     FactorizedVectorQuantize,
+ )
+ from models.codec.amphion_codec.quantize.vector_quantize import VectorQuantize
+ from models.codec.amphion_codec.quantize.lookup_free_quantize import LookupFreeQuantize
+ from models.codec.amphion_codec.quantize.residual_vq import ResidualVQ
models/codec/amphion_codec/quantize/factorized_vector_quantize.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) 2024 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from torch.nn.utils import weight_norm
+
+
+ def WNConv1d(*args, **kwargs):
+     return weight_norm(nn.Conv1d(*args, **kwargs))
+
+
+ def WNConvTranspose1d(*args, **kwargs):
+     return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+
+
+ class FactorizedVectorQuantize(nn.Module):
+     def __init__(
+         self,
+         input_dim,
+         codebook_size,
+         codebook_dim,
+         commitment=0.005,
+         codebook_loss_weight=1.0,
+         use_l2_normlize=True,
+     ):
+         super().__init__()
+         self.input_dim = input_dim
+         self.codebook_size = codebook_size
+         self.codebook_dim = codebook_dim
+         self.commitment = commitment
+         self.codebook_loss_weight = codebook_loss_weight
+         self.use_l2_normlize = use_l2_normlize
+
+         if self.input_dim != self.codebook_dim:
+             self.in_project = WNConv1d(self.input_dim, self.codebook_dim, kernel_size=1)
+             self.out_project = WNConv1d(
+                 self.codebook_dim, self.input_dim, kernel_size=1
+             )
+
+         else:
+             self.in_project = nn.Identity()
+             self.out_project = nn.Identity()
+
+         self.codebook = nn.Embedding(self.codebook_size, self.codebook_dim)
+
+     def forward(self, z):
+         """
+         Parameters
+         ----------
+         z: torch.Tensor[B x D x T]
+
+         Returns
+         -------
+         z_q: torch.Tensor[B x D x T]
+             Quantized continuous representation of input
+         commit_loss: Tensor[B]
+             Commitment loss to train encoder to predict vectors closer to codebook entries
+         codebook_loss: Tensor[B]
+             Codebook loss to update the codebook
+         indices: torch.Tensor[B x T]
+             Codebook indices (quantized discrete representation of input)
+         z_e: torch.Tensor[B x D x T]
+             Projected latents (continuous representation of input before quantization)
+         """
+
+         # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
+         z_e = self.in_project(z)
+         z_q, indices = self.decode_latents(z_e)
+
+         # Compute commitment loss and codebook loss
+         if self.training:
+             commit_loss = (
+                 F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
+                 * self.commitment
+             )
+             codebook_loss = (
+                 F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
+                 * self.codebook_loss_weight
+             )
+         else:
+             commit_loss = torch.zeros(z.shape[0], device=z.device)
+             codebook_loss = torch.zeros(z.shape[0], device=z.device)
+
+         z_q = z_e + (z_q - z_e).detach()
+
+         z_q = self.out_project(z_q)
+
+         return z_q, commit_loss, codebook_loss, indices, z_e
+
+     def embed_code(self, embed_id):
+         return F.embedding(embed_id, self.codebook.weight)
+
+     def decode_code(self, embed_id):
+         return self.embed_code(embed_id).transpose(1, 2)
+
+     def decode_latents(self, latents):
+         encodings = rearrange(latents, "b d t -> (b t) d")
+         codebook = self.codebook.weight
+
+         # L2 normalize encodings and codebook
+         if self.use_l2_normlize:
+             encodings = F.normalize(encodings)
+             codebook = F.normalize(codebook)
+
+         # Compute euclidean distance between encodings and codebook,
+         # if use_l2_normlize is True, the distance is equal to cosine distance
+         dist = (
+             encodings.pow(2).sum(1, keepdim=True)
+             - 2 * encodings @ codebook.t()
+             + codebook.pow(2).sum(1, keepdim=True).t()
+         )
+         indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
+         z_q = self.decode_code(indices)
+
+         return z_q, indices
+
+     def vq2emb(self, vq, out_proj=True):
+         emb = self.decode_code(vq)
+         if out_proj:
+             emb = self.out_project(emb)
+         return emb
+
+     def latent2dist(self, latents):
+         encodings = rearrange(latents, "b d t -> (b t) d")
+         codebook = self.codebook.weight
+
+         # L2 normalize encodings and codebook
+         if self.use_l2_normlize:
+             encodings = F.normalize(encodings)
+             codebook = F.normalize(codebook)
+
+         # Compute euclidean distance between encodings and codebook,
+         # if use_l2_normlize is True, the distance is equal to cosine distance
+         dist = (
+             encodings.pow(2).sum(1, keepdim=True)
+             - 2 * encodings @ codebook.t()
+             + codebook.pow(2).sum(1, keepdim=True).t()
+         )  # (b*t, k)
+
+         indices = rearrange((-dist).max(1)[1], "(b t) -> b t", b=latents.size(0))
+         dist = rearrange(dist, "(b t) k -> b t k", b=latents.size(0))
+         z_q = self.decode_code(indices)
+
+         return -dist, indices, z_q
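
A standalone shape check for the quantizer above (the values mirror the coco block in config/flow_matching.json; the example itself is illustrative, not part of the commit): 1024-d latents are projected into an 8-d codebook space, matched against 512 codes, and projected back:

import torch
from models.codec.amphion_codec.quantize import FactorizedVectorQuantize

vq = FactorizedVectorQuantize(input_dim=1024, codebook_size=512, codebook_dim=8)
z = torch.randn(2, 1024, 50)  # (B, D, T)
z_q, commit_loss, codebook_loss, indices, z_e = vq(z)
print(z_q.shape, indices.shape)  # torch.Size([2, 1024, 50]) torch.Size([2, 50])
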
models/codec/amphion_codec/quantize/lookup_free_quantize.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright (c) 2024 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from torch.nn.utils import weight_norm
+
+
+ def WNConv1d(*args, **kwargs):
+     return weight_norm(nn.Conv1d(*args, **kwargs))
+
+
+ def WNConvTranspose1d(*args, **kwargs):
+     return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+
+
+ class LookupFreeQuantize(nn.Module):
+     def __init__(
+         self,
+         input_dim,
+         codebook_size,
+         codebook_dim,
+     ):
+         super().__init__()
+         self.input_dim = input_dim
+         self.codebook_size = codebook_size
+         self.codebook_dim = codebook_dim
+
+         assert 2**codebook_dim == codebook_size
+
+         if self.input_dim != self.codebook_dim:
+             self.in_project = WNConv1d(self.input_dim, self.codebook_dim, kernel_size=1)
+             self.out_project = WNConv1d(
+                 self.codebook_dim, self.input_dim, kernel_size=1
+             )
+
+         else:
+             self.in_project = nn.Identity()
+             self.out_project = nn.Identity()
+
+     def forward(self, z):
+         z_e = self.in_project(z)
+         z_e = F.sigmoid(z_e)
+
+         z_q = z_e + (torch.round(z_e) - z_e).detach()
+
+         z_q = self.out_project(z_q)
+
+         commit_loss = torch.zeros(z.shape[0], device=z.device)
+         codebook_loss = torch.zeros(z.shape[0], device=z.device)
+
+         bits = (
+             2
+             ** torch.arange(self.codebook_dim, device=z.device)
+             .unsqueeze(0)
+             .unsqueeze(-1)
+             .long()
+         )  # (1, d, 1)
+         indices = (torch.round(z_e.clone().detach()).long() * bits).sum(1).long()
+
+         return z_q, commit_loss, codebook_loss, indices, z_e
+
+     def vq2emb(self, vq, out_proj=True):
+         emb = torch.zeros(
+             vq.shape[0], self.codebook_dim, vq.shape[-1], device=vq.device
+         )  # (B, d, T)
+         for i in range(self.codebook_dim):
+             emb[:, i, :] = (vq % 2).float()
+             vq = vq // 2
+         if out_proj:
+             emb = self.out_project(emb)
+         return emb
models/codec/amphion_codec/quantize/residual_vq.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ from einops import rearrange
13
+ from torch.nn.utils import weight_norm
14
+
15
+ from models.codec.amphion_codec.quantize.factorized_vector_quantize import (
16
+ FactorizedVectorQuantize,
17
+ )
18
+ from models.codec.amphion_codec.quantize.vector_quantize import VectorQuantize
19
+ from models.codec.amphion_codec.quantize.lookup_free_quantize import LookupFreeQuantize
20
+
21
+
22
+ class ResidualVQ(nn.Module):
23
+ """
24
+ Introduced in SoundStream: An end2end neural audio codec
25
+ https://arxiv.org/abs/2107.03312
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ input_dim: int = 256,
31
+ num_quantizers: int = 8,
32
+ codebook_size: int = 1024,
33
+ codebook_dim: int = 256,
34
+ quantizer_type: str = "vq", # "vq" or "fvq" or "lfq"
35
+ quantizer_dropout: float = 0.5,
36
+ **kwargs,
37
+ ):
38
+ super().__init__()
39
+
40
+ self.input_dim = input_dim
41
+ self.num_quantizers = num_quantizers
42
+ self.codebook_size = codebook_size
43
+ self.codebook_dim = codebook_dim
44
+ self.quantizer_type = quantizer_type
45
+ self.quantizer_dropout = quantizer_dropout
46
+
47
+ if quantizer_type == "vq":
48
+ VQ = VectorQuantize
49
+ elif quantizer_type == "fvq":
50
+ VQ = FactorizedVectorQuantize
51
+ elif quantizer_type == "lfq":
52
+ VQ = LookupFreeQuantize
53
+ else:
54
+ raise ValueError(f"Unknown quantizer type {quantizer_type}")
55
+
56
+ self.quantizers = nn.ModuleList(
57
+ [
58
+ VQ(
59
+ input_dim=input_dim,
60
+ codebook_size=codebook_size,
61
+ codebook_dim=codebook_dim,
62
+ **kwargs,
63
+ )
64
+ for _ in range(num_quantizers)
65
+ ]
66
+ )
67
+
68
+ def forward(self, z, n_quantizers: int = None):
69
+ """
70
+ Parameters
71
+ ----------
72
+ z : Tensor[B x D x T]
73
+ n_quantizers : int, optional
74
+ No. of quantizers to use
75
+ (n_quantizers < self.n_codebooks ex: for quantizer dropout)
76
+ Note: if `self.quantizer_dropout` is True, this argument is ignored
77
+ when in training mode, and a random number of quantizers is used.
78
+ Returns
79
+ -------
80
+ "quantized_out" : Tensor[B x D x T]
81
+ Quantized continuous representation of input
82
+ "all_indices" : Tensor[N x B x T]
83
+ Codebook indices for each codebook
84
+ (quantized discrete representation of input)
85
+ "all_commit_losses" : Tensor[N]
86
+ "all_codebook_losses" : Tensor[N]
87
+ "all_quantized" : Tensor[N x B x D x T]
88
+ """
89
+
90
+ quantized_out = 0.0
91
+ residual = z
92
+
93
+ all_commit_losses = []
94
+ all_codebook_losses = []
95
+ all_indices = []
96
+ all_quantized = []
97
+
98
+ if n_quantizers is None:
99
+ n_quantizers = self.num_quantizers
100
+
101
+ if self.training:
102
+ n_quantizers = torch.ones((z.shape[0],)) * self.num_quantizers + 1
103
+ dropout = torch.randint(1, self.num_quantizers + 1, (z.shape[0],))
104
+ n_dropout = int(z.shape[0] * self.quantizer_dropout)
105
+ n_quantizers[:n_dropout] = dropout[:n_dropout]
106
+ n_quantizers = n_quantizers.to(z.device)
107
+
108
+ for i, quantizer in enumerate(self.quantizers):
109
+ if self.training is False and i >= n_quantizers:
110
+ break
111
+
112
+ z_q_i, commit_loss_i, codebook_loss_i, indices_i, z_e_i = quantizer(
113
+ residual
114
+ )
115
+
116
+ # Create mask to apply quantizer dropout
117
+ mask = (
118
+ torch.full((z.shape[0],), fill_value=i, device=z.device) < n_quantizers
119
+ )
120
+ quantized_out = quantized_out + z_q_i * mask[:, None, None]
121
+ residual = residual - z_q_i
122
+
123
+ commit_loss_i = (commit_loss_i * mask).mean()
124
+ codebook_loss_i = (codebook_loss_i * mask).mean()
125
+
126
+ all_commit_losses.append(commit_loss_i)
127
+ all_codebook_losses.append(codebook_loss_i)
128
+ all_indices.append(indices_i)
129
+ all_quantized.append(z_q_i)
130
+
131
+ all_commit_losses, all_codebook_losses, all_indices, all_quantized = map(
132
+ torch.stack,
133
+ (all_commit_losses, all_codebook_losses, all_indices, all_quantized),
134
+ )
135
+
136
+ return (
137
+ quantized_out,
138
+ all_indices,
139
+ all_commit_losses,
140
+ all_codebook_losses,
141
+ all_quantized,
142
+ )
143
+
144
+ def vq2emb(self, vq, n_quantizers=None):
145
+ quantized_out = 0.0
146
+ if n_quantizers is None:
147
+ n_quantizers = self.num_quantizers
148
+ for idx, quantizer in enumerate(self.quantizers):
149
+ if idx >= n_quantizers:
150
+ break
151
+ quantized_out += quantizer.vq2emb(vq[idx])
152
+ return quantized_out
153
+
154
+ def latent2dist(self, z, n_quantizers=None):
155
+ quantized_out = 0.0
156
+ residual = z
157
+
158
+ all_dists = []
159
+ all_indices = []
160
+
161
+ if n_quantizers is None:
162
+ n_quantizers = self.num_quantizers
163
+
164
+ for i, quantizer in enumerate(self.quantizers):
165
+ if self.training is False and i >= n_quantizers:
166
+ break
167
+ dist_i, indices_i, z_q_i = quantizer.latent2dist(residual)
168
+ all_dists.append(dist_i)
169
+ all_indices.append(indices_i)
170
+
171
+ quantized_out = quantized_out + z_q_i
172
+ residual = residual - z_q_i
173
+
174
+ all_dists = torch.stack(all_dists)
175
+ all_indices = torch.stack(all_indices)
176
+
177
+ return all_dists, all_indices
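For reference, a minimal usage sketch of the residual quantizer above. The constructor kwargs mirror the call site in models/codec/coco/rep_coco_model.py further down; treat them as one illustrative configuration, not canonical defaults:

import torch
from models.codec.amphion_codec.quantize import ResidualVQ

rvq = ResidualVQ(
    input_dim=1024,
    num_quantizers=1,
    codebook_size=8192,
    codebook_dim=8,
    quantizer_type="fvq",
    quantizer_dropout=0.0,
    commitment=0.15,
    codebook_loss_weight=1.0,
    use_l2_normlize=True,
)
z = torch.randn(2, 1024, 150)  # [B, D, T] encoder latents
quantized_out, all_indices, commit_losses, codebook_losses, all_quantized = rvq(z)
# quantized_out: [2, 1024, 150]; all_indices: [1, 2, 150] (single codebook here)

In eval mode the loop applies the first n_quantizers codebooks and stops; in training mode the quantizer-dropout branch draws a per-sample cutoff instead, which is why the commit/codebook losses are masked per sample.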
models/codec/amphion_codec/quantize/vector_quantize.py ADDED
@@ -0,0 +1,401 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from einops import rearrange, repeat
11
+ from torch.nn.utils import weight_norm
12
+
13
+
14
+ def WNConv1d(*args, **kwargs):
15
+ return weight_norm(nn.Conv1d(*args, **kwargs))
16
+
17
+
18
+ def WNConvTranspose1d(*args, **kwargs):
19
+ return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
20
+
21
+
22
+ def l2norm(t):
23
+ return F.normalize(t, p=2, dim=-1)
24
+
25
+
26
+ def ema_inplace(moving_avg, new, decay):
27
+ moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay))
28
+
29
+
30
+ def laplace_smoothing(x, n_categories, eps=1e-5):
31
+ return (x + eps) / (x.sum() + n_categories * eps)
32
+
33
+
34
+ def sample_vectors(samples, num):
35
+ num_samples, device = samples.shape[0], samples.device
36
+
37
+ if num_samples >= num:
38
+ indices = torch.randperm(num_samples, device=device)[:num]
39
+ else:
40
+ indices = torch.randint(0, num_samples, (num,), device=device)
41
+
42
+ return samples[indices]
43
+
44
+
45
+ def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False):
46
+ dim, dtype, device = samples.shape[-1], samples.dtype, samples.device
47
+
48
+ means = sample_vectors(samples, num_clusters)
49
+
50
+ for _ in range(num_iters):
51
+ if use_cosine_sim:
52
+ dists = samples @ means.t()
53
+ else:
54
+ diffs = rearrange(samples, "n d -> n () d") - rearrange(
55
+ means, "c d -> () c d"
56
+ )
57
+ dists = -(diffs**2).sum(dim=-1)
58
+
59
+ buckets = dists.max(dim=-1).indices
60
+ bins = torch.bincount(buckets, minlength=num_clusters)
61
+ zero_mask = bins == 0
62
+ bins_min_clamped = bins.masked_fill(zero_mask, 1)
63
+
64
+ new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype)
65
+ new_means.scatter_add_(0, repeat(buckets, "n -> n d", d=dim), samples)
66
+ new_means = new_means / bins_min_clamped[..., None]
67
+
68
+ if use_cosine_sim:
69
+ new_means = l2norm(new_means)
70
+
71
+ means = torch.where(zero_mask[..., None], means, new_means)
72
+
73
+ return means, bins
74
+
75
+
76
+ class EuclideanCodebook(nn.Module):
77
+ def __init__(
78
+ self,
79
+ dim,
80
+ codebook_size,
81
+ kmeans_init=False,
82
+ kmeans_iters=10,
83
+ decay=0.8,
84
+ eps=1e-5,
85
+ threshold_ema_dead_code=2,
86
+ weight_init=False,
87
+ ):
88
+ super().__init__()
89
+
90
+ self.decay = decay
91
+ init_fn = torch.randn if not weight_init else torch.zeros
92
+ embed = init_fn(codebook_size, dim)
93
+
94
+ if weight_init:
95
+ nn.init.uniform_(embed, -1 / codebook_size, 1 / codebook_size)
96
+
97
+ self.codebook_size = codebook_size
98
+ self.kmeans_iters = kmeans_iters
99
+ self.eps = eps
100
+ self.threshold_ema_dead_code = threshold_ema_dead_code
101
+
102
+ self.register_buffer(
103
+ "initted", torch.Tensor([not kmeans_init])
104
+ ) # if kmeans_init is True, then initted is False; otherwise, initted is True
105
+ self.register_buffer("cluster_size", torch.zeros(codebook_size))
106
+ self.register_buffer("embed", embed)
107
+ self.register_buffer("embed_avg", embed.clone())
108
+
109
+ def init_embed_(self, data):
110
+ embed, cluster_size = kmeans(data, self.codebook_size, self.kmeans_iters)
111
+ self.embed.data.copy_(embed)
112
+ self.embed_avg.data.copy_(embed)
113
+ self.cluster_size.data.copy_(cluster_size)
114
+ self.initted.data.copy_(torch.Tensor([True]))
115
+
116
+ def replace(self, samples, mask):
117
+ modified_codebook = torch.where(
118
+ mask[..., None], sample_vectors(samples, self.codebook_size), self.embed
119
+ )
120
+ self.embed.data.copy_(modified_codebook)
121
+
122
+ def expire_codes_(self, batch_samples):
123
+ if self.threshold_ema_dead_code == 0:
124
+ return
125
+
126
+ expired_codes = self.cluster_size < self.threshold_ema_dead_code
127
+ if not torch.any(expired_codes):
128
+ return
129
+ batch_samples = rearrange(batch_samples, "... d -> (...) d")
130
+ self.replace(batch_samples, mask=expired_codes)
131
+
132
+ def forward(self, x):
133
+ shape, dtype = x.shape, x.dtype
134
+ flatten = rearrange(x, "... d -> (...) d")
135
+ embed = self.embed.t() # (codebook_size, dim) -> (dim, codebook_size)
136
+
137
+ if not self.initted:
138
+ self.init_embed_(flatten)
139
+
140
+ dist = -(
141
+ flatten.pow(2).sum(1, keepdim=True)
142
+ - 2 * flatten @ embed
143
+ + embed.pow(2).sum(0, keepdim=True)
144
+ )
145
+
146
+ embed_ind = dist.max(dim=-1).indices
147
+ embed_onehot = F.one_hot(embed_ind, self.codebook_size).type(dtype)
148
+ embed_ind = embed_ind.view(*shape[:-1])
149
+ quantize = F.embedding(embed_ind, self.embed)
150
+
151
+ if self.training:
152
+ ema_inplace(self.cluster_size, embed_onehot.sum(0), self.decay)
153
+ embed_sum = (
154
+ flatten.t() @ embed_onehot
155
+ ) # (dim, ...) @ (..., codebook_size) -> (dim, codebook_size)
156
+ ema_inplace(self.embed_avg, embed_sum.t(), self.decay)
157
+ cluster_size = (
158
+ laplace_smoothing(self.cluster_size, self.codebook_size, self.eps)
159
+ * self.cluster_size.sum()
160
+ )
161
+ embed_normalized = self.embed_avg / cluster_size.unsqueeze(1)
162
+ self.embed.data.copy_(embed_normalized)
163
+ self.expire_codes_(x)
164
+
165
+ return quantize, embed_ind
166
+
167
+ def vq2emb(self, vq):
168
+ quantize = F.embedding(vq, self.embed)
169
+ return quantize
170
+
171
+ def latent2dist(self, x):
172
+ shape, dtype = x.shape, x.dtype
173
+ flatten = rearrange(x, "... d -> (...) d")
174
+ embed = self.embed.t() # (codebook_size, dim) -> (dim, codebook_size)
175
+
176
+ if not self.initted:
177
+ self.init_embed_(flatten)
178
+
179
+ dist = -(
180
+ flatten.pow(2).sum(1, keepdim=True)
181
+ - 2 * flatten @ embed
182
+ + embed.pow(2).sum(0, keepdim=True)
183
+ )
184
+
185
+ embed_ind = dist.max(dim=-1).indices
186
+ embed_ind = embed_ind.view(*shape[:-1])
187
+ quantize = F.embedding(embed_ind, self.embed)
188
+
189
+ dist = dist.view(*shape[:-1], -1)
190
+
191
+ return dist, embed_ind, quantize
192
+
193
+
194
+ class SimpleCodebook(nn.Module):
195
+ def __init__(
196
+ self,
197
+ dim,
198
+ codebook_size,
199
+ use_l2_normlize=False,
200
+ ):
201
+ super().__init__()
202
+
203
+ self.dim = dim
204
+ self.codebook_size = codebook_size
205
+ self.use_l2_normlize = use_l2_normlize
206
+
207
+ self.embed = nn.Embedding(self.codebook_size, self.dim)
208
+
209
+ def forward(self, x):
210
+ shape, dtype = x.shape, x.dtype
211
+ flatten = rearrange(x, "... d -> (...) d")
212
+ embed = self.embed.weight.t() # (codebook_size, dim) -> (dim, codebook_size)
213
+
214
+ if self.use_l2_normlize:
215
+ flatten = F.normalize(flatten)
216
+ embed = F.normalize(embed)
217
+
218
+ dist = -(
219
+ flatten.pow(2).sum(1, keepdim=True)
220
+ - 2 * flatten @ embed
221
+ + embed.pow(2).sum(0, keepdim=True)
222
+ )
223
+
224
+ embed_ind = dist.max(dim=-1).indices
225
+ embed_ind = embed_ind.view(*shape[:-1])
226
+ quantize = F.embedding(embed_ind, self.embed.weight)  # embed is an nn.Embedding; F.embedding needs its weight tensor
227
+
228
+ return quantize, embed_ind
229
+
230
+ def vq2emb(self, vq):
231
+ quantize = F.embedding(vq, self.embed.weight)
232
+ return quantize
233
+
234
+ def latent2dist(self, x):
235
+ shape, dtype = x.shape, x.dtype
236
+ flatten = rearrange(x, "... d -> (...) d")
237
+ embed = self.embed.weight.t() # (codebook_size, dim) -> (dim, codebook_size)
238
+
239
+ if self.use_l2_normlize:
240
+ flatten = F.normalize(flatten)
241
+ embed = F.normalize(embed)
242
+
243
+ dist = -(
244
+ flatten.pow(2).sum(1, keepdim=True)
245
+ - 2 * flatten @ embed
246
+ + embed.pow(2).sum(0, keepdim=True)
247
+ )
248
+
249
+ embed_ind = dist.max(dim=-1).indices
250
+ embed_ind = embed_ind.view(*shape[:-1])
251
+ quantize = F.embedding(embed_ind, self.embed.weight)  # use the weight tensor, as in forward
252
+
253
+ dist = dist.view(*shape[:-1], -1)
254
+
255
+ return dist, embed_ind, quantize
256
+
257
+
258
+ class VectorQuantize(nn.Module):
259
+ """Vector quantization and factorized vecotor quantization implementation
260
+ Args:
261
+ input_dim (int): Dimension of input.
262
+ codebook_size (int): Codebook size.
263
+ codebook_dim (int): Codebook dimension. We suggest use codebook_dim = input_dim
264
+ if use codebook_type == "euclidean", otherwise, if you want to use
265
+ factorized vector quantization, use codebook_dim as small number (e.g. 8 or 32).
266
+ commitment (float): Weight for commitment loss.
267
+ use_l2_normlize (bool): Whether to use l2 normlized codes for factorized vecotor quantization,
268
+ we suggest use it as True if you want to use factorized vector quantization
269
+ kmeans_init (bool): Whether to use kmeans to initialize the codebooks.
270
+ kmeans_iters (int): Number of iterations used for kmeans initialization.
271
+ decay (float): Decay for exponential moving average over the codebooks.
272
+ epsilon (float): Epsilon value for numerical stability.
273
+ threshold_ema_dead_code (int): Threshold for dead code expiration. Replace any codes
274
+ that have an exponential moving average cluster size less than the specified threshold with
275
+ a randomly selected vector from the current batch.
276
+ """
277
+
278
+ def __init__(
279
+ self,
280
+ input_dim,
281
+ codebook_size,
282
+ codebook_dim,
283
+ commitment=0.005,
284
+ codebook_loss_weight=1.0,
285
+ use_l2_normlize=False,
286
+ codebook_type="euclidean", # "euclidean" or "simple"
287
+ kmeans_init=False,
288
+ kmeans_iters=10,
289
+ decay=0.8,
290
+ eps=1e-5,
291
+ threshold_ema_dead_code=2,
292
+ weight_init=False,
293
+ ):
294
+ super().__init__()
295
+ self.input_dim = input_dim
296
+ self.codebook_size = codebook_size
297
+ self.codebook_dim = codebook_dim
298
+ self.commitment = commitment
299
+ self.codebook_loss_weight = codebook_loss_weight
300
+ self.use_l2_normlize = use_l2_normlize
301
+ self.codebook_type = codebook_type
302
+ self.kmeans_init = kmeans_init
303
+ self.kmeans_iters = kmeans_iters
304
+ self.decay = decay
305
+ self.eps = eps
306
+ self.threshold_ema_dead_code = threshold_ema_dead_code
307
+ self.weight_init = weight_init
308
+
309
+ if self.input_dim != self.codebook_dim:
310
+ self.in_project = WNConv1d(self.input_dim, self.codebook_dim, kernel_size=1)
311
+ self.out_project = WNConv1d(
312
+ self.codebook_dim, self.input_dim, kernel_size=1
313
+ )
314
+
315
+ else:
316
+ self.in_project = nn.Identity()
317
+ self.out_project = nn.Identity()
318
+
319
+ if self.codebook_type == "euclidean":
320
+ self.codebook = EuclideanCodebook(
321
+ self.codebook_dim,
322
+ codebook_size=self.codebook_size,
323
+ kmeans_init=self.kmeans_init,
324
+ kmeans_iters=self.kmeans_iters,
325
+ decay=self.decay,
326
+ eps=self.eps,
327
+ threshold_ema_dead_code=self.threshold_ema_dead_code,
328
+ weight_init=self.weight_init,
329
+ )
330
+ elif self.codebook_type == "simple":
331
+ self.codebook = SimpleCodebook(
332
+ self.codebook_dim,
333
+ codebook_size=self.codebook_size,
334
+ use_l2_normlize=self.use_l2_normlize,
335
+ )
336
+ else:
337
+ raise NotImplementedError(
338
+ f"codebook_type {self.codebook_type} is not implemented!"
339
+ )
340
+
341
+ def forward(self, z):
342
+ """
343
+ Parameters
344
+ ----------
345
+ z: torch.Tensor[B x D x T]
346
+
347
+ Returns
348
+ -------
349
+ z_q: torch.Tensor[B x D x T]
350
+ Quantized continuous representation of input
351
+ commit_loss: Tensor[B]
352
+ Commitment loss to train encoder to predict vectors closer to codebook entries
353
+ codebook_loss: Tensor[B]
354
+ Codebook loss to update the codebook
355
+ indices: torch.Tensor[B x T]
356
+ Codebook indices (quantized discrete representation of input)
357
+ z_e: torch.Tensor[B x D x T]
358
+ Projected latents (continuous representation of input before quantization)
359
+ """
360
+
361
+ # Factorized codes project input into low-dimensional space if self.input_dim != self.codebook_dim
362
+ z_e = self.in_project(z)
363
+ z_q, indices = self.decode_latents(z_e)
364
+
365
+ # Compute commitment loss and codebook loss
366
+ if self.training:
367
+ commit_loss = (
368
+ F.mse_loss(z_e, z_q.detach(), reduction="none").mean([1, 2])
369
+ * self.commitment
370
+ )
371
+ codebook_loss = (
372
+ F.mse_loss(z_q, z_e.detach(), reduction="none").mean([1, 2])
373
+ * self.codebook_loss_weight
374
+ )
375
+ else:
376
+ commit_loss = torch.zeros(z.shape[0], device=z.device)
377
+ codebook_loss = torch.zeros(z.shape[0], device=z.device)
378
+
379
+ z_q = z_e + (z_q - z_e).detach()
380
+
381
+ z_q = self.out_project(z_q)
382
+
383
+ return z_q, commit_loss, codebook_loss, indices, z_e
384
+
385
+ def decode_latents(self, latents):
386
+ encodings = rearrange(latents, "b d t -> b t d")
387
+ z_q, indices = self.codebook(encodings)
388
+ z_q = z_q.transpose(1, 2)
389
+ return z_q, indices
390
+
391
+ def vq2emb(self, vq, out_proj=True):
392
+ emb = self.codebook.vq2emb(vq)
393
+ emb = emb.transpose(1, 2)
394
+ if out_proj:
395
+ emb = self.out_project(emb)
396
+ return emb
397
+
398
+ def latent2dist(self, latents):
399
+ latents = rearrange(latents, "b d t -> b t d")
400
+ dist, embed_ind, quantize = self.codebook.latent2dist(latents)
401
+ return dist, embed_ind, quantize.transpose(1, 2)
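A quick shape sanity check for the quantizer above (the dimensions are illustrative only):

import torch

vq = VectorQuantize(input_dim=256, codebook_size=1024, codebook_dim=8)
z = torch.randn(4, 256, 100)  # [B, D, T]
z_q, commit_loss, codebook_loss, indices, z_e = vq(z)
# z_q: [4, 256, 100] (back in input_dim after out_project); indices: [4, 100]

Note the straight-through estimator in forward, z_q = z_e + (z_q - z_e).detach(): the forward value is the quantized code, while the gradient bypasses the non-differentiable nearest-neighbour lookup and flows directly into the encoder.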
models/codec/amphion_codec/vocos.py ADDED
@@ -0,0 +1,881 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Optional, Tuple
7
+
8
+ import numpy as np
9
+ import scipy
10
+ import torch
11
+ from torch import nn, view_as_real, view_as_complex
13
+ from torch.nn.utils import weight_norm, remove_weight_norm
14
+ from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz
15
+ import librosa
16
+
17
+
18
+ def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
19
+ """
20
+ Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
21
+
22
+ Args:
23
+ x (Tensor): Input tensor.
24
+ clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
25
+
26
+ Returns:
27
+ Tensor: Element-wise logarithm of the input tensor with clipping applied.
28
+ """
29
+ return torch.log(torch.clip(x, min=clip_val))
30
+
31
+
32
+ def symlog(x: torch.Tensor) -> torch.Tensor:
33
+ return torch.sign(x) * torch.log1p(x.abs())
34
+
35
+
36
+ def symexp(x: torch.Tensor) -> torch.Tensor:
37
+ return torch.sign(x) * (torch.exp(x.abs()) - 1)
38
+
39
+
40
+ class STFT(nn.Module):
41
+ def __init__(
42
+ self,
43
+ n_fft: int,
44
+ hop_length: int,
45
+ win_length: int,
46
+ center=True,
47
+ ):
48
+ super().__init__()
49
+ self.center = center
50
+ self.n_fft = n_fft
51
+ self.hop_length = hop_length
52
+ self.win_length = win_length
53
+ window = torch.hann_window(win_length)
54
+ self.register_buffer("window", window)
55
+
56
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
57
+ # x: (B, T * hop_length)
58
+
59
+ if not self.center:
60
+ pad = self.win_length - self.hop_length
61
+ x = torch.nn.functional.pad(x, (pad // 2, pad // 2), mode="reflect")
62
+
63
+ stft_spec = torch.stft(
64
+ x,
65
+ self.n_fft,
66
+ hop_length=self.hop_length,
67
+ win_length=self.win_length,
68
+ window=self.window,
69
+ center=self.center,
70
+ return_complex=False,
71
+ ) # (B, n_fft // 2 + 1, T, 2)
72
+
73
+ rea = stft_spec[:, :, :, 0]  # (B, n_fft // 2 + 1, T)
74
+ imag = stft_spec[:, :, :, 1]  # (B, n_fft // 2 + 1, T)
75
+
76
+ log_mag = torch.log(
77
+ torch.abs(torch.sqrt(torch.pow(rea, 2) + torch.pow(imag, 2))) + 1e-5
78
+ ) # (B, n_fft // 2 + 1, T)
79
+ phase = torch.atan2(imag, rea) # (B, n_fft // 2 + 1, T)
80
+
81
+ return log_mag, phase
82
+
83
+
84
+ class ISTFT(nn.Module):
85
+ """
86
+ Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
87
+ windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
88
+ See issue: https://github.com/pytorch/pytorch/issues/62323
89
+ Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
90
+ The NOLA constraint is met as we trim padded samples anyway.
91
+
92
+ Args:
93
+ n_fft (int): Size of Fourier transform.
94
+ hop_length (int): The distance between neighboring sliding window frames.
95
+ win_length (int): The size of window frame and STFT filter.
96
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
97
+ """
98
+
99
+ def __init__(
100
+ self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
101
+ ):
102
+ super().__init__()
103
+ if padding not in ["center", "same"]:
104
+ raise ValueError("Padding must be 'center' or 'same'.")
105
+ self.padding = padding
106
+ self.n_fft = n_fft
107
+ self.hop_length = hop_length
108
+ self.win_length = win_length
109
+ window = torch.hann_window(win_length)
110
+ self.register_buffer("window", window)
111
+
112
+ def forward(self, spec: torch.Tensor) -> torch.Tensor:
113
+ """
114
+ Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
115
+
116
+ Args:
117
+ spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
118
+ N is the number of frequency bins, and T is the number of time frames.
119
+
120
+ Returns:
121
+ Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
122
+ """
123
+ if self.padding == "center":
124
+ # Fallback to pytorch native implementation
125
+ return torch.istft(
126
+ spec,
127
+ self.n_fft,
128
+ self.hop_length,
129
+ self.win_length,
130
+ self.window,
131
+ center=True,
132
+ )
133
+ elif self.padding == "same":
134
+ pad = (self.win_length - self.hop_length) // 2
135
+ else:
136
+ raise ValueError("Padding must be 'center' or 'same'.")
137
+
138
+ assert spec.dim() == 3, "Expected a 3D tensor as input"
139
+ B, N, T = spec.shape
140
+
141
+ # Inverse FFT
142
+ ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
143
+ ifft = ifft * self.window[None, :, None]
144
+
145
+ # Overlap and Add
146
+ output_size = (T - 1) * self.hop_length + self.win_length
147
+ y = torch.nn.functional.fold(
148
+ ifft,
149
+ output_size=(1, output_size),
150
+ kernel_size=(1, self.win_length),
151
+ stride=(1, self.hop_length),
152
+ )[:, 0, 0, pad:-pad]
153
+
154
+ # Window envelope
155
+ window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
156
+ window_envelope = torch.nn.functional.fold(
157
+ window_sq,
158
+ output_size=(1, output_size),
159
+ kernel_size=(1, self.win_length),
160
+ stride=(1, self.hop_length),
161
+ ).squeeze()[pad:-pad]
162
+
163
+ # Normalize
164
+ assert (window_envelope > 1e-11).all()
165
+ y = y / window_envelope
166
+
167
+ return y
168
+
169
+
170
+ class MDCT(nn.Module):
171
+ """
172
+ Modified Discrete Cosine Transform (MDCT) module.
173
+
174
+ Args:
175
+ frame_len (int): Length of the MDCT frame.
176
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
177
+ """
178
+
179
+ def __init__(self, frame_len: int, padding: str = "same"):
180
+ super().__init__()
181
+ if padding not in ["center", "same"]:
182
+ raise ValueError("Padding must be 'center' or 'same'.")
183
+ self.padding = padding
184
+ self.frame_len = frame_len
185
+ N = frame_len // 2
186
+ n0 = (N + 1) / 2
187
+ window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
188
+ self.register_buffer("window", window)
189
+
190
+ pre_twiddle = torch.exp(-1j * torch.pi * torch.arange(frame_len) / frame_len)
191
+ post_twiddle = torch.exp(-1j * torch.pi * n0 * (torch.arange(N) + 0.5) / N)
192
+ # view_as_real: NCCL Backend does not support ComplexFloat data type
193
+ # https://github.com/pytorch/pytorch/issues/71613
194
+ self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
195
+ self.register_buffer("post_twiddle", view_as_real(post_twiddle))
196
+
197
+ def forward(self, audio: torch.Tensor) -> torch.Tensor:
198
+ """
199
+ Apply the Modified Discrete Cosine Transform (MDCT) to the input audio.
200
+
201
+ Args:
202
+ audio (Tensor): Input audio waveform of shape (B, T), where B is the batch size
203
+ and T is the length of the audio.
204
+
205
+ Returns:
206
+ Tensor: MDCT coefficients of shape (B, L, N), where L is the number of output frames
207
+ and N is the number of frequency bins.
208
+ """
209
+ if self.padding == "center":
210
+ audio = torch.nn.functional.pad(
211
+ audio, (self.frame_len // 2, self.frame_len // 2)
212
+ )
213
+ elif self.padding == "same":
214
+ # hop_length is 1/2 frame_len
215
+ audio = torch.nn.functional.pad(
216
+ audio, (self.frame_len // 4, self.frame_len // 4)
217
+ )
218
+ else:
219
+ raise ValueError("Padding must be 'center' or 'same'.")
220
+
221
+ x = audio.unfold(-1, self.frame_len, self.frame_len // 2)
222
+ N = self.frame_len // 2
223
+ x = x * self.window.expand(x.shape)
224
+ X = torch.fft.fft(
225
+ x * view_as_complex(self.pre_twiddle).expand(x.shape), dim=-1
226
+ )[..., :N]
227
+ res = X * view_as_complex(self.post_twiddle).expand(X.shape) * np.sqrt(1 / N)
228
+ return torch.real(res) * np.sqrt(2)
229
+
230
+
231
+ class IMDCT(nn.Module):
232
+ """
233
+ Inverse Modified Discrete Cosine Transform (IMDCT) module.
234
+
235
+ Args:
236
+ frame_len (int): Length of the MDCT frame.
237
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
238
+ """
239
+
240
+ def __init__(self, frame_len: int, padding: str = "same"):
241
+ super().__init__()
242
+ if padding not in ["center", "same"]:
243
+ raise ValueError("Padding must be 'center' or 'same'.")
244
+ self.padding = padding
245
+ self.frame_len = frame_len
246
+ N = frame_len // 2
247
+ n0 = (N + 1) / 2
248
+ window = torch.from_numpy(scipy.signal.cosine(frame_len)).float()
249
+ self.register_buffer("window", window)
250
+
251
+ pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N)
252
+ post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2))
253
+ self.register_buffer("pre_twiddle", view_as_real(pre_twiddle))
254
+ self.register_buffer("post_twiddle", view_as_real(post_twiddle))
255
+
256
+ def forward(self, X: torch.Tensor) -> torch.Tensor:
257
+ """
258
+ Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients.
259
+
260
+ Args:
261
+ X (Tensor): Input MDCT coefficients of shape (B, L, N), where B is the batch size,
262
+ L is the number of frames, and N is the number of frequency bins.
263
+
264
+ Returns:
265
+ Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio.
266
+ """
267
+ B, L, N = X.shape
268
+ Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device)
269
+ Y[..., :N] = X
270
+ Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,)))
271
+ y = torch.fft.ifft(
272
+ Y * view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1
273
+ )
274
+ y = (
275
+ torch.real(y * view_as_complex(self.post_twiddle).expand(y.shape))
276
+ * np.sqrt(N)
277
+ * np.sqrt(2)
278
+ )
279
+ result = y * self.window.expand(y.shape)
280
+ output_size = (1, (L + 1) * N)
281
+ audio = torch.nn.functional.fold(
282
+ result.transpose(1, 2),
283
+ output_size=output_size,
284
+ kernel_size=(1, self.frame_len),
285
+ stride=(1, self.frame_len // 2),
286
+ )[:, 0, 0, :]
287
+
288
+ if self.padding == "center":
289
+ pad = self.frame_len // 2
290
+ elif self.padding == "same":
291
+ pad = self.frame_len // 4
292
+ else:
293
+ raise ValueError("Padding must be 'center' or 'same'.")
294
+
295
+ audio = audio[:, pad:-pad]
296
+ return audio
297
+
298
+
299
+ class FourierHead(nn.Module):
300
+ """Base class for inverse fourier modules."""
301
+
302
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
303
+ """
304
+ Args:
305
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
306
+ L is the sequence length, and H denotes the model dimension.
307
+
308
+ Returns:
309
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
310
+ """
311
+ raise NotImplementedError("Subclasses must implement the forward method.")
312
+
313
+
314
+ class ISTFTHead(FourierHead):
315
+ """
316
+ ISTFT Head module for predicting STFT complex coefficients.
317
+
318
+ Args:
319
+ dim (int): Hidden dimension of the model.
320
+ n_fft (int): Size of Fourier transform.
321
+ hop_length (int): The distance between neighboring sliding window frames, which should align with
322
+ the resolution of the input features.
323
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
324
+ """
325
+
326
+ def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
327
+ super().__init__()
328
+ out_dim = n_fft + 2
329
+ self.out = torch.nn.Linear(dim, out_dim)
330
+ self.istft = ISTFT(
331
+ n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
332
+ )
333
+
334
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
335
+ """
336
+ Forward pass of the ISTFTHead module.
337
+
338
+ Args:
339
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
340
+ L is the sequence length, and H denotes the model dimension.
341
+
342
+ Returns:
343
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
344
+ """
345
+ x = self.out(x).transpose(1, 2)
346
+ mag, p = x.chunk(2, dim=1)
347
+ mag = torch.exp(mag)
348
+ mag = torch.clip(
349
+ mag, max=1e2
350
+ ) # safeguard to prevent excessively large magnitudes
351
+ # phase wrapping happens here; these two lines produce the real and imaginary parts
352
+ x = torch.cos(p)
353
+ y = torch.sin(p)
354
+ # recalculating phase here does not produce anything new
355
+ # only costs time
356
+ # phase = torch.atan2(y, x)
357
+ # S = mag * torch.exp(phase * 1j)
358
+ # better directly produce the complex value
359
+ S = mag * (x + 1j * y)
360
+ audio = self.istft(S)
361
+ return audio
362
+
363
+
364
+ class IMDCTSymExpHead(FourierHead):
365
+ """
366
+ IMDCT Head module for predicting MDCT coefficients with symmetric exponential function
367
+
368
+ Args:
369
+ dim (int): Hidden dimension of the model.
370
+ mdct_frame_len (int): Length of the MDCT frame.
371
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
372
+ sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized
373
+ based on perceptual scaling. Defaults to None.
374
+ clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
375
+ """
376
+
377
+ def __init__(
378
+ self,
379
+ dim: int,
380
+ mdct_frame_len: int,
381
+ padding: str = "same",
382
+ sample_rate: Optional[int] = None,
383
+ clip_audio: bool = False,
384
+ ):
385
+ super().__init__()
386
+ out_dim = mdct_frame_len // 2
387
+ self.out = nn.Linear(dim, out_dim)
388
+ self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
389
+ self.clip_audio = clip_audio
390
+
391
+ if sample_rate is not None:
392
+ # optionally init the last layer following mel-scale
393
+ m_max = _hz_to_mel(sample_rate // 2)
394
+ m_pts = torch.linspace(0, m_max, out_dim)
395
+ f_pts = _mel_to_hz(m_pts)
396
+ scale = 1 - (f_pts / f_pts.max())
397
+
398
+ with torch.no_grad():
399
+ self.out.weight.mul_(scale.view(-1, 1))
400
+
401
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
402
+ """
403
+ Forward pass of the IMDCTSymExpHead module.
404
+
405
+ Args:
406
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
407
+ L is the sequence length, and H denotes the model dimension.
408
+
409
+ Returns:
410
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
411
+ """
412
+ x = self.out(x)
413
+ x = symexp(x)
414
+ x = torch.clip(
415
+ x, min=-1e2, max=1e2
416
+ ) # safeguard to prevent excessively large magnitudes
417
+ audio = self.imdct(x)
418
+ if self.clip_audio:
419
+ audio = torch.clip(audio, min=-1.0, max=1.0)  # clip the reconstructed waveform, not the pre-IMDCT input
420
+
421
+ return audio
422
+
423
+
424
+ class IMDCTCosHead(FourierHead):
425
+ """
426
+ IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p)
427
+
428
+ Args:
429
+ dim (int): Hidden dimension of the model.
430
+ mdct_frame_len (int): Length of the MDCT frame.
431
+ padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
432
+ clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
433
+ """
434
+
435
+ def __init__(
436
+ self,
437
+ dim: int,
438
+ mdct_frame_len: int,
439
+ padding: str = "same",
440
+ clip_audio: bool = False,
441
+ ):
442
+ super().__init__()
443
+ self.clip_audio = clip_audio
444
+ self.out = nn.Linear(dim, mdct_frame_len)
445
+ self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
446
+
447
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
448
+ """
449
+ Forward pass of the IMDCTCosHead module.
450
+
451
+ Args:
452
+ x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
453
+ L is the sequence length, and H denotes the model dimension.
454
+
455
+ Returns:
456
+ Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
457
+ """
458
+ x = self.out(x)
459
+ m, p = x.chunk(2, dim=2)
460
+ m = torch.exp(m).clip(
461
+ max=1e2
462
+ ) # safeguard to prevent excessively large magnitudes
463
+ audio = self.imdct(m * torch.cos(p))
464
+ if self.clip_audio:
465
+ audio = torch.clip(audio, min=-1.0, max=1.0)  # clip the reconstructed waveform, not the raw projection
466
+ return audio
467
+
468
+
469
+ class ConvNeXtBlock(nn.Module):
470
+ """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
471
+
472
+ Args:
473
+ dim (int): Number of input channels.
474
+ intermediate_dim (int): Dimensionality of the intermediate layer.
475
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
476
+ Defaults to None.
477
+ adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
478
+ None means non-conditional LayerNorm. Defaults to None.
479
+ """
480
+
481
+ def __init__(
482
+ self,
483
+ dim: int,
484
+ intermediate_dim: int,
485
+ layer_scale_init_value: float,
486
+ adanorm_num_embeddings: Optional[int] = None,
487
+ ):
488
+ super().__init__()
489
+ self.dwconv = nn.Conv1d(
490
+ dim, dim, kernel_size=7, padding=3, groups=dim
491
+ ) # depthwise conv
492
+ self.adanorm = adanorm_num_embeddings is not None
493
+ if adanorm_num_embeddings:
494
+ self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
495
+ else:
496
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
497
+ self.pwconv1 = nn.Linear(
498
+ dim, intermediate_dim
499
+ ) # pointwise/1x1 convs, implemented with linear layers
500
+ self.act = nn.GELU()
501
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
502
+ self.gamma = (
503
+ nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
504
+ if layer_scale_init_value > 0
505
+ else None
506
+ )
507
+
508
+ def forward(
509
+ self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
510
+ ) -> torch.Tensor:
511
+ residual = x
512
+ x = self.dwconv(x)
513
+ x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
514
+ if self.adanorm:
515
+ assert cond_embedding_id is not None
516
+ x = self.norm(x, cond_embedding_id)
517
+ else:
518
+ x = self.norm(x)
519
+ x = self.pwconv1(x)
520
+ x = self.act(x)
521
+ x = self.pwconv2(x)
522
+ if self.gamma is not None:
523
+ x = self.gamma * x
524
+ x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
525
+
526
+ x = residual + x
527
+ return x
528
+
529
+
530
+ class AdaLayerNorm(nn.Module):
531
+ """
532
+ Adaptive Layer Normalization module with learnable embeddings per `num_embeddings` classes
533
+
534
+ Args:
535
+ num_embeddings (int): Number of embeddings.
536
+ embedding_dim (int): Dimension of the embeddings.
537
+ """
538
+
539
+ def __init__(self, num_embeddings: int, embedding_dim: int, eps: float = 1e-6):
540
+ super().__init__()
541
+ self.eps = eps
542
+ self.dim = embedding_dim
543
+ self.scale = nn.Embedding(
544
+ num_embeddings=num_embeddings, embedding_dim=embedding_dim
545
+ )
546
+ self.shift = nn.Embedding(
547
+ num_embeddings=num_embeddings, embedding_dim=embedding_dim
548
+ )
549
+ torch.nn.init.ones_(self.scale.weight)
550
+ torch.nn.init.zeros_(self.shift.weight)
551
+
552
+ def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor) -> torch.Tensor:
553
+ scale = self.scale(cond_embedding_id)
554
+ shift = self.shift(cond_embedding_id)
555
+ x = nn.functional.layer_norm(x, (self.dim,), eps=self.eps)
556
+ x = x * scale + shift
557
+ return x
558
+
559
+
560
+ class ResBlock1(nn.Module):
561
+ """
562
+ ResBlock adapted from HiFi-GAN V1 (https://github.com/jik876/hifi-gan) with dilated 1D convolutions,
563
+ but without upsampling layers.
564
+
565
+ Args:
566
+ dim (int): Number of input channels.
567
+ kernel_size (int, optional): Size of the convolutional kernel. Defaults to 3.
568
+ dilation (tuple[int], optional): Dilation factors for the dilated convolutions.
569
+ Defaults to (1, 3, 5).
570
+ lrelu_slope (float, optional): Negative slope of the LeakyReLU activation function.
571
+ Defaults to 0.1.
572
+ layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
573
+ Defaults to None.
574
+ """
575
+
576
+ def __init__(
577
+ self,
578
+ dim: int,
579
+ kernel_size: int = 3,
580
+ dilation: Tuple[int, int, int] = (1, 3, 5),
581
+ lrelu_slope: float = 0.1,
582
+ layer_scale_init_value: Optional[float] = None,
583
+ ):
584
+ super().__init__()
585
+ self.lrelu_slope = lrelu_slope
586
+ self.convs1 = nn.ModuleList(
587
+ [
588
+ weight_norm(
589
+ nn.Conv1d(
590
+ dim,
591
+ dim,
592
+ kernel_size,
593
+ 1,
594
+ dilation=dilation[0],
595
+ padding=self.get_padding(kernel_size, dilation[0]),
596
+ )
597
+ ),
598
+ weight_norm(
599
+ nn.Conv1d(
600
+ dim,
601
+ dim,
602
+ kernel_size,
603
+ 1,
604
+ dilation=dilation[1],
605
+ padding=self.get_padding(kernel_size, dilation[1]),
606
+ )
607
+ ),
608
+ weight_norm(
609
+ nn.Conv1d(
610
+ dim,
611
+ dim,
612
+ kernel_size,
613
+ 1,
614
+ dilation=dilation[2],
615
+ padding=self.get_padding(kernel_size, dilation[2]),
616
+ )
617
+ ),
618
+ ]
619
+ )
620
+
621
+ self.convs2 = nn.ModuleList(
622
+ [
623
+ weight_norm(
624
+ nn.Conv1d(
625
+ dim,
626
+ dim,
627
+ kernel_size,
628
+ 1,
629
+ dilation=1,
630
+ padding=self.get_padding(kernel_size, 1),
631
+ )
632
+ ),
633
+ weight_norm(
634
+ nn.Conv1d(
635
+ dim,
636
+ dim,
637
+ kernel_size,
638
+ 1,
639
+ dilation=1,
640
+ padding=self.get_padding(kernel_size, 1),
641
+ )
642
+ ),
643
+ weight_norm(
644
+ nn.Conv1d(
645
+ dim,
646
+ dim,
647
+ kernel_size,
648
+ 1,
649
+ dilation=1,
650
+ padding=self.get_padding(kernel_size, 1),
651
+ )
652
+ ),
653
+ ]
654
+ )
655
+
656
+ self.gamma = nn.ParameterList(
657
+ [
658
+ (
659
+ nn.Parameter(
660
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
661
+ )
662
+ if layer_scale_init_value is not None
663
+ else None
664
+ ),
665
+ (
666
+ nn.Parameter(
667
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
668
+ )
669
+ if layer_scale_init_value is not None
670
+ else None
671
+ ),
672
+ (
673
+ nn.Parameter(
674
+ layer_scale_init_value * torch.ones(dim, 1), requires_grad=True
675
+ )
676
+ if layer_scale_init_value is not None
677
+ else None
678
+ ),
679
+ ]
680
+ )
681
+
682
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
683
+ for c1, c2, gamma in zip(self.convs1, self.convs2, self.gamma):
684
+ xt = torch.nn.functional.leaky_relu(x, negative_slope=self.lrelu_slope)
685
+ xt = c1(xt)
686
+ xt = torch.nn.functional.leaky_relu(xt, negative_slope=self.lrelu_slope)
687
+ xt = c2(xt)
688
+ if gamma is not None:
689
+ xt = gamma * xt
690
+ x = xt + x
691
+ return x
692
+
693
+ def remove_weight_norm(self):
694
+ for l in self.convs1:
695
+ remove_weight_norm(l)
696
+ for l in self.convs2:
697
+ remove_weight_norm(l)
698
+
699
+ @staticmethod
700
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
701
+ return int((kernel_size * dilation - dilation) / 2)
702
+
703
+
704
+ class Backbone(nn.Module):
705
+ """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
706
+
707
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
708
+ """
709
+ Args:
710
+ x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
711
+ C denotes output features, and L is the sequence length.
712
+
713
+ Returns:
714
+ Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
715
+ and H denotes the model dimension.
716
+ """
717
+ raise NotImplementedError("Subclasses must implement the forward method.")
718
+
719
+
720
+ class VocosBackbone(Backbone):
721
+ """
722
+ Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
723
+
724
+ Args:
725
+ input_channels (int): Number of input features channels.
726
+ dim (int): Hidden dimension of the model.
727
+ intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
728
+ num_layers (int): Number of ConvNeXtBlock layers.
729
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
730
+ adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
731
+ None means non-conditional model. Defaults to None.
732
+ """
733
+
734
+ def __init__(
735
+ self,
736
+ input_channels: int,
737
+ dim: int,
738
+ intermediate_dim: int,
739
+ num_layers: int,
740
+ layer_scale_init_value: Optional[float] = None,
741
+ adanorm_num_embeddings: Optional[int] = None,
742
+ ):
743
+ super().__init__()
744
+ self.input_channels = input_channels
745
+ self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3)
746
+ self.adanorm = adanorm_num_embeddings is not None
747
+ if adanorm_num_embeddings:
748
+ self.norm = AdaLayerNorm(adanorm_num_embeddings, dim, eps=1e-6)
749
+ else:
750
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
751
+ layer_scale_init_value = layer_scale_init_value or 1 / num_layers
752
+ self.convnext = nn.ModuleList(
753
+ [
754
+ ConvNeXtBlock(
755
+ dim=dim,
756
+ intermediate_dim=intermediate_dim,
757
+ layer_scale_init_value=layer_scale_init_value,
758
+ adanorm_num_embeddings=adanorm_num_embeddings,
759
+ )
760
+ for _ in range(num_layers)
761
+ ]
762
+ )
763
+ self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
764
+ self.apply(self._init_weights)
765
+
766
+ def _init_weights(self, m):
767
+ if isinstance(m, (nn.Conv1d, nn.Linear)):
768
+ nn.init.trunc_normal_(m.weight, std=0.02)
769
+ nn.init.constant_(m.bias, 0)
770
+
771
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
772
+ bandwidth_id = kwargs.get("bandwidth_id", None)
773
+ x = self.embed(x)
774
+ if self.adanorm:
775
+ assert bandwidth_id is not None
776
+ x = self.norm(x.transpose(1, 2), cond_embedding_id=bandwidth_id)
777
+ else:
778
+ x = self.norm(x.transpose(1, 2))
779
+ x = x.transpose(1, 2)
780
+ for conv_block in self.convnext:
781
+ x = conv_block(x, cond_embedding_id=bandwidth_id)
782
+ x = self.final_layer_norm(x.transpose(1, 2))
783
+ return x
784
+
785
+
786
+ class VocosResNetBackbone(Backbone):
787
+ """
788
+ Vocos backbone module built with ResBlocks.
789
+
790
+ Args:
791
+ input_channels (int): Number of input features channels.
792
+ dim (int): Hidden dimension of the model.
793
+ num_blocks (int): Number of ResBlock1 blocks.
794
+ layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to None.
795
+ """
796
+
797
+ def __init__(
798
+ self,
799
+ input_channels,
800
+ dim,
801
+ num_blocks,
802
+ layer_scale_init_value=None,
803
+ ):
804
+ super().__init__()
805
+ self.input_channels = input_channels
806
+ self.embed = weight_norm(
807
+ nn.Conv1d(input_channels, dim, kernel_size=3, padding=1)
808
+ )
809
+ layer_scale_init_value = layer_scale_init_value or 1 / num_blocks / 3
810
+ self.resnet = nn.Sequential(
811
+ *[
812
+ ResBlock1(dim=dim, layer_scale_init_value=layer_scale_init_value)
813
+ for _ in range(num_blocks)
814
+ ]
815
+ )
816
+
817
+ def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
818
+ x = self.embed(x)
819
+ x = self.resnet(x)
820
+ x = x.transpose(1, 2)
821
+ return x
822
+
823
+
824
+ class Vocos(nn.Module):
825
+ def __init__(
826
+ self,
827
+ input_channels: int = 256,
828
+ dim: int = 384,
829
+ intermediate_dim: int = 1152,
830
+ num_layers: int = 8,
831
+ n_fft: int = 800,
832
+ hop_size: int = 200,
833
+ padding: str = "same",
834
+ adanorm_num_embeddings=None,
835
+ cfg=None,
836
+ ):
837
+ super().__init__()
838
+
839
+ input_channels = (
840
+ cfg.input_channels
841
+ if cfg is not None and hasattr(cfg, "input_channels")
842
+ else input_channels
843
+ )
844
+ dim = cfg.dim if cfg is not None and hasattr(cfg, "dim") else dim
845
+ intermediate_dim = (
846
+ cfg.intermediate_dim
847
+ if cfg is not None and hasattr(cfg, "intermediate_dim")
848
+ else intermediate_dim
849
+ )
850
+ num_layers = (
851
+ cfg.num_layers
852
+ if cfg is not None and hasattr(cfg, "num_layers")
853
+ else num_layers
854
+ )
855
+ adanorm_num_embeddings = (
856
+ cfg.adanorm_num_embeddings
857
+ if cfg is not None and hasattr(cfg, "adanorm_num_embeddings")
858
+ else adanorm_num_embeddings
859
+ )
860
+ n_fft = cfg.n_fft if cfg is not None and hasattr(cfg, "n_fft") else n_fft
861
+ hop_size = (
862
+ cfg.hop_size if cfg is not None and hasattr(cfg, "hop_size") else hop_size
863
+ )
864
+ padding = (
865
+ cfg.padding if cfg is not None and hasattr(cfg, "padding") else padding
866
+ )
867
+
868
+ self.backbone = VocosBackbone(
869
+ input_channels=input_channels,
870
+ dim=dim,
871
+ intermediate_dim=intermediate_dim,
872
+ num_layers=num_layers,
873
+ adanorm_num_embeddings=adanorm_num_embeddings,
874
+ )
875
+ self.head = ISTFTHead(dim, n_fft, hop_size, padding)
876
+
877
+ def forward(self, x):
878
+ x = self.backbone(x)
879
+ x = self.head(x)
880
+
881
+ return x[:, None, :]
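End to end, Vocos maps frame-level features to a waveform. With the defaults above (input_channels=256, n_fft=800, hop_size=200) and "same" padding, the output length is exactly T * hop_size:

import torch

vocoder = Vocos()
feats = torch.randn(1, 256, 100)  # [B, C, T] frame-level features
wav = vocoder(feats)              # [1, 1, 20000] waveform, i.e. 100 * 200 samples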
models/codec/coco/rep_coco_model.py ADDED
@@ -0,0 +1,441 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from torch.nn import functional as F
12
+
13
+ from models.codec.amphion_codec.quantize import ResidualVQ
14
+ from models.codec.amphion_codec.vocos import VocosBackbone
15
+
16
+
17
+ def init_weights(m):
18
+ if isinstance(m, nn.Conv1d):
19
+ nn.init.trunc_normal_(m.weight, std=0.02)
20
+ nn.init.constant_(m.bias, 0)
21
+ if isinstance(m, nn.Linear):
22
+ nn.init.trunc_normal_(m.weight, std=0.02)
23
+ nn.init.constant_(m.bias, 0)
24
+
25
+
26
+ def compute_codebook_perplexity(indices, codebook_size):
27
+ indices = indices.flatten()
28
+ prob = torch.bincount(indices, minlength=codebook_size).float() / indices.size(0)
29
+ perp = torch.exp(-torch.sum(prob * torch.log(prob + 1e-10)))
30
+ return perp
31
+
32
+
33
+ class CocoContentStyle(nn.Module):
34
+ def __init__(
35
+ self,
36
+ codebook_size=8192,
37
+ hidden_size=1024,
38
+ codebook_dim=8,
39
+ num_quantizers=1,
40
+ quantizer_type="fvq",
41
+ use_whisper=True,
42
+ use_chromagram=True,
43
+ construct_only_for_quantizer=False,
44
+ cfg=None,
45
+ ):
46
+ super().__init__()
47
+
48
+ assert cfg is not None
49
+ self.cfg = cfg
50
+
51
+ codebook_size = getattr(cfg, "codebook_size", codebook_size)
52
+ hidden_size = getattr(cfg, "hidden_size", hidden_size)
53
+ codebook_dim = getattr(cfg, "codebook_dim", codebook_dim)
54
+ num_quantizers = getattr(cfg, "num_quantizers", num_quantizers)
55
+ quantizer_type = getattr(cfg, "quantizer_type", quantizer_type)
56
+
57
+ self.codebook_size = codebook_size
58
+ self.codebook_dim = codebook_dim
59
+ self.hidden_size = hidden_size
60
+ self.num_quantizers = num_quantizers
61
+ self.quantizer_type = quantizer_type
62
+
63
+ if use_whisper:
64
+ self.whisper_input_layer = nn.Linear(self.cfg.whisper_dim, hidden_size)
65
+ if use_chromagram:
66
+ self.chromagram_input_layer = nn.Linear(
67
+ self.cfg.chromagram_dim, hidden_size
68
+ )
69
+
70
+ downsample_rate = getattr(cfg, "downsample_rate", 1)
71
+ if downsample_rate > 1:
72
+ self.do_downsample = True
73
+ assert np.log2(downsample_rate).is_integer()
74
+
75
+ down_layers = []
76
+ up_layers = []
77
+ for _ in range(int(np.log2(downsample_rate))):
78
+ down_layers.extend(
79
+ [
80
+ nn.Conv1d(
81
+ hidden_size,
82
+ hidden_size,
83
+ kernel_size=3,
84
+ stride=2,
85
+ padding=1,
86
+ ),
87
+ nn.GELU(),
88
+ ]
89
+ )
90
+ up_layers.extend(
91
+ [
92
+ nn.ConvTranspose1d(
93
+ hidden_size, hidden_size, kernel_size=4, stride=2, padding=1
94
+ ),
95
+ nn.GELU(),
96
+ ]
97
+ )
98
+ self.downsample_layers = nn.Sequential(*down_layers)
99
+ self.upsample_layers = nn.Sequential(*up_layers)
100
+
101
+ else:
102
+ self.do_downsample = False
103
+
104
+ self.encoder = nn.Sequential(
105
+ VocosBackbone(
106
+ input_channels=self.hidden_size,
107
+ dim=self.cfg.encoder.vocos_dim,
108
+ intermediate_dim=self.cfg.encoder.vocos_intermediate_dim,
109
+ num_layers=self.cfg.encoder.vocos_num_layers,
110
+ adanorm_num_embeddings=None,
111
+ ),
112
+ nn.Linear(self.cfg.encoder.vocos_dim, self.hidden_size),
113
+ )
114
+
115
+ self.quantizer = ResidualVQ(
116
+ input_dim=hidden_size,
117
+ num_quantizers=num_quantizers,
118
+ codebook_size=codebook_size,
119
+ codebook_dim=codebook_dim,
120
+ quantizer_type=quantizer_type,
121
+ quantizer_dropout=0.0,
122
+ commitment=0.15,
123
+ codebook_loss_weight=1.0,
124
+ use_l2_normlize=True,
125
+ )
126
+
127
+ if not construct_only_for_quantizer:
128
+ self.decoder = nn.Sequential(
129
+ VocosBackbone(
130
+ input_channels=self.hidden_size,
131
+ dim=self.cfg.decoder.vocos_dim,
132
+ intermediate_dim=self.cfg.decoder.vocos_intermediate_dim,
133
+ num_layers=self.cfg.decoder.vocos_num_layers,
134
+ adanorm_num_embeddings=None,
135
+ ),
136
+ nn.Linear(self.cfg.decoder.vocos_dim, self.hidden_size),
137
+ )
138
+
139
+ if use_whisper:
140
+ self.whisper_output_layer = nn.Linear(
141
+ self.hidden_size, self.cfg.whisper_dim
142
+ )
143
+ if use_chromagram:
144
+ self.chromagram_output_layer = nn.Linear(
145
+ self.hidden_size, self.cfg.chromagram_dim
146
+ )
147
+
148
+ self.reset_parameters()
149
+
150
+ def forward(
151
+ self,
152
+ whisper_feats,
153
+ chromagram_feats,
154
+ return_for_quantizer=False,
155
+ ):
156
+ """
157
+ Args:
158
+ whisper_feats: [B, T, 1024]
159
+ chromagram_feats: [B, T, 24]
160
+ Returns:
161
+ whisper_rec: [B, T, 1024]
162
+ chromagram_rec: [B, T, 24]
163
+ codebook_loss: float
164
+ all_indices: [N, B, T] or [B, T] if num_of_quantizers == 1
165
+ """
166
+ T = whisper_feats.shape[1]
167
+
168
+ # [B, T, D]
169
+ x = self.whisper_input_layer(whisper_feats) + self.chromagram_input_layer(
170
+ chromagram_feats
171
+ )
172
+ # print("Before downsample:", x.shape)
173
+
174
+ # ====== Downsample ======
175
+ if self.do_downsample:
176
+ x = self.downsample_layers(x.transpose(1, 2)).transpose(1, 2)
177
+
178
+ # print("After downsample:", x.shape)
179
+
180
+ # ====== Encoder ======
181
+ x = self.encoder(x.transpose(1, 2)).transpose(1, 2) # [B, T, D] -> [B, D, T]
182
+
183
+ # ====== Quantizer ======
184
+ (
185
+ quantized_out, # [B, D, T]
186
+ all_indices, # [num_of_quantizers, B, T]
187
+ all_commit_losses, # [num_of_quantizers]
188
+ all_codebook_losses, # [num_of_quantizers]
189
+ _,
190
+ ) = self.quantizer(x)
191
+
192
+ if return_for_quantizer:
193
+ if all_indices.shape[0] == 1:
194
+ return all_indices.squeeze(0), quantized_out.transpose(1, 2)
195
+ return all_indices, quantized_out.transpose(1, 2)
196
+
197
+ # ====== Decoder ======
198
+ x_rec = self.decoder(quantized_out) # [B, T, D]
199
+
200
+ # ====== Upsample ======
201
+ if self.do_downsample:
202
+ x_rec = self.upsample_layers(x_rec.transpose(1, 2)).transpose(1, 2)
203
+
204
+ # print("After upsample:", x_rec.shape)
205
+
206
+ # Ensure output dimensions match input
207
+ if x_rec.shape[1] >= T: # Check time dimension
208
+ x_rec = x_rec[:, :T, :]
209
+ else:
210
+ padding_frames = T - x_rec.shape[1]
211
+ last_frame = x_rec[:, -1:, :]
212
+ padding = last_frame.repeat(1, padding_frames, 1)
213
+ x_rec = torch.cat([x_rec, padding], dim=1)
214
+
215
+ # ====== Loss ======
216
+ whisper_rec = self.whisper_output_layer(x_rec) # [B, T, 1024]
217
+ chromagram_rec = self.chromagram_output_layer(x_rec) # [B, T, 24]
218
+
219
+ codebook_loss = (all_codebook_losses + all_commit_losses).mean()
221
+
222
+ return whisper_rec, chromagram_rec, codebook_loss, all_indices
223
+
224
+ def quantize(self, whisper_feats, chromagram_feats):
225
+ """
226
+ Args:
227
+ whisper_feats: [B, T, 1024]
228
+ chromagram_feats: [B, T, 24]
229
+ Returns:
230
+ all_indices: [N, B, T], or [B, T] if num_of_quantizers == 1
231
+ quantized_out: [B, D, T]
232
+ """
233
+ all_indices, quantized_out = self.forward(
234
+ whisper_feats,
235
+ chromagram_feats,
236
+ return_for_quantizer=True,
237
+ )
238
+ return all_indices, quantized_out
239
+
240
+ def reset_parameters(self):
241
+ self.apply(init_weights)
242
+
243
+
244
+ class CocoContent(CocoContentStyle):
245
+ def __init__(
246
+ self,
247
+ cfg,
248
+ use_whisper=True,
249
+ use_chromagram=False,
250
+ construct_only_for_quantizer=False,
251
+ ):
252
+ super().__init__(
253
+ cfg=cfg,
254
+ use_whisper=use_whisper,
255
+ use_chromagram=use_chromagram,
256
+ construct_only_for_quantizer=construct_only_for_quantizer,
257
+ )
258
+
259
+ def forward(
260
+ self,
261
+ whisper_feats,
262
+ return_for_quantizer=False,
263
+ ):
264
+ """
265
+ Args:
266
+ whisper_feats: [B, T, 1024]
267
+ Returns:
268
+ whisper_rec: [B, T, 1024]
269
+ codebook_loss: float
270
+ all_indices: [N, B, T]
271
+ """
272
+ T = whisper_feats.shape[1]
273
+
274
+ # [B, T, D]
275
+ x = self.whisper_input_layer(whisper_feats)
276
+
277
+ # ====== Downsample ======
278
+ if self.do_downsample:
279
+ x = self.downsample_layers(x.transpose(1, 2)).transpose(1, 2)
280
+
281
+ # ====== Encoder ======
282
+ x = self.encoder(x.transpose(1, 2)).transpose(1, 2) # [B, T, D] -> [B, D, T]
283
+
284
+ # ====== Quantizer ======
285
+ (
286
+ quantized_out, # [B, D, T]
287
+ all_indices, # [num_of_quantizers, B, T]
288
+ all_commit_losses, # [num_of_quantizers]
289
+ all_codebook_losses, # [num_of_quantizers]
290
+ _,
291
+ ) = self.quantizer(x)
292
+
293
+ if return_for_quantizer:
294
+ if all_indices.shape[0] == 1:
295
+ return all_indices.squeeze(0), quantized_out.transpose(1, 2)
296
+ return all_indices, quantized_out.transpose(1, 2)
297
+
298
+ # ====== Decoder ======
299
+ x_rec = self.decoder(quantized_out) # [B, T, D]
300
+
301
+ # ====== Upsample ======
302
+ if self.do_downsample:
303
+ x_rec = self.upsample_layers(x_rec.transpose(1, 2)).transpose(1, 2)
304
+
305
+ # Ensure output dimensions match input
306
+ if x_rec.shape[1] >= T: # Check time dimension
307
+ x_rec = x_rec[:, :T, :]
308
+ else:
309
+ padding_frames = T - x_rec.shape[1]
310
+ last_frame = x_rec[:, -1:, :]
311
+ padding = last_frame.repeat(1, padding_frames, 1)
312
+ x_rec = torch.cat([x_rec, padding], dim=1)
313
+
314
+ # ====== Loss ======
315
+ whisper_rec = self.whisper_output_layer(x_rec) # [B, T, 1024]
316
+
317
+ codebook_loss = (all_codebook_losses + all_commit_losses).mean()
319
+
+         return whisper_rec, codebook_loss, all_indices
+
+     def quantize(self, whisper_feats):
+         all_indices, quantized_out = self.forward(
+             whisper_feats, return_for_quantizer=True
+         )
+         return all_indices, quantized_out
+
+
+ class CocoStyle(CocoContentStyle):
+     def __init__(
+         self,
+         cfg,
+         use_whisper=False,
+         use_chromagram=True,
+         construct_only_for_quantizer=False,
+     ):
+         super().__init__(
+             cfg=cfg,
+             use_whisper=use_whisper,
+             use_chromagram=use_chromagram,
+             construct_only_for_quantizer=construct_only_for_quantizer,
+         )
+
+     def forward(
+         self,
+         chromagram_feats,
+         return_for_quantizer=False,
+     ):
+         """
+         Args:
+             chromagram_feats: [B, T, 24]
+         Returns:
+             chromagram_rec: [B, T, 24]
+             codebook_loss: float
+             all_indices: [N, B, T]
+         """
+         T = chromagram_feats.shape[1]
+
+         # [B, T, D]
+         x = self.chromagram_input_layer(chromagram_feats)
+
+         # ====== Downsample ======
+         if self.do_downsample:
+             x = self.downsample_layers(x.transpose(1, 2)).transpose(1, 2)
+
+         # ====== Encoder ======
+         x = self.encoder(x.transpose(1, 2)).transpose(1, 2)  # encoder runs on [B, D, T]; output back to [B, T, D]
+
+         # ====== Quantizer ======
+         (
+             quantized_out,  # [B, D, T]
+             all_indices,  # [num_of_quantizers, B, T]
+             all_commit_losses,  # [num_of_quantizers]
+             all_codebook_losses,  # [num_of_quantizers]
+             _,
+         ) = self.quantizer(x)
+
+         if return_for_quantizer:
+             if all_indices.shape[0] == 1:
+                 return all_indices.squeeze(0), quantized_out.transpose(1, 2)
+             return all_indices, quantized_out.transpose(1, 2)
+
+         # ====== Decoder ======
+         x_rec = self.decoder(quantized_out)  # [B, T, D]
+
+         # ====== Upsample ======
+         if self.do_downsample:
+             x_rec = self.upsample_layers(x_rec.transpose(1, 2)).transpose(1, 2)
+
+         # Ensure output dimensions match input
+         if x_rec.shape[1] >= T:  # Check time dimension
+             x_rec = x_rec[:, :T, :]
+         else:
+             padding_frames = T - x_rec.shape[1]
+             last_frame = x_rec[:, -1:, :]
+             padding = last_frame.repeat(1, padding_frames, 1)
+             x_rec = torch.cat([x_rec, padding], dim=1)
+
+         # ====== Loss ======
+         chromagram_rec = self.chromagram_output_layer(x_rec)  # [B, T, 24]
+
+         codebook_loss = (all_codebook_losses + all_commit_losses).mean()
+
+         return chromagram_rec, codebook_loss, all_indices
+
+     def quantize(self, chromagram_feats):
+         all_indices, quantized_out = self.forward(
+             chromagram_feats, return_for_quantizer=True
+         )
+         return all_indices, quantized_out
+
+
+ # if __name__ == "__main__":
+ #     from utils.util import JsonHParams
+
+ #     cfg = JsonHParams(
+ #         **{
+ #             "whisper_dim": 1024,
+ #             "chromagram_dim": 24,
+ #             "global_speaker_encoder": {
+ #                 "input_dim": 128,  # Eg: n_mels
+ #                 "hidden_size": 512,  # 768 for emilia298k
+ #                 "num_hidden_layers": 4,  # 6 for emilia298k
+ #                 "num_attention_heads": 8,
+ #             },
+ #         }
+ #     )
+ #     model = Coco(cfg=cfg)
+
+ #     x = torch.randn(2, 150, 1024)
+ #     tone_height = torch.randn(2)
+ #     mels = torch.randn(2, 150, 128)
+ #     mel_mask = torch.ones(2, 150)
+
+ #     x_rec, codebook_loss, all_indices, auxillary_pred_outputs = model(
+ #         x, tone_height, mels, mel_mask
+ #     )
+ #     print(x_rec.shape, codebook_loss, all_indices.shape)
+ #     for k, v in auxillary_pred_outputs.items():
+ #         print(k, v.shape)
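+
+ # Illustrative usage sketch for the quantizers above (shapes follow the
+ # docstrings; the cfg object and the batch/length values are assumptions):
+ #
+ #     model = CocoContent(cfg=cfg, construct_only_for_quantizer=True)
+ #     whisper_feats = torch.randn(2, 150, 1024)           # [B, T, 1024]
+ #     indices, quantized = model.quantize(whisper_feats)
+ #     # indices: [B, T] for a single quantizer (leading axis squeezed),
+ #     # quantized: [B, T, D] after the transpose in forward()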
models/codec/melvqgan/melspec.py ADDED
@@ -0,0 +1,108 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ import torch
+ import pyworld as pw
+ import numpy as np
+ import soundfile as sf
+ import os
+ from torchaudio.functional import pitch_shift
+ import librosa
+ from librosa.filters import mel as librosa_mel_fn
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import tqdm
+
+
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
+     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
+
+
+ def dynamic_range_decompression(x, C=1):
+     return np.exp(x) / C
+
+
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+     return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+ def dynamic_range_decompression_torch(x, C=1):
+     return torch.exp(x) / C
+
+
+ def spectral_normalize_torch(magnitudes):
+     output = dynamic_range_compression_torch(magnitudes)
+     return output
+
+
+ def spectral_de_normalize_torch(magnitudes):
+     output = dynamic_range_decompression_torch(magnitudes)
+     return output
+
+
+ class MelSpectrogram(nn.Module):
+     def __init__(
+         self,
+         n_fft,
+         num_mels,
+         sampling_rate,
+         hop_size,
+         win_size,
+         fmin,
+         fmax,
+         center=False,
+     ):
+         super(MelSpectrogram, self).__init__()
+         self.n_fft = n_fft
+         self.hop_size = hop_size
+         self.win_size = win_size
+         self.sampling_rate = sampling_rate
+         self.num_mels = num_mels
+         self.fmin = fmin
+         self.fmax = fmax
+         self.center = center
+
+         mel = librosa_mel_fn(
+             sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+         )
+         mel_basis = torch.from_numpy(mel).float()
+         hann_window = torch.hann_window(win_size)
+
+         self.register_buffer("mel_basis", mel_basis)
+         self.register_buffer("hann_window", hann_window)
+
+     def forward(self, y):
+         y = torch.nn.functional.pad(
+             y.unsqueeze(1),
+             (
+                 int((self.n_fft - self.hop_size) / 2),
+                 int((self.n_fft - self.hop_size) / 2),
+             ),
+             mode="reflect",
+         )
+         y = y.squeeze(1)
+         spec = torch.stft(
+             y,
+             self.n_fft,
+             hop_length=self.hop_size,
+             win_length=self.win_size,
+             window=self.hann_window,
+             center=self.center,
+             pad_mode="reflect",
+             normalized=False,
+             onesided=True,
+             return_complex=True,
+         )
+         spec = torch.view_as_real(spec)
+
+         spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+
+         spec = torch.matmul(self.mel_basis, spec)
+         spec = spectral_normalize_torch(spec)
+
+         return spec
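+
+ # Illustrative usage sketch (the STFT/mel settings below are typical
+ # HiFi-GAN-style values, not taken from this repo's configs):
+ #
+ #     mel_fn = MelSpectrogram(
+ #         n_fft=1024, num_mels=100, sampling_rate=24000,
+ #         hop_size=256, win_size=1024, fmin=0, fmax=12000,
+ #     )
+ #     wav = torch.randn(1, 24000)   # [B, num_samples]
+ #     mel = mel_fn(wav)             # [B, num_mels, num_frames]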
utils/__init__.py ADDED
File without changes
utils/hparam.py ADDED
@@ -0,0 +1,659 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # This code is modified from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
+ """Hyperparameter values."""
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ import json
+ import numbers
+ import re
+ import six
+
+ # Define the regular expression for parsing a single clause of the input
+ # (delimited by commas). A legal clause looks like:
+ #   <variable name>[<index>]? = <rhs>
+ # where <rhs> is either a single token or [] enclosed list of tokens.
+ # For example: "var[1] = a" or "x = [1,2,3]"
+ PARAM_RE = re.compile(
+     r"""
+   (?P<name>[a-zA-Z][\w\.]*)    # variable name: "var" or "x"
+   (\[\s*(?P<index>\d+)\s*\])?  # (optional) index: "1" or None
+   \s*=\s*
+   ((?P<val>[^,\[]*)            # single value: "a" or None
+    |
+    \[(?P<vals>[^\]]*)\])       # list of values: None or "1,2,3"
+   ($|,\s*)""",
+     re.VERBOSE,
+ )
+
+
+ def _parse_fail(name, var_type, value, values):
+     """Helper function for raising a value error for bad assignment."""
+     raise ValueError(
+         "Could not parse hparam '%s' of type '%s' with value '%s' in %s"
+         % (name, var_type.__name__, value, values)
+     )
+
+
+ def _reuse_fail(name, values):
+     """Helper function for raising a value error for reuse of name."""
+     raise ValueError("Multiple assignments to variable '%s' in %s" % (name, values))
+
+
+ def _process_scalar_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
+     """Update results_dictionary with a scalar value.
+
+     Used to update the results_dictionary to be returned by parse_values when
+     encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
+
+     Mutates results_dictionary.
+
+     Args:
+       name: Name of variable in assignment ("s" or "arr").
+       parse_fn: Function for parsing the actual value.
+       var_type: Type of named variable.
+       m_dict: Dictionary constructed from regex parsing.
+         m_dict['val']: RHS value (scalar)
+         m_dict['index']: List index value (or None)
+       values: Full expression being parsed
+       results_dictionary: The dictionary being updated for return by the parsing
+         function.
+
+     Raises:
+       ValueError: If the name has already been used.
+     """
+     try:
+         parsed_value = parse_fn(m_dict["val"])
+     except ValueError:
+         _parse_fail(name, var_type, m_dict["val"], values)
+
+     # If no index is provided
+     if not m_dict["index"]:
+         if name in results_dictionary:
+             _reuse_fail(name, values)
+         results_dictionary[name] = parsed_value
+     else:
+         if name in results_dictionary:
+             # If the name has already been used as a scalar, it will be in
+             # this dictionary and map to a non-dictionary value.
+             if not isinstance(results_dictionary.get(name), dict):
+                 _reuse_fail(name, values)
+         else:
+             results_dictionary[name] = {}
+
+         index = int(m_dict["index"])
+         # Make sure the index position hasn't already been assigned a value.
+         if index in results_dictionary[name]:
+             _reuse_fail("{}[{}]".format(name, index), values)
+         results_dictionary[name][index] = parsed_value
+
+
+ def _process_list_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
+     """Update results_dictionary from a list of values.
+
+     Used to update results_dictionary to be returned by parse_values when
+     encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
+
+     Mutates results_dictionary.
+
+     Args:
+       name: Name of variable in assignment ("arr").
+       parse_fn: Function for parsing individual values.
+       var_type: Type of named variable.
+       m_dict: Dictionary constructed from regex parsing.
+         m_dict['val']: RHS value (scalar)
+       values: Full expression being parsed
+       results_dictionary: The dictionary being updated for return by the parsing
+         function.
+
+     Raises:
+       ValueError: If the name has an index or the values cannot be parsed.
+     """
+     if m_dict["index"] is not None:
+         raise ValueError("Assignment of a list to a list index.")
+     elements = filter(None, re.split("[ ,]", m_dict["vals"]))
+     # Make sure the name hasn't already been assigned a value
+     if name in results_dictionary:
+         _reuse_fail(name, values)
+     try:
+         results_dictionary[name] = [parse_fn(e) for e in elements]
+     except ValueError:
+         _parse_fail(name, var_type, m_dict["vals"], values)
+
+
+ def _cast_to_type_if_compatible(name, param_type, value):
+     """Cast hparam to the provided type, if compatible.
+
+     Args:
+       name: Name of the hparam to be cast.
+       param_type: The type of the hparam.
+       value: The value to be cast, if compatible.
+
+     Returns:
+       The result of casting `value` to `param_type`.
+
+     Raises:
+       ValueError: If the type of `value` is not compatible with param_type.
+         * If `param_type` is a string type, but `value` is not.
+         * If `param_type` is a boolean, but `value` is not, or vice versa.
+         * If `param_type` is an integer type, but `value` is not.
+         * If `param_type` is a float type, but `value` is not a numeric type.
+     """
+     fail_msg = "Could not cast hparam '%s' of type '%s' from value %r" % (
+         name,
+         param_type,
+         value,
+     )
+
+     # Some callers use None, for which we can't do any casting/checking. :(
+     if issubclass(param_type, type(None)):
+         return value
+
+     # Avoid converting a non-string type to a string.
+     if issubclass(param_type, (six.string_types, six.binary_type)) and not isinstance(
+         value, (six.string_types, six.binary_type)
+     ):
+         raise ValueError(fail_msg)
+
+     # Avoid converting a number or string type to a boolean or vice versa.
+     if issubclass(param_type, bool) != isinstance(value, bool):
+         raise ValueError(fail_msg)
+
+     # Avoid converting float to an integer (the reverse is fine).
+     if issubclass(param_type, numbers.Integral) and not isinstance(
+         value, numbers.Integral
+     ):
+         raise ValueError(fail_msg)
+
+     # Avoid converting a non-numeric type to a numeric type.
+     if issubclass(param_type, numbers.Number) and not isinstance(value, numbers.Number):
+         raise ValueError(fail_msg)
+
+     return param_type(value)
+
+
+ def parse_values(values, type_map, ignore_unknown=False):
+     """Parses hyperparameter values from a string into a python map.
+
+     `values` is a string containing comma-separated `name=value` pairs.
+     For each pair, the value of the hyperparameter named `name` is set to
+     `value`.
+
+     If a hyperparameter name appears multiple times in `values`, a ValueError
+     is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
+
+     If a hyperparameter name appears in both an index assignment and a scalar
+     assignment, a ValueError is raised (e.g. 'a=[1,2,3],a[0] = 1').
+
+     The hyperparameter name may contain '.' symbols, which will result in an
+     attribute name that is only accessible through the getattr and setattr
+     functions. (And it must first be explicitly added through add_hparam.)
+
+     WARNING: Use of '.' in your variable names is allowed, but is not well
+     supported and not recommended.
+
+     The `value` in `name=value` must follow the syntax according to the
+     type of the parameter:
+
+     * Scalar integer: A Python-parsable integer value. E.g.: 1,
+       100, -12.
+     * Scalar float: A Python-parsable floating point value. E.g.: 1.0,
+       -.54e89.
+     * Boolean: Either true or false.
+     * Scalar string: A non-empty sequence of characters, excluding comma,
+       spaces, and square brackets. E.g.: foo, bar_1.
+     * List: A comma separated list of scalar values of the parameter type
+       enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
+
+     When index assignment is used, the corresponding type_map key should be the
+     list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
+     "arr[1]").
+
+     Args:
+       values: String. Comma separated list of `name=value` pairs where
+         'value' must follow the syntax described above.
+       type_map: A dictionary mapping hyperparameter names to types. Note every
+         parameter name in values must be a key in type_map. The values must
+         conform to the types indicated, where a value V is said to conform to a
+         type T if either V has type T, or V is a list of elements of type T.
+         Hence, for a multidimensional parameter 'x' taking float values,
+         'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
+       ignore_unknown: Bool. Whether values that are missing a type in type_map
+         should be ignored. If set to True, a ValueError will not be raised for
+         unknown hyperparameter type.
+
+     Returns:
+       A python map mapping each name to either:
+       * A scalar value.
+       * A list of scalar values.
+       * A dictionary mapping index numbers to scalar values.
+       (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
+
+     Raises:
+       ValueError: If there is a problem with input.
+       * If `values` cannot be parsed.
+       * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
+       * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
+         'a[1]=1,a[1]=2', or 'a=1,a=[1]')
+     """
+     results_dictionary = {}
+     pos = 0
+     while pos < len(values):
+         m = PARAM_RE.match(values, pos)
+         if not m:
+             raise ValueError("Malformed hyperparameter value: %s" % values[pos:])
+         # Check that there is a comma between parameters and move past it.
+         pos = m.end()
+         # Parse the values.
+         m_dict = m.groupdict()
+         name = m_dict["name"]
+         if name not in type_map:
+             if ignore_unknown:
+                 continue
+             raise ValueError("Unknown hyperparameter type for %s" % name)
+         type_ = type_map[name]
+
+         # Set up correct parsing function (depending on whether type_ is a bool)
+         if type_ == bool:
+
+             def parse_bool(value):
+                 if value in ["true", "True"]:
+                     return True
+                 elif value in ["false", "False"]:
+                     return False
+                 else:
+                     try:
+                         return bool(int(value))
+                     except ValueError:
+                         _parse_fail(name, type_, value, values)
+
+             parse = parse_bool
+         else:
+             parse = type_
+
+         # If a single value is provided
+         if m_dict["val"] is not None:
+             _process_scalar_value(
+                 name, parse, type_, m_dict, values, results_dictionary
+             )
+
+         # If the assigned value is a list:
+         elif m_dict["vals"] is not None:
+             _process_list_value(name, parse, type_, m_dict, values, results_dictionary)
+
+         else:  # Not assigned a list or value
+             _parse_fail(name, type_, "", values)
+
+     return results_dictionary
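+
+ # Illustrative sketch of parse_values on the example from its docstring:
+ #
+ #     type_map = {"x": int, "L": int, "arr": int}
+ #     parse_values("x=5,L=[1,2],arr[1]=3", type_map)
+ #     # -> {'x': 5, 'L': [1, 2], 'arr': {1: 3}}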
+
+
+ class HParams(object):
+     """Class to hold a set of hyperparameters as name-value pairs.
+
+     A `HParams` object holds hyperparameters used to build and train a model,
+     such as the number of hidden units in a neural net layer or the learning rate
+     to use when training.
+
+     You first create a `HParams` object by specifying the names and values of the
+     hyperparameters.
+
+     To make them easily accessible the parameter names are added as direct
+     attributes of the class. A typical usage is as follows:
+
+     ```python
+     # Create a HParams object specifying names and values of the model
+     # hyperparameters:
+     hparams = HParams(learning_rate=0.1, num_hidden_units=100)
+
+     # The hyperparameters are available as attributes of the HParams object:
+     hparams.learning_rate ==> 0.1
+     hparams.num_hidden_units ==> 100
+     ```
+
+     Hyperparameters have a type, which is inferred from the type of their value
+     passed at construction time. The currently supported types are: integer,
+     float, boolean, string, and list of integer, float, boolean, or string.
+
+     You can override hyperparameter values by calling the
+     [`parse()`](#HParams.parse) method, passing a string of comma separated
+     `name=value` pairs. This is intended to make it possible to override
+     any hyperparameter values from a single command-line flag to which
+     the user passes 'hyper-param=value' pairs. It avoids having to define
+     one flag for each hyperparameter.
+
+     The syntax expected for each value depends on the type of the parameter.
+     See `parse()` for a description of the syntax.
+
+     Example:
+
+     ```python
+     # Define a command line flag to pass name=value pairs.
+     # For example using argparse:
+     import argparse
+     parser = argparse.ArgumentParser(description='Train my model.')
+     parser.add_argument('--hparams', type=str,
+                         help='Comma separated list of "name=value" pairs.')
+     args = parser.parse_args()
+     ...
+     def my_program():
+         # Create a HParams object specifying the names and values of the
+         # model hyperparameters:
+         hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
+                              activations=['relu', 'tanh'])
+
+         # Override hyperparameter values by parsing the command line
+         hparams.parse(args.hparams)
+
+         # If the user passed `--hparams=learning_rate=0.3` on the command line
+         # then 'hparams' has the following attributes:
+         hparams.learning_rate ==> 0.3
+         hparams.num_hidden_units ==> 100
+         hparams.activations ==> ['relu', 'tanh']
+
+         # If the hyperparameters are in json format use parse_json:
+         hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
+     ```
+     """
+
+     _HAS_DYNAMIC_ATTRIBUTES = True  # Required for pytype checks.
+
+     def __init__(self, model_structure=None, **kwargs):
+         """Create an instance of `HParams` from keyword arguments.
+
+         The keyword arguments specify name-value pairs for the hyperparameters.
+         The parameter types are inferred from the type of the values passed.
+
+         The parameter names are added as attributes of `HParams` object, so they
+         can be accessed directly with the dot notation `hparams._name_`.
+
+         Example:
+
+         ```python
+         # Define 3 hyperparameters: 'learning_rate' is a float parameter,
+         # 'num_hidden_units' an integer parameter, and 'activation' a string
+         # parameter.
+         hparams = tf.HParams(
+             learning_rate=0.1, num_hidden_units=100, activation='relu')
+
+         hparams.activation ==> 'relu'
+         ```
+
+         Note that a few names are reserved and cannot be used as hyperparameter
+         names. If you use one of the reserved names the constructor raises a
+         `ValueError`.
+
+         Args:
+           model_structure: An instance of ModelStructure, defining the feature
+             crosses to be used in the Trial.
+           **kwargs: Key-value pairs where the key is the hyperparameter name and
+             the value is the value for the parameter.
+
+         Raises:
+           ValueError: If both `hparam_def` and initialization values are provided,
+             or if one of the arguments is invalid.
+
+         """
+         # Register the hyperparameters and their type in _hparam_types.
+         # This simplifies the implementation of parse().
+         # _hparam_types maps the parameter name to a tuple (type, bool).
+         # The type value is the type of the parameter for scalar hyperparameters,
+         # or the type of the list elements for multidimensional hyperparameters.
+         # The bool value is True if the value is a list, False otherwise.
+         self._hparam_types = {}
+         self._model_structure = model_structure
+         for name, value in six.iteritems(kwargs):
+             self.add_hparam(name, value)
+
+     def add_hparam(self, name, value):
+         """Adds {name, value} pair to hyperparameters.
+
+         Args:
+           name: Name of the hyperparameter.
+           value: Value of the hyperparameter. Can be one of the following types:
+             int, float, string, int list, float list, or string list.
+
+         Raises:
+           ValueError: if one of the arguments is invalid.
+         """
+         # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
+         # attribute of this object. In that case we refuse to use it as a
+         # hyperparameter name.
+         if getattr(self, name, None) is not None:
+             raise ValueError("Hyperparameter name is reserved: %s" % name)
+         if isinstance(value, (list, tuple)):
+             if not value:
+                 raise ValueError(
+                     "Multi-valued hyperparameters cannot be empty: %s" % name
+                 )
+             self._hparam_types[name] = (type(value[0]), True)
+         else:
+             self._hparam_types[name] = (type(value), False)
+         setattr(self, name, value)
+
+     def set_hparam(self, name, value):
+         """Set the value of an existing hyperparameter.
+
+         This function verifies that the type of the value matches the type of the
+         existing hyperparameter.
+
+         Args:
+           name: Name of the hyperparameter.
+           value: New value of the hyperparameter.
+
+         Raises:
+           KeyError: If the hyperparameter doesn't exist.
+           ValueError: If there is a type mismatch.
+         """
+         param_type, is_list = self._hparam_types[name]
+         if isinstance(value, list):
+             if not is_list:
+                 raise ValueError(
+                     "Must not pass a list for single-valued parameter: %s" % name
+                 )
+             setattr(
+                 self,
+                 name,
+                 [_cast_to_type_if_compatible(name, param_type, v) for v in value],
+             )
+         else:
+             if is_list:
+                 raise ValueError(
+                     "Must pass a list for multi-valued parameter: %s." % name
+                 )
+             setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
+
+     def del_hparam(self, name):
+         """Removes the hyperparameter with key 'name'.
+
+         Does nothing if it isn't present.
+
+         Args:
+           name: Name of the hyperparameter.
+         """
+         if hasattr(self, name):
+             delattr(self, name)
+             del self._hparam_types[name]
+
+     def parse(self, values):
+         """Override existing hyperparameter values, parsing new values from a string.
+
+         See parse_values for more detail on the allowed format for values.
+
+         Args:
+           values: String. Comma separated list of `name=value` pairs where 'value'
+             must follow the syntax described above.
+
+         Returns:
+           The `HParams` instance.
+
+         Raises:
+           ValueError: If `values` cannot be parsed or a hyperparameter in `values`
+             doesn't exist.
+         """
+         type_map = {}
+         for name, t in self._hparam_types.items():
+             param_type, _ = t
+             type_map[name] = param_type
+
+         values_map = parse_values(values, type_map)
+         return self.override_from_dict(values_map)
+
+     def override_from_dict(self, values_dict):
+         """Override existing hyperparameter values, parsing new values from a dictionary.
+
+         Args:
+           values_dict: Dictionary of name:value pairs.
+
+         Returns:
+           The `HParams` instance.
+
+         Raises:
+           KeyError: If a hyperparameter in `values_dict` doesn't exist.
+           ValueError: If `values_dict` cannot be parsed.
+         """
+         for name, value in values_dict.items():
+             self.set_hparam(name, value)
+         return self
+
+     def set_model_structure(self, model_structure):
+         self._model_structure = model_structure
+
+     def get_model_structure(self):
+         return self._model_structure
+
+     def to_json(self, indent=None, separators=None, sort_keys=False):
+         """Serializes the hyperparameters into JSON.
+
+         Args:
+           indent: If a non-negative integer, JSON array elements and object members
+             will be pretty-printed with that indent level. An indent level of 0, or
+             negative, will only insert newlines. `None` (the default) selects the
+             most compact representation.
+           separators: Optional `(item_separator, key_separator)` tuple. Default is
+             `(', ', ': ')`.
+           sort_keys: If `True`, the output dictionaries will be sorted by key.
+
+         Returns:
+           A JSON string.
+         """
+
+         def remove_callables(x):
+             """Omit callable elements from input with arbitrary nesting."""
+             if isinstance(x, dict):
+                 return {
+                     k: remove_callables(v)
+                     for k, v in six.iteritems(x)
+                     if not callable(v)
+                 }
+             elif isinstance(x, list):
+                 return [remove_callables(i) for i in x if not callable(i)]
+             return x
+
+         return json.dumps(
+             remove_callables(self.values()),
+             indent=indent,
+             separators=separators,
+             sort_keys=sort_keys,
+         )
+
+     def parse_json(self, values_json):
+         """Override existing hyperparameter values, parsing new values from a json object.
+
+         Args:
+           values_json: String containing a json object of name:value pairs.
+
+         Returns:
+           The `HParams` instance.
+
+         Raises:
+           KeyError: If a hyperparameter in `values_json` doesn't exist.
+           ValueError: If `values_json` cannot be parsed.
+         """
+         values_map = json.loads(values_json)
+         return self.override_from_dict(values_map)
+
+     def values(self):
+         """Return the hyperparameter values as a Python dictionary.
+
+         Returns:
+           A dictionary with hyperparameter names as keys. The values are the
+           hyperparameter values.
+         """
+         return {n: getattr(self, n) for n in self._hparam_types.keys()}
+
+     def get(self, key, default=None):
+         """Returns the value of `key` if it exists, else `default`."""
+         if key in self._hparam_types:
+             # Ensure that default is compatible with the parameter type.
+             if default is not None:
+                 param_type, is_param_list = self._hparam_types[key]
+                 type_str = "list<%s>" % param_type if is_param_list else str(param_type)
+                 fail_msg = (
+                     "Hparam '%s' of type '%s' is incompatible with "
+                     "default=%s" % (key, type_str, default)
+                 )
+
+                 is_default_list = isinstance(default, list)
+                 if is_param_list != is_default_list:
+                     raise ValueError(fail_msg)
+
+                 try:
+                     if is_default_list:
+                         for value in default:
+                             _cast_to_type_if_compatible(key, param_type, value)
+                     else:
+                         _cast_to_type_if_compatible(key, param_type, default)
+                 except ValueError as e:
+                     raise ValueError("%s. %s" % (fail_msg, e))
+
+             return getattr(self, key)
+
+         return default
+
+     def __contains__(self, key):
+         return key in self._hparam_types
+
+     def __str__(self):
+         return str(sorted(self.values().items()))
+
+     def __repr__(self):
+         return "%s(%s)" % (type(self).__name__, self.__str__())
+
+     @staticmethod
+     def _get_kind_name(param_type, is_list):
+         """Returns the field name given parameter type and is_list.
+
+         Args:
+           param_type: Data type of the hparam.
+           is_list: Whether this is a list.
+
+         Returns:
+           A string representation of the field name.
+
+         Raises:
+           ValueError: If parameter type is not recognized.
+         """
+         if issubclass(param_type, bool):
+             # This check must happen before issubclass(param_type, six.integer_types),
+             # since Python considers bool to be a subclass of int.
+             typename = "bool"
+         elif issubclass(param_type, six.integer_types):
+             # Setting 'int' and 'long' types to be 'int64' to ensure the type is
+             # compatible with both Python2 and Python3.
+             typename = "int64"
+         elif issubclass(param_type, (six.string_types, six.binary_type)):
+             # Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
+             # compatible with both Python2 and Python3.
+             typename = "bytes"
+         elif issubclass(param_type, float):
+             typename = "float"
+         else:
+             raise ValueError("Unsupported parameter type: %s" % str(param_type))
+
+         suffix = "list" if is_list else "value"
+         return "_".join([typename, suffix])
utils/util.py ADDED
@@ -0,0 +1,690 @@
+ # Copyright (c) 2023 Amphion.
+ #
+ # This source code is licensed under the MIT license found in the
+ # LICENSE file in the root directory of this source tree.
+
+
+ import collections
+ import glob
+ import os
+ import random
+ import time
+ import argparse
+ from collections import OrderedDict
+
+ import json5
+ import numpy as np
+ from torch.nn import functional as F
+
+
+ try:
+     from ruamel.yaml import YAML as yaml
+ except ImportError:
+     from ruamel_yaml import YAML as yaml
+
+ import torch
+
+ from utils.hparam import HParams
+ import logging
+ from logging import handlers
+
+
+ def str2bool(v):
+     """Used in argparse.ArgumentParser.add_argument to indicate
+     that a type is a bool type and the user can enter
+
+     - yes, true, t, y, 1, to represent True
+     - no, false, f, n, 0, to represent False
+
+     See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa
+     """
+     if isinstance(v, bool):
+         return v
+     if v.lower() in ("yes", "true", "t", "y", "1"):
+         return True
+     elif v.lower() in ("no", "false", "f", "n", "0"):
+         return False
+     else:
+         raise argparse.ArgumentTypeError("Boolean value expected.")
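+
+ # Illustrative sketch: wiring str2bool into argparse (the flag name is
+ # hypothetical):
+ #
+ #     parser = argparse.ArgumentParser()
+ #     parser.add_argument("--use_ema", type=str2bool, default=False)
+ #     args = parser.parse_args(["--use_ema", "yes"])  # args.use_ema == True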
+
+
+ def find_checkpoint_of_mapper(mapper_ckpt_dir):
+     mapper_ckpts = glob.glob(os.path.join(mapper_ckpt_dir, "ckpts/*.pt"))
+
+     # Select the max steps
+     mapper_ckpts.sort()
+     mapper_weights_file = mapper_ckpts[-1]
+     return mapper_weights_file
+
+
+ def pad_f0_to_tensors(f0s, batched=None):
+     # Initialize
+     tensors = []
+
+     if batched is None:
+         # Get the max frame for padding
+         size = -1
+         for f0 in f0s:
+             size = max(size, f0.shape[-1])
+
+         tensor = torch.zeros(len(f0s), size)
+
+         for i, f0 in enumerate(f0s):
+             tensor[i, : f0.shape[-1]] = f0[:]
+
+         tensors.append(tensor)
+     else:
+         start = 0
+         while start + batched - 1 < len(f0s):
+             end = start + batched - 1
+
+             # Get the max frame for padding
+             size = -1
+             for i in range(start, end + 1):
+                 size = max(size, f0s[i].shape[-1])
+
+             tensor = torch.zeros(batched, size)
+
+             for i in range(start, end + 1):
+                 tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
+
+             tensors.append(tensor)
+
+             start = start + batched
+
+         if start != len(f0s):
+             end = len(f0s)
+
+             # Get the max frame for padding
+             size = -1
+             for i in range(start, end):
+                 size = max(size, f0s[i].shape[-1])
+
+             tensor = torch.zeros(len(f0s) - start, size)
+
+             for i in range(start, end):
+                 tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
+
+             tensors.append(tensor)
+
+     return tensors
+
+
+ def pad_mels_to_tensors(mels, batched=None):
+     """
+     Args:
+         mels: A list of mel-specs
+     Returns:
+         tensors: A list of tensors containing the batched mel-specs
+         mel_frames: A list of tensors containing the frames of the original mel-specs
+     """
+     # Initialize
+     tensors = []
+     mel_frames = []
+
+     # Split mel-specs into batches to avoid exceeding CUDA memory
+     if batched is None:
+         # Get the max frame for padding
+         size = -1
+         for mel in mels:
+             size = max(size, mel.shape[-1])
+
+         tensor = torch.zeros(len(mels), mels[0].shape[0], size)
+         mel_frame = torch.zeros(len(mels), dtype=torch.int32)
+
+         for i, mel in enumerate(mels):
+             tensor[i, :, : mel.shape[-1]] = mel[:]
+             mel_frame[i] = mel.shape[-1]
+
+         tensors.append(tensor)
+         mel_frames.append(mel_frame)
+     else:
+         start = 0
+         while start + batched - 1 < len(mels):
+             end = start + batched - 1
+
+             # Get the max frame for padding
+             size = -1
+             for i in range(start, end + 1):
+                 size = max(size, mels[i].shape[-1])
+
+             tensor = torch.zeros(batched, mels[0].shape[0], size)
+             mel_frame = torch.zeros(batched, dtype=torch.int32)
+
+             for i in range(start, end + 1):
+                 tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
+                 mel_frame[i - start] = mels[i].shape[-1]
+
+             tensors.append(tensor)
+             mel_frames.append(mel_frame)
+
+             start = start + batched
+
+         if start != len(mels):
+             end = len(mels)
+
+             # Get the max frame for padding
+             size = -1
+             for i in range(start, end):
+                 size = max(size, mels[i].shape[-1])
+
+             tensor = torch.zeros(len(mels) - start, mels[0].shape[0], size)
+             mel_frame = torch.zeros(len(mels) - start, dtype=torch.int32)
+
+             for i in range(start, end):
+                 tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
+                 mel_frame[i - start] = mels[i].shape[-1]
+
+             tensors.append(tensor)
+             mel_frames.append(mel_frame)
+
+     return tensors, mel_frames
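+
+ # Illustrative sketch (an assumed 80-bin mel, three variable-length clips):
+ #
+ #     mels = [torch.randn(80, t) for t in (95, 120, 70)]
+ #     tensors, mel_frames = pad_mels_to_tensors(mels, batched=2)
+ #     # tensors: shapes [2, 80, 120] and [1, 80, 70]
+ #     # mel_frames: [95, 120] and [70] (original frame counts per clip)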
+
+
+ def load_model_config(args):
+     """Load model configurations (in args.json under checkpoint directory)
+
+     Args:
+         args (ArgumentParser): arguments to run bins/preprocess.py
+
+     Returns:
+         dict: dictionary that stores model configurations
+     """
+     if args.checkpoint_dir is None:
+         assert args.checkpoint_file is not None
+         checkpoint_dir = os.path.split(args.checkpoint_file)[0]
+     else:
+         checkpoint_dir = args.checkpoint_dir
+     config_path = os.path.join(checkpoint_dir, "args.json")
+     print("config_path: ", config_path)
+
+     config = load_config(config_path)
+     return config
+
+
+ def remove_and_create(dir):
+     if os.path.exists(dir):
+         os.system("rm -r {}".format(dir))
+     os.makedirs(dir, exist_ok=True)
+
+
+ def has_existed(path, warning=False):
+     if not warning:
+         return os.path.exists(path)
+
+     if os.path.exists(path):
+         answer = input(
+             "The path {} already exists.\nInput 'y' (or hit Enter) to skip it, and input 'n' to re-write it [y/n]\n".format(
+                 path
+             )
+         )
+         if answer != "n":
+             return True
+
+     return False
+
+
+ def remove_older_ckpt(saved_model_name, checkpoint_dir, max_to_keep=5):
+     if os.path.exists(os.path.join(checkpoint_dir, "checkpoint")):
+         with open(os.path.join(checkpoint_dir, "checkpoint"), "r") as f:
+             ckpts = [x.strip() for x in f.readlines()]
+     else:
+         ckpts = []
+     ckpts.append(saved_model_name)
+     for item in ckpts[:-max_to_keep]:
+         if os.path.exists(os.path.join(checkpoint_dir, item)):
+             os.remove(os.path.join(checkpoint_dir, item))
+     with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
+         for item in ckpts[-max_to_keep:]:
+             f.write("{}\n".format(item))
+
+
+ def set_all_random_seed(seed: int):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.random.manual_seed(seed)
+
+
+ def save_checkpoint(
+     args,
+     generator,
+     g_optimizer,
+     step,
+     discriminator=None,
+     d_optimizer=None,
+     max_to_keep=5,
+ ):
+     saved_model_name = "model.ckpt-{}.pt".format(step)
+     checkpoint_path = os.path.join(args.checkpoint_dir, saved_model_name)
+
+     if discriminator and d_optimizer:
+         torch.save(
+             {
+                 "generator": generator.state_dict(),
+                 "discriminator": discriminator.state_dict(),
+                 "g_optimizer": g_optimizer.state_dict(),
+                 "d_optimizer": d_optimizer.state_dict(),
+                 "global_step": step,
+             },
+             checkpoint_path,
+         )
+     else:
+         torch.save(
+             {
+                 "generator": generator.state_dict(),
+                 "g_optimizer": g_optimizer.state_dict(),
+                 "global_step": step,
+             },
+             checkpoint_path,
+         )
+
+     print("Saved checkpoint: {}".format(checkpoint_path))
+
+     if os.path.exists(os.path.join(args.checkpoint_dir, "checkpoint")):
+         with open(os.path.join(args.checkpoint_dir, "checkpoint"), "r") as f:
+             ckpts = [x.strip() for x in f.readlines()]
+     else:
+         ckpts = []
+     ckpts.append(saved_model_name)
+     for item in ckpts[:-max_to_keep]:
+         if os.path.exists(os.path.join(args.checkpoint_dir, item)):
+             os.remove(os.path.join(args.checkpoint_dir, item))
+     with open(os.path.join(args.checkpoint_dir, "checkpoint"), "w") as f:
+         for item in ckpts[-max_to_keep:]:
+             f.write("{}\n".format(item))
+
+
+ def attempt_to_restore(
+     generator, g_optimizer, checkpoint_dir, discriminator=None, d_optimizer=None
+ ):
+     checkpoint_list = os.path.join(checkpoint_dir, "checkpoint")
+     if os.path.exists(checkpoint_list):
+         checkpoint_filename = open(checkpoint_list).readlines()[-1].strip()
+         checkpoint_path = os.path.join(checkpoint_dir, "{}".format(checkpoint_filename))
+         print("Restore from {}".format(checkpoint_path))
+         checkpoint = torch.load(checkpoint_path, map_location="cpu")
+         if generator:
+             if not list(generator.state_dict().keys())[0].startswith("module."):
+                 raw_dict = checkpoint["generator"]
+                 clean_dict = OrderedDict()
+                 for k, v in raw_dict.items():
+                     if k.startswith("module."):
+                         clean_dict[k[7:]] = v
+                     else:
+                         clean_dict[k] = v
+                 generator.load_state_dict(clean_dict)
+             else:
+                 generator.load_state_dict(checkpoint["generator"])
+         if g_optimizer:
+             g_optimizer.load_state_dict(checkpoint["g_optimizer"])
+         global_step = 100000
+         if discriminator and "discriminator" in checkpoint.keys():
+             discriminator.load_state_dict(checkpoint["discriminator"])
+             global_step = checkpoint["global_step"]
+             print("restore discriminator")
+         if d_optimizer and "d_optimizer" in checkpoint.keys():
+             d_optimizer.load_state_dict(checkpoint["d_optimizer"])
+             print("restore d_optimizer...")
+     else:
+         global_step = 0
+     return global_step
+
+
+ class ExponentialMovingAverage(object):
+     def __init__(self, decay):
+         self.decay = decay
+         self.shadow = {}
+
+     def register(self, name, val):
+         self.shadow[name] = val.clone()
+
+     def update(self, name, x):
+         assert name in self.shadow
+         update_delta = self.shadow[name] - x
+         self.shadow[name] -= (1.0 - self.decay) * update_delta
+
+
+ def apply_moving_average(model, ema):
+     for name, param in model.named_parameters():
+         if name in ema.shadow:
+             ema.update(name, param.data)
+
+
+ def register_model_to_ema(model, ema):
+     for name, param in model.named_parameters():
+         if param.requires_grad:
+             ema.register(name, param.data)
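+
+ # Illustrative sketch of the EMA helpers above (`model` is an assumed
+ # nn.Module): register once, then update the shadow after each step.
+ #
+ #     ema = ExponentialMovingAverage(decay=0.999)
+ #     register_model_to_ema(model, ema)  # snapshot trainable parameters
+ #     # ... after every optimizer step:
+ #     apply_moving_average(model, ema)   # shadow <- decay*shadow + (1-decay)*param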
+
+
+ class YParams(HParams):
+     def __init__(self, yaml_file):
+         if not os.path.exists(yaml_file):
+             raise IOError("yaml file {} does not exist".format(yaml_file))
+         super().__init__()
+         self.d = collections.OrderedDict()
+         with open(yaml_file) as fp:
+             for _, v in yaml().load(fp).items():
+                 for k1, v1 in v.items():
+                     try:
+                         if self.get(k1):
+                             self.set_hparam(k1, v1)
+                         else:
+                             self.add_hparam(k1, v1)
+                         self.d[k1] = v1
+                     except Exception:
+                         import traceback
+
+                         print(traceback.format_exc())
+
+     # @property
+     def get_elements(self):
+         return self.d.items()
+
+
+ def override_config(base_config, new_config):
+     """Update new configurations in the original dict with the new dict
+
+     Args:
+         base_config (dict): original dict to be overridden
+         new_config (dict): dict with new configurations
+
+     Returns:
+         dict: updated configuration dict
+     """
+     for k, v in new_config.items():
+         if type(v) == dict:
+             if k not in base_config.keys():
+                 base_config[k] = {}
+             base_config[k] = override_config(base_config[k], v)
+         else:
+             base_config[k] = v
+     return base_config
+
+
+ def get_lowercase_keys_config(cfg):
+     """Change all keys in cfg to lower case
+
+     Args:
+         cfg (dict): dictionary that stores configurations
+
+     Returns:
+         dict: dictionary that stores configurations
+     """
+     updated_cfg = dict()
+     for k, v in cfg.items():
+         if type(v) == dict:
+             v = get_lowercase_keys_config(v)
+         updated_cfg[k.lower()] = v
+     return updated_cfg
+
+
+ def _load_config(config_fn, lowercase=False):
+     """Load configurations into a dictionary
+
+     Args:
+         config_fn (str): path to configuration file
+         lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
+
+     Returns:
+         dict: dictionary that stores configurations
+     """
+     with open(config_fn, "r") as f:
+         data = f.read()
+     config_ = json5.loads(data)
+     if "base_config" in config_:
+         # load configurations from new path
+         try:
+             p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
+         except Exception:
+             p_config_path = config_["base_config"]
+         p_config_ = _load_config(p_config_path)
+         config_ = override_config(p_config_, config_)
+     if lowercase:
+         # change keys in config_ to lower case
+         config_ = get_lowercase_keys_config(config_)
+     return config_
+
+
+ def load_config(config_fn, lowercase=False):
+     """Load configurations into a dictionary
+
+     Args:
+         config_fn (str): path to configuration file
+         lowercase (bool, optional): whether to change keys to lower case. Defaults to False.
+
+     Returns:
+         JsonHParams: an object that stores configurations
+     """
+     config_ = _load_config(config_fn, lowercase=lowercase)
+     # create a JsonHParams object with configuration dict
+     cfg = JsonHParams(**config_)
+     return cfg
+
+
+ def save_config(save_path, cfg):
+     """Save configurations into a json file
+
+     Args:
+         save_path (str): path to save configurations
+         cfg (dict): dictionary that stores configurations
+     """
+     with open(save_path, "w") as f:
+         json5.dump(
+             cfg, f, ensure_ascii=False, indent=4, quote_keys=True, sort_keys=True
+         )
+
+
+ class JsonHParams:
+     def __init__(self, **kwargs):
+         for k, v in kwargs.items():
+             if type(v) == dict:
+                 v = JsonHParams(**v)
+             self[k] = v
+
+     def keys(self):
+         return self.__dict__.keys()
+
+     def items(self):
+         return self.__dict__.items()
+
+     def values(self):
+         return self.__dict__.values()
+
+     def __len__(self):
+         return len(self.__dict__)
+
+     def __getitem__(self, key):
+         return getattr(self, key)
+
+     def __setitem__(self, key, value):
+         return setattr(self, key, value)
+
+     def __contains__(self, key):
+         return key in self.__dict__
+
+     def __repr__(self):
+         return self.__dict__.__repr__()
+
+
+ class ValueWindow:
+     def __init__(self, window_size=100):
+         self._window_size = window_size
+         self._values = []
+
+     def append(self, x):
+         self._values = self._values[-(self._window_size - 1) :] + [x]
+
+     @property
+     def sum(self):
+         return sum(self._values)
+
+     @property
+     def count(self):
+         return len(self._values)
+
+     @property
+     def average(self):
+         return self.sum / max(1, self.count)
+
+     def reset(self):
+         self._values = []
+
+
+ class Logger(object):
+     def __init__(
+         self,
+         filename,
+         level="info",
+         when="D",
+         backCount=10,
+         fmt="%(asctime)s : %(message)s",
+     ):
+         self.level_relations = {
+             "debug": logging.DEBUG,
+             "info": logging.INFO,
+             "warning": logging.WARNING,
+             "error": logging.ERROR,
+             "crit": logging.CRITICAL,
+         }
+         if level == "debug":
+             fmt = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"
+         self.logger = logging.getLogger(filename)
+         format_str = logging.Formatter(fmt)
+         self.logger.setLevel(self.level_relations.get(level))
+         sh = logging.StreamHandler()
+         sh.setFormatter(format_str)
+         th = handlers.TimedRotatingFileHandler(
+             filename=filename, when=when, backupCount=backCount, encoding="utf-8"
+         )
+         th.setFormatter(format_str)
+         self.logger.addHandler(sh)
+         self.logger.addHandler(th)
+         self.logger.info(
+             "==========================New Starting Here=============================="
+         )
+
+
+ def init_weights(m, mean=0.0, std=0.01):
+     classname = m.__class__.__name__
+     if classname.find("Conv") != -1:
+         m.weight.data.normal_(mean, std)
+
+
+ def get_padding(kernel_size, dilation=1):
+     return int((kernel_size * dilation - dilation) / 2)
+
+
+ def slice_segments(x, ids_str, segment_size=4):
+     ret = torch.zeros_like(x[:, :, :segment_size])
+     for i in range(x.size(0)):
+         idx_str = ids_str[i]
+         idx_end = idx_str + segment_size
+         ret[i] = x[i, :, idx_str:idx_end]
+     return ret
+
+
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
+     b, d, t = x.size()
+     if x_lengths is None:
+         x_lengths = t
+     ids_str_max = x_lengths - segment_size + 1
+     ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
+     ret = slice_segments(x, ids_str, segment_size)
+     return ret, ids_str
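+
+ # Illustrative sketch: randomly crop aligned 4-frame windows from a batch.
+ #
+ #     x = torch.randn(3, 192, 32)                  # [B, D, T]
+ #     segments, ids_str = rand_slice_segments(x, segment_size=4)
+ #     # segments: [3, 192, 4]; ids_str holds each sample's start frame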
+
+
+ def subsequent_mask(length):
+     mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
+     return mask
+
+
+ @torch.jit.script
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+     n_channels_int = n_channels[0]
+     in_act = input_a + input_b
+     t_act = torch.tanh(in_act[:, :n_channels_int, :])
+     s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+     acts = t_act * s_act
+     return acts
+
+
+ def convert_pad_shape(pad_shape):
+     l = pad_shape[::-1]
+     pad_shape = [item for sublist in l for item in sublist]
+     return pad_shape
+
+
+ def sequence_mask(length, max_length=None):
+     if max_length is None:
+         max_length = length.max()
+     x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+     return x.unsqueeze(0) < length.unsqueeze(1)
+
+
+ def generate_path(duration, mask):
+     """
+     duration: [b, 1, t_x]
+     mask: [b, 1, t_y, t_x]
+     """
+     device = duration.device
+
+     b, _, t_y, t_x = mask.shape
+     cum_duration = torch.cumsum(duration, -1)
+
+     cum_duration_flat = cum_duration.view(b * t_x)
+     path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
+     path = path.view(b, t_x, t_y)
+     path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
+     path = path.unsqueeze(1).transpose(2, 3) * mask
+     return path
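+
+ # Illustrative sketch: durations [2, 3] over 5 output frames yield a hard
+ # monotonic alignment (frames 0-1 -> token 0, frames 2-4 -> token 1):
+ #
+ #     duration = torch.tensor([[[2.0, 3.0]]])  # [b, 1, t_x]
+ #     mask = torch.ones(1, 1, 5, 2)             # [b, 1, t_y, t_x]
+ #     generate_path(duration, mask)             # one-hot path, shape [1, 1, 5, 2]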
+
+
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
+     if isinstance(parameters, torch.Tensor):
+         parameters = [parameters]
+     parameters = list(filter(lambda p: p.grad is not None, parameters))
+     norm_type = float(norm_type)
+     if clip_value is not None:
+         clip_value = float(clip_value)
+
+     total_norm = 0
+     for p in parameters:
+         param_norm = p.grad.data.norm(norm_type)
+         total_norm += param_norm.item() ** norm_type
+         if clip_value is not None:
+             p.grad.data.clamp_(min=-clip_value, max=clip_value)
+     total_norm = total_norm ** (1.0 / norm_type)
+     return total_norm
+
+
+ def get_current_time():
+     pass
+
+
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+     """
+     Args:
+       lengths:
+         A 1-D tensor containing sentence lengths.
+       max_len:
+         The length of masks.
+     Returns:
+       Return a 2-D bool tensor, where masked positions
+       are filled with `True` and non-masked positions are
+       filled with `False`.
+
+     >>> lengths = torch.tensor([1, 3, 2, 5])
+     >>> make_pad_mask(lengths)
+     tensor([[False, True, True, True, True],
+             [False, False, False, True, True],
+             [False, False, True, True, True],
+             [False, False, False, False, False]])
+     """
+     assert lengths.ndim == 1, lengths.ndim
+     max_len = max(max_len, lengths.max())
+     n = lengths.size(0)
+     seq_range = torch.arange(0, max_len, device=lengths.device)
+     expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+
+     return expanded_lengths >= lengths.unsqueeze(-1)
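+
+ # Note: make_pad_mask marks padding with True, while sequence_mask (above)
+ # marks valid positions with True; for the same 1-D lengths tensor the two
+ # are logical complements:
+ #
+ #     lengths = torch.tensor([1, 3, 2, 5])
+ #     assert torch.equal(make_pad_mask(lengths), ~sequence_mask(lengths))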