Commit 64ec292 committed by xjsc0
Parent: 4ed4cff
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +47 -0
  2. src/YingMusicSinger/infer/YingMusicSinger.py +263 -0
  3. src/YingMusicSinger/melody/Gconform.py +298 -0
  4. src/YingMusicSinger/melody/Gconv.py +60 -0
  5. src/YingMusicSinger/melody/SmoothMelody.py +144 -0
  6. src/YingMusicSinger/melody/midi_extractor.py +208 -0
  7. src/YingMusicSinger/models/__init__.py +1 -0
  8. src/YingMusicSinger/models/dit.py +472 -0
  9. src/YingMusicSinger/models/model.py +423 -0
  10. src/YingMusicSinger/models/modules.py +961 -0
  11. src/YingMusicSinger/utils/f5_tts/g2p/g2p/__init__.py +91 -0
  12. src/YingMusicSinger/utils/f5_tts/g2p/g2p/chinese_model_g2p.py +209 -0
  13. src/YingMusicSinger/utils/f5_tts/g2p/g2p/cleaners.py +28 -0
  14. src/YingMusicSinger/utils/f5_tts/g2p/g2p/english.py +202 -0
  15. src/YingMusicSinger/utils/f5_tts/g2p/g2p/french.py +149 -0
  16. src/YingMusicSinger/utils/f5_tts/g2p/g2p/german.py +94 -0
  17. src/YingMusicSinger/utils/f5_tts/g2p/g2p/korean.py +81 -0
  18. src/YingMusicSinger/utils/f5_tts/g2p/g2p/mandarin.py +603 -0
  19. src/YingMusicSinger/utils/f5_tts/g2p/g2p/text_tokenizers.py +82 -0
  20. src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json +372 -0
  21. src/YingMusicSinger/utils/f5_tts/g2p/g2p_generation.py +129 -0
  22. src/YingMusicSinger/utils/f5_tts/g2p/infer_dpo.py +277 -0
  23. src/YingMusicSinger/utils/f5_tts/g2p/sources/bpmf_2_pinyin.txt +41 -0
  24. src/YingMusicSinger/utils/f5_tts/g2p/sources/chinese_lexicon.txt +3 -0
  25. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/config.json +819 -0
  26. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/poly_bert_model.onnx +3 -0
  27. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polychar.txt +159 -0
  28. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polydict.json +393 -0
  29. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polydict_r.json +393 -0
  30. src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/vocab.txt +0 -0
  31. src/YingMusicSinger/utils/f5_tts/g2p/sources/pinyin_2_bpmf.txt +429 -0
  32. src/YingMusicSinger/utils/f5_tts/g2p/utils/front_utils.py +18 -0
  33. src/YingMusicSinger/utils/f5_tts/g2p/utils/g2p.py +139 -0
  34. src/YingMusicSinger/utils/f5_tts/g2p/utils/log.py +52 -0
  35. src/YingMusicSinger/utils/f5_tts/g2p/utils/mls_en.json +335 -0
  36. src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/LangSegment.py +1251 -0
  37. src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/__init__.py +24 -0
  38. src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/utils/__init__.py +0 -0
  39. src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/utils/num.py +332 -0
  40. src/YingMusicSinger/utils/stable_audio_tools/__init__.py +0 -0
  41. src/YingMusicSinger/utils/stable_audio_tools/adp.py +1686 -0
  42. src/YingMusicSinger/utils/stable_audio_tools/autoencoders.py +975 -0
  43. src/YingMusicSinger/utils/stable_audio_tools/blocks.py +398 -0
  44. src/YingMusicSinger/utils/stable_audio_tools/bottleneck copy.py +393 -0
  45. src/YingMusicSinger/utils/stable_audio_tools/bottleneck.py +393 -0
  46. src/YingMusicSinger/utils/stable_audio_tools/conditioners.py +664 -0
  47. src/YingMusicSinger/utils/stable_audio_tools/diffusion.py +740 -0
  48. src/YingMusicSinger/utils/stable_audio_tools/dit.py +451 -0
  49. src/YingMusicSinger/utils/stable_audio_tools/factory.py +185 -0
  50. src/YingMusicSinger/utils/stable_audio_tools/pretransforms.py +425 -0
.gitattributes ADDED
@@ -0,0 +1,47 @@
1
+ *.jpg filter=lfs diff=lfs merge=lfs -text
2
+ *.gif filter=lfs diff=lfs merge=lfs -text
3
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
4
+ *.avi filter=lfs diff=lfs merge=lfs -text
5
+ *.dylib filter=lfs diff=lfs merge=lfs -text
6
+ *.npz filter=lfs diff=lfs merge=lfs -text
7
+ *.svg filter=lfs diff=lfs merge=lfs -text
8
+ *.wav filter=lfs diff=lfs merge=lfs -text
9
+ *.m4a filter=lfs diff=lfs merge=lfs -text
10
+ *.zip filter=lfs diff=lfs merge=lfs -text
11
+ *.pth filter=lfs diff=lfs merge=lfs -text
12
+ *.pkl filter=lfs diff=lfs merge=lfs -text
13
+ *.mkv filter=lfs diff=lfs merge=lfs -text
14
+ *.tar filter=lfs diff=lfs merge=lfs -text
15
+ *.docx filter=lfs diff=lfs merge=lfs -text
16
+ *.ppt filter=lfs diff=lfs merge=lfs -text
17
+ *.pptx filter=lfs diff=lfs merge=lfs -text
18
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
19
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
20
+ *.ico filter=lfs diff=lfs merge=lfs -text
21
+ *.flac filter=lfs diff=lfs merge=lfs -text
22
+ *.ogg filter=lfs diff=lfs merge=lfs -text
23
+ *.xls filter=lfs diff=lfs merge=lfs -text
24
+ *.pickle filter=lfs diff=lfs merge=lfs -text
25
+ *.webm filter=lfs diff=lfs merge=lfs -text
26
+ *.doc filter=lfs diff=lfs merge=lfs -text
27
+ *.onnx filter=lfs diff=lfs merge=lfs -text
28
+ *.gz filter=lfs diff=lfs merge=lfs -text
29
+ *.bin filter=lfs diff=lfs merge=lfs -text
30
+ *.h5 filter=lfs diff=lfs merge=lfs -text
31
+ *.png filter=lfs diff=lfs merge=lfs -text
32
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
33
+ *.7z filter=lfs diff=lfs merge=lfs -text
34
+ *.rar filter=lfs diff=lfs merge=lfs -text
35
+ *.pdf filter=lfs diff=lfs merge=lfs -text
36
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
37
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
38
+ *.dll filter=lfs diff=lfs merge=lfs -text
39
+ *.npy filter=lfs diff=lfs merge=lfs -text
40
+ *.bmp filter=lfs diff=lfs merge=lfs -text
41
+ *.mov filter=lfs diff=lfs merge=lfs -text
42
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
43
+ *.pt filter=lfs diff=lfs merge=lfs -text
44
+ *.so filter=lfs diff=lfs merge=lfs -text
45
+ *.hdf5 filter=lfs diff=lfs merge=lfs -text
46
+ *.ttf filter=lfs diff=lfs merge=lfs -text
47
+ src/YingMusicSinger/utils/f5_tts/g2p/sources/chinese_lexicon.txt filter=lfs diff=lfs merge=lfs -text
src/YingMusicSinger/infer/YingMusicSinger.py ADDED
@@ -0,0 +1,263 @@
1
+ import hydra
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchaudio
5
+ from einops import rearrange
6
+ from ema_pytorch import EMA
7
+ from huggingface_hub import PyTorchModelHubMixin
8
+ from omegaconf import OmegaConf
9
+
10
+ from src.YingMusicSinger.melody.midi_extractor import MIDIExtractor
11
+ from src.YingMusicSinger.models.model import Singer
12
+ from src.YingMusicSinger.utils.cnen_tokenizer import CNENTokenizer
13
+ from src.YingMusicSinger.utils.lrc_align import (
14
+ align_lrc_put_to_front,
15
+ align_lrc_sentence_level,
16
+ )
17
+ from src.YingMusicSinger.utils.mel_spectrogram import MelodySpectrogram
18
+ from src.YingMusicSinger.utils.stable_audio_tools.vae_copysyn import StableAudioInfer
19
+
20
+
21
+ class YingMusicSinger(nn.Module, PyTorchModelHubMixin):
22
+ def __init__(
23
+ self,
24
+ model_cfg_path,
25
+ ckpt_path=None,
26
+ vae_config_path=None,
27
+ vae_ckpt_path=None,
28
+ midi_teacher_ckpt_path=None,
29
+ is_distilled=False,
30
+ use_ema=True,
31
+ ):
32
+ super().__init__()
33
+ self.cfg = OmegaConf.load(model_cfg_path)
34
+ model_cls = hydra.utils.get_class(
35
+ f"src.YingMusicSinger.models.{self.cfg.model.backbone}"
36
+ )
37
+ self.melody_input_source = self.cfg.model.melody_input_source
38
+ self.is_tts_pretrain = self.cfg.model.is_tts_pretrain
39
+
40
+ self.model = Singer(
41
+ transformer=model_cls(
42
+ **self.cfg.model.arch,
43
+ text_num_embeds=self.cfg.datasets_cfg.text_num_embeds,
44
+ mel_dim=self.cfg.model.mel_spec.n_mel_channels,
45
+ use_guidance_scale_embed=is_distilled,
46
+ ),
47
+ mel_spec_kwargs=self.cfg.model.mel_spec,
48
+ is_tts_pretrain=self.is_tts_pretrain,
49
+ melody_input_source=self.melody_input_source,
50
+ cka_disabled=self.cfg.model.cka_disabled,
51
+ num_channels=None,
52
+ extra_parameters=self.cfg.extra_parameters,
53
+ distill_stage=1,
54
+ use_guidance_scale_embed=is_distilled,
55
+ )
56
+
57
+ self.vae = StableAudioInfer(
58
+ model_config_path=vae_config_path,
59
+ model_ckpt_path=vae_ckpt_path,
60
+ )
61
+
62
+ self._need_midi = self.melody_input_source in {
63
+ "some_pretrain",
64
+ "some_pretrain_fuzzdisturb",
65
+ "some_pretrain_postprocess_embedding",
66
+ }
67
+ self.midi_teacher = None
68
+ if self._need_midi:
69
+ self.midi_teacher = MIDIExtractor()
70
+ if midi_teacher_ckpt_path is not None:
71
+ self.midi_teacher._load_form_ckpt(midi_teacher_ckpt_path)
72
+ for p in self.midi_teacher.parameters():
73
+ p.requires_grad = False
74
+
75
+ self.melody_spectrogram_extract = MelodySpectrogram()
76
+
77
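+ # Latent frame rate of the Stable Audio VAE: 44100 Hz audio with 2048x downsampling ≈ 21.5 latent frames per second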
+ self.vae_frame_rate = 44100 / 2048
78
+
79
+ if ckpt_path is not None:
80
+ ckpt = torch.load(ckpt_path, map_location="cpu")
81
+ if use_ema:
82
+ ema_model = EMA(self.model, include_online_model=False)
83
+ ema_model.load_state_dict(ckpt["ema_model_state_dict"])
84
+
85
+ self.model = ema_model.ema_model
86
+ else:
87
+ self.model.load_state_dict(ckpt["model_state_dict"])
88
+
89
+ self.cnen_tokenizer = CNENTokenizer()
90
+
91
+ @property
92
+ def device(self):
93
+ return next(self.parameters()).device
94
+
95
+ def prepare_input(
96
+ self,
97
+ ref_audio_path,
98
+ melody_audio_path,
99
+ ref_text,
100
+ target_text,
101
+ sil_len_to_end,
102
+ lrc_align_mode,
103
+ ):
104
+ ref_audio, ref_audio_sr = torchaudio.load(ref_audio_path)
105
+ silence = torch.zeros(ref_audio.shape[0], int(ref_audio_sr * sil_len_to_end))
106
+ ref_wav = torch.cat([ref_audio, silence], dim=1)
107
+ ref_latent = self.vae.encode_audio(ref_wav, in_sr=ref_audio_sr).transpose(
108
+ 1, 2
109
+ ) # [B, T, D]
110
+
111
+ melody_wav, melody_sr = torchaudio.load(melody_audio_path)
112
+ melody_latent = self.vae.encode_audio(melody_wav, in_sr=melody_sr).transpose(
113
+ 1, 2
114
+ ) # [B, T, D]
115
+
116
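+ # Melody-branch input: reference latent followed by the melody latent along the time axis (zeroed below in TTS-pretrain mode)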
+ midi_in = torch.cat([ref_latent, melody_latent], dim=1)
117
+ if self.is_tts_pretrain:
118
+ midi_in = torch.zeros_like(midi_in)
119
+
120
+ ref_latent_len = ref_latent.shape[1]
121
+ total_len = int(ref_latent.shape[1] + melody_latent.shape[1])
122
+
123
+ if self._need_midi:
124
+ ref_mel = self.melody_spectrogram_extract(audio=ref_wav, sr=ref_audio_sr)
125
+ melody_mel = self.melody_spectrogram_extract(audio=melody_wav, sr=melody_sr)
126
+ melody_mel_spec = torch.cat([ref_mel, melody_mel], dim=2)
127
+ else:
128
+ raise NotImplementedError()
129
+
130
+ assert isinstance(ref_text, str) and isinstance(target_text, str)
131
+ text_list = [ref_text] + [target_text]
132
+
133
+ if lrc_align_mode == "put_to_front":
134
+ lrc_token, _ = align_lrc_put_to_front(
135
+ tokenizer=self.cnen_tokenizer,
136
+ lrc_start_times=None,
137
+ lrc_lines=text_list,
138
+ total_lens=total_len,
139
+ )
140
+ elif lrc_align_mode == "sentence_level":
141
+ lrc_token, _ = align_lrc_sentence_level(
142
+ tokenizer=self.cnen_tokenizer,
143
+ lrc_start_times=[0.0, ref_latent_len / self.vae_frame_rate],
144
+ lrc_lines=text_list,
145
+ total_lens=total_len,
146
+ vae_frame_rate=self.vae_frame_rate,
147
+ )
148
+ else:
149
+ raise ValueError(f"Unsupported lrc_align_mode: {lrc_align_mode}")
150
+
151
+ text_tokens = (
152
+ torch.tensor(lrc_token, dtype=torch.int64).unsqueeze(0).to(self.device)
153
+ )
154
+
155
+ midi_p, bound_p = None, None
156
+ if self._need_midi:
157
+ with torch.no_grad():
158
+ midi_p, bound_p = self.midi_teacher(melody_mel_spec.transpose(1, 2))
159
+
160
+ return (
161
+ ref_latent,
162
+ ref_latent_len,
163
+ text_tokens,
164
+ total_len,
165
+ midi_in,
166
+ midi_p,
167
+ bound_p,
168
+ )
169
+
170
+ def forward(
171
+ self,
172
+ ref_audio_path,
173
+ melody_audio_path,
174
+ ref_text,
175
+ target_text,
176
+ lrc_align_mode: str = "sentence_level",
177
+ sil_len_to_end: float = 0.5,
178
+ t_shift: float = 0.5,
179
+ nfe_step: int = 32,
180
+ cfg_strength: float = 3.0,
181
+ seed: int = 666,
182
+ is_tts_pretrain: bool = False,
183
+ ):
184
+ """
185
+ Args:
186
+ ref_audio_path: Path to the reference audio (for timbre)
187
+ melody_audio_path: Path to the melody reference audio (provides target duration and melody information)
188
+ ref_text: Text corresponding to the reference audio
189
+ target_text: Target text to be synthesized
190
+ lrc_align_mode: Lyric alignment mode "sentence_level" | "put_to_front"
191
+ sil_len_to_end: Duration of silence appended to the end of the reference audio (seconds)
192
+ t_shift: Sampling time offset
193
+ nfe_step: ODE sampling steps
194
+ cfg_strength: CFG strength
195
+ seed: Random seed
196
+ is_tts_pretrain: If True, melody is not provided (TTS mode)
197
+ """
198
+ ref_latent, ref_latent_len, text_tokens, total_len, midi_in, midi_p, bound_p = (
199
+ self.prepare_input(
200
+ ref_audio_path=ref_audio_path,
201
+ melody_audio_path=melody_audio_path,
202
+ ref_text=ref_text,
203
+ target_text=target_text,
204
+ sil_len_to_end=sil_len_to_end,
205
+ lrc_align_mode=lrc_align_mode,
206
+ )
207
+ )
208
+
209
+ assert midi_p is not None and bound_p is not None
210
+ with torch.inference_mode():
211
+ generated_latent, _ = self.model.sample(
212
+ cond=ref_latent,
213
+ midi_in=midi_in,
214
+ text=text_tokens,
215
+ duration=total_len,
216
+ steps=nfe_step,
217
+ cfg_strength=cfg_strength,
218
+ sway_sampling_coef=None,
219
+ use_epss=False,
220
+ seed=seed,
221
+ midi_p=midi_p,
222
+ t_shift=t_shift,
223
+ bound_p=bound_p,
224
+ guidance_scale=cfg_strength,
225
+ )
226
+ generated_latent = generated_latent.to(torch.float32)
227
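+ # Drop the reference-prompt frames and keep only the newly generated portion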
+ generated_latent = generated_latent[:, ref_latent_len:, :]
228
+ generated_latent = generated_latent.permute(0, 2, 1) # [B, D, T]
229
+
230
+ generated_audio = self.vae.decode_audio(generated_latent)
231
+ audio = rearrange(generated_audio, "b d n -> d (b n)")
232
+
233
+ audio = audio.to(torch.float32).cpu()
234
+
235
+ return audio, 44100
236
+
237
+
238
+ if __name__ == "__main__":
239
+ # === Export to HuggingFace safetensors (optional) ===
240
+ # model = YingMusicSinger(
241
+ # model_cfg_path="src/YingMusicSinger/config/YingMusic_Singer.yaml",
242
+ # ckpt_path="ckpts/YingMusicSinger_model.pt",
243
+ # vae_config_path="src/YingMusicSinger/config/stable_audio_2_0_vae_20hz_official.json",
244
+ # vae_ckpt_path="ckpts/stable_audio_2_0_vae_20hz_official.ckpt",
245
+ # midi_teacher_ckpt_path="ckpts/model_ckpt_steps_100000_simplified.ckpt",
246
+ # )
247
+ # model.save_pretrained("path/to/save")
248
+
249
+ # === Inference Example ===
250
+ model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
251
+ model.to("cuda:0")
252
+ model.eval()
253
+
254
+ waveform, sample_rate = model(
255
+ ref_audio_path="path/to/ref_audio", # Timbre reference audio
256
+ melody_audio_path="path/to/melody_audio", # Melody-providing singing clip
257
+ ref_text="oh the reason i hold on", # Lyrics corresponding to ref_audio
258
+ target_text="oldest book broken watch|bare feet in grassy spot", # Modified target lyrics
259
+ seed=42,
260
+ )
261
+
262
+ torchaudio.save("output.wav", waveform, sample_rate=sample_rate)
263
+ print("Saved to output.wav")
src/YingMusicSinger/melody/Gconform.py ADDED
@@ -0,0 +1,298 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from einops import rearrange
5
+
6
+
7
+ class GLU(nn.Module):
8
+ def __init__(self, dim):
9
+ super().__init__()
10
+ self.dim = dim
11
+
12
+ def forward(self, x):
13
+ out, gate = x.chunk(2, dim=self.dim)
14
+
15
+ return out * gate.sigmoid()
16
+
17
+
18
+ class conform_conv(nn.Module):
19
+ def __init__(
20
+ self, channels: int, kernel_size: int = 31, DropoutL=0.1, bias: bool = True
21
+ ):
22
+ super().__init__()
23
+ self.act2 = nn.SiLU()
24
+ self.act1 = GLU(1)
25
+
26
+ self.pointwise_conv1 = nn.Conv1d(
27
+ channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias
28
+ )
29
+
30
+ # This depthwise convolution is symmetric (non-causal): the kernel size is
31
+ # required to be odd so that (kernel_size - 1) // 2 frames of padding can be
32
+ # applied on both sides, keeping the output length equal to the input length.
33
+ # (A causal variant would instead left-pad the input by kernel_size - 1 frames
34
+ # and use no right padding.)
35
+
36
+ assert (kernel_size - 1) % 2 == 0
37
+ padding = (kernel_size - 1) // 2
38
+
39
+ self.depthwise_conv = nn.Conv1d(
40
+ channels,
41
+ channels,
42
+ kernel_size,
43
+ stride=1,
44
+ padding=padding,
45
+ groups=channels,
46
+ bias=bias,
47
+ )
48
+
49
+ self.norm = nn.BatchNorm1d(channels)
50
+
51
+ self.pointwise_conv2 = nn.Conv1d(
52
+ channels, channels, kernel_size=1, stride=1, padding=0, bias=bias
53
+ )
54
+ self.drop = nn.Dropout(DropoutL) if DropoutL > 0.0 else nn.Identity()
55
+
56
+ def forward(self, x):
57
+ x = x.transpose(1, 2)
58
+ x = self.act1(self.pointwise_conv1(x))
59
+ x = self.depthwise_conv(x)
60
+ x = self.norm(x)
61
+ x = self.act2(x)
62
+ x = self.pointwise_conv2(x)
63
+ return self.drop(x).transpose(1, 2)
64
+
65
+
66
+ class Attention(nn.Module):
67
+ def __init__(self, dim, heads=4, dim_head=32, conditiondim=None):
68
+ super().__init__()
69
+ if conditiondim is None:
70
+ conditiondim = dim
71
+
72
+ self.scale = dim_head**-0.5
73
+ self.heads = heads
74
+ hidden_dim = dim_head * heads
75
+ self.to_q = nn.Linear(dim, hidden_dim, bias=False)
76
+ self.to_kv = nn.Linear(conditiondim, hidden_dim * 2, bias=False)
77
+
78
+ self.to_out = nn.Sequential(
79
+ nn.Linear(
80
+ hidden_dim,
81
+ dim,
82
+ ),
83
+ )
84
+
85
+ def forward(self, q, kv=None, mask=None):
86
+ # b, c, h, w = x.shape
87
+ if kv is None:
88
+ kv = q
89
+ # q, kv = map(
90
+ # lambda t: rearrange(t, "b c t -> b t c", ), (q, kv)
91
+ # )
92
+
93
+ q = self.to_q(q)
94
+ k, v = self.to_kv(kv).chunk(2, dim=2)
95
+
96
+ q, k, v = map(
97
+ lambda t: rearrange(t, "b t (h c) -> b h t c", h=self.heads), (q, k, v)
98
+ )
99
+
100
+ if mask is not None:
101
+ mask = mask.unsqueeze(1).unsqueeze(1)
102
+
103
+ with torch.backends.cuda.sdp_kernel(enable_math=False):
104
+ out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
105
+
106
+ out = rearrange(
107
+ out,
108
+ "b h t c -> b t (h c) ",
109
+ h=self.heads,
110
+ )
111
+ return self.to_out(out)
112
+
113
+
114
+ class conform_ffn(nn.Module):
115
+ def __init__(self, dim, DropoutL1: float = 0.1, DropoutL2: float = 0.1):
116
+ super().__init__()
117
+ self.ln1 = nn.Linear(dim, dim * 4)
118
+ self.ln2 = nn.Linear(dim * 4, dim)
119
+ self.drop1 = nn.Dropout(DropoutL1) if DropoutL1 > 0.0 else nn.Identity()
120
+ self.drop2 = nn.Dropout(DropoutL2) if DropoutL2 > 0.0 else nn.Identity()
121
+ self.act = nn.SiLU()
122
+
123
+ def forward(self, x):
124
+ x = self.ln1(x)
125
+ x = self.act(x)
126
+ x = self.drop1(x)
127
+ x = self.ln2(x)
128
+ return self.drop2(x)
129
+
130
+
131
+ class conform_blocke(nn.Module):
132
+ def __init__(
133
+ self,
134
+ dim: int,
135
+ kernel_size: int = 31,
136
+ conv_drop: float = 0.1,
137
+ ffn_latent_drop: float = 0.1,
138
+ ffn_out_drop: float = 0.1,
139
+ attention_drop: float = 0.1,
140
+ attention_heads: int = 4,
141
+ attention_heads_dim: int = 64,
142
+ ):
143
+ super().__init__()
144
+ self.ffn1 = conform_ffn(dim, ffn_latent_drop, ffn_out_drop)
145
+ self.ffn2 = conform_ffn(dim, ffn_latent_drop, ffn_out_drop)
146
+ self.att = Attention(dim, heads=attention_heads, dim_head=attention_heads_dim)
147
+ self.attdrop = (
148
+ nn.Dropout(attention_drop) if attention_drop > 0.0 else nn.Identity()
149
+ )
150
+ self.conv = conform_conv(
151
+ dim,
152
+ kernel_size=kernel_size,
153
+ DropoutL=conv_drop,
154
+ )
155
+ self.norm1 = nn.LayerNorm(dim)
156
+ self.norm2 = nn.LayerNorm(dim)
157
+ self.norm3 = nn.LayerNorm(dim)
158
+ self.norm4 = nn.LayerNorm(dim)
159
+ self.norm5 = nn.LayerNorm(dim)
160
+
161
+ def forward(
162
+ self,
163
+ x,
164
+ mask=None,
165
+ ):
166
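+ # Conformer-style Macaron structure: two feed-forward modules with half-step residuals wrap the attention and convolution modules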
+ x = self.ffn1(self.norm1(x)) * 0.5 + x
167
+
168
+ x = self.attdrop(self.att(self.norm2(x), mask=mask)) + x
169
+ x = self.conv(self.norm3(x)) + x
170
+ x = self.ffn2(self.norm4(x)) * 0.5 + x
171
+ return self.norm5(x)
172
+
173
+ # return x
174
+
175
+
176
+ class Gcf(nn.Module):
177
+ def __init__(
178
+ self,
179
+ dim: int,
180
+ kernel_size: int = 31,
181
+ conv_drop: float = 0.1,
182
+ ffn_latent_drop: float = 0.1,
183
+ ffn_out_drop: float = 0.1,
184
+ attention_drop: float = 0.1,
185
+ attention_heads: int = 4,
186
+ attention_heads_dim: int = 64,
187
+ ):
188
+ super().__init__()
189
+ self.att1 = conform_blocke(
190
+ dim=dim,
191
+ kernel_size=kernel_size,
192
+ conv_drop=conv_drop,
193
+ ffn_latent_drop=ffn_latent_drop,
194
+ ffn_out_drop=ffn_out_drop,
195
+ attention_drop=attention_drop,
196
+ attention_heads=attention_heads,
197
+ attention_heads_dim=attention_heads_dim,
198
+ )
199
+ self.att2 = conform_blocke(
200
+ dim=dim,
201
+ kernel_size=kernel_size,
202
+ conv_drop=conv_drop,
203
+ ffn_latent_drop=ffn_latent_drop,
204
+ ffn_out_drop=ffn_out_drop,
205
+ attention_drop=attention_drop,
206
+ attention_heads=attention_heads,
207
+ attention_heads_dim=attention_heads_dim,
208
+ )
209
+ self.glu1 = nn.Sequential(nn.Linear(dim, dim * 2), GLU(2))
210
+ self.glu2 = nn.Sequential(nn.Linear(dim, dim * 2), GLU(2))
211
+
212
+ def forward(self, midi, bound):
213
+ midi = self.att1(midi)
214
+ bound = self.att2(bound)
215
+ midis = self.glu1(midi)
216
+ bounds = self.glu2(bound)
217
+ return midi + bounds, bound + midis
218
+
219
+
220
+ class Gmidi_conform(nn.Module):
221
+ def __init__(
222
+ self,
223
+ lay: int,
224
+ dim: int,
225
+ indim: int,
226
+ outdim: int,
227
+ use_lay_skip: bool,
228
+ kernel_size: int = 31,
229
+ conv_drop: float = 0.1,
230
+ ffn_latent_drop: float = 0.1,
231
+ ffn_out_drop: float = 0.1,
232
+ attention_drop: float = 0.1,
233
+ attention_heads: int = 4,
234
+ attention_heads_dim: int = 64,
235
+ ):
236
+ super().__init__()
237
+
238
+ self.inln = nn.Linear(indim, dim)
239
+ self.inln1 = nn.Linear(indim, dim)
240
+ self.outln = nn.Linear(dim, outdim)
241
+ self.cutheard = nn.Linear(dim, 1)
242
+ # self.cutheard = nn.Linear(dim, outdim)
243
+ self.lay = lay
244
+ self.use_lay_skip = use_lay_skip
245
+ self.cf_lay = nn.ModuleList(
246
+ [
247
+ Gcf(
248
+ dim=dim,
249
+ kernel_size=kernel_size,
250
+ conv_drop=conv_drop,
251
+ ffn_latent_drop=ffn_latent_drop,
252
+ ffn_out_drop=ffn_out_drop,
253
+ attention_drop=attention_drop,
254
+ attention_heads=attention_heads,
255
+ attention_heads_dim=attention_heads_dim,
256
+ )
257
+ for _ in range(lay)
258
+ ]
259
+ )
260
+ self.att1 = conform_blocke(
261
+ dim=dim,
262
+ kernel_size=kernel_size,
263
+ conv_drop=conv_drop,
264
+ ffn_latent_drop=ffn_latent_drop,
265
+ ffn_out_drop=ffn_out_drop,
266
+ attention_drop=attention_drop,
267
+ attention_heads=attention_heads,
268
+ attention_heads_dim=attention_heads_dim,
269
+ )
270
+ self.att2 = conform_blocke(
271
+ dim=dim,
272
+ kernel_size=kernel_size,
273
+ conv_drop=conv_drop,
274
+ ffn_latent_drop=ffn_latent_drop,
275
+ ffn_out_drop=ffn_out_drop,
276
+ attention_drop=attention_drop,
277
+ attention_heads=attention_heads,
278
+ attention_heads_dim=attention_heads_dim,
279
+ )
280
+
281
+ def forward(self, x, mask=None):
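+ # Two parallel streams: x is refined toward MIDI pitch logits, x1 toward note-boundary
+ # logits; each Gcf layer exchanges gated information between the two streams.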
282
+ x1 = x.clone()
283
+
284
+ x = self.inln(x)
285
+ x1 = self.inln1(x1)
286
+ if mask is not None:
287
+ x = x.masked_fill(~mask.unsqueeze(-1), 0)
288
+ for idx, i in enumerate(self.cf_lay):
289
+ x, x1 = i(x, x1)
290
+
291
+ if mask is not None:
292
+ x = x.masked_fill(~mask.unsqueeze(-1), 0)
293
+ x, x1 = self.att1(x), self.att2(x1)
294
+
295
+ cutprp = self.cutheard(x1)
296
+ midiout = self.outln(x)
297
+
298
+ return midiout, cutprp
src/YingMusicSinger/melody/Gconv.py ADDED
@@ -0,0 +1,60 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ class GLU(nn.Module):
5
+ def __init__(self, dim):
6
+ super().__init__()
7
+ self.dim = dim
8
+
9
+ def forward(self, x):
10
+ out, gate = x.chunk(2, dim=self.dim)
11
+
12
+ return out * gate.sigmoid()
13
+
14
+
15
+ class conform_conv(nn.Module):
16
+ def __init__(
17
+ self, channels: int, kernel_size: int = 31, DropoutL=0.1, bias: bool = True
18
+ ):
19
+ super().__init__()
20
+ self.act2 = nn.SiLU()
21
+ self.act1 = GLU(1)
22
+
23
+ self.pointwise_conv1 = nn.Conv1d(
24
+ channels, 2 * channels, kernel_size=1, stride=1, padding=0, bias=bias
25
+ )
26
+
27
+ # This depthwise convolution is symmetric (non-causal): the kernel size is
28
+ # required to be odd so that (kernel_size - 1) // 2 frames of padding can be
29
+ # applied on both sides, keeping the output length equal to the input length.
30
+ # (A causal variant would instead left-pad the input by kernel_size - 1 frames
31
+ # and use no right padding.)
32
+
33
+ assert (kernel_size - 1) % 2 == 0
34
+ padding = (kernel_size - 1) // 2
35
+
36
+ self.depthwise_conv = nn.Conv1d(
37
+ channels,
38
+ channels,
39
+ kernel_size,
40
+ stride=1,
41
+ padding=padding,
42
+ groups=channels,
43
+ bias=bias,
44
+ )
45
+
46
+ self.norm = nn.BatchNorm1d(channels)
47
+
48
+ self.pointwise_conv2 = nn.Conv1d(
49
+ channels, channels, kernel_size=1, stride=1, padding=0, bias=bias
50
+ )
51
+ self.drop = nn.Dropout(DropoutL) if DropoutL > 0.0 else nn.Identity()
52
+
53
+ def forward(self, x):
54
+ x = x.transpose(1, 2)
55
+ x = self.act1(self.pointwise_conv1(x))
56
+ x = self.depthwise_conv(x)
57
+ x = self.norm(x)
58
+ x = self.act2(x)
59
+ x = self.pointwise_conv2(x)
60
+ return self.drop(x).transpose(1, 2)
src/YingMusicSinger/melody/SmoothMelody.py ADDED
@@ -0,0 +1,144 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ class MIDIFuzzDisturb(nn.Module):
6
+ """Applies fuzzing perturbations to MIDI latent representations.
7
+
8
+ The raw MIDI teacher model output preserves good prosody but causes
9
+ pronunciation interference. This module mitigates that by applying
10
+ blur, temporal dropout, and noise to the melody latent.
11
+ """
12
+
13
+ def __init__(
14
+ self, dim=128, drop_prob=0.3, noise_scale=0.1, blur_kernel=3, drop_type="random"
15
+ ):
16
+ super().__init__()
17
+ self.blur = None
18
+ self.drop_prob = None
19
+ self.noise_scale = None
20
+ self.dim = dim
21
+ self.drop_type = drop_type
22
+
23
+ assert drop_prob is not None
24
+ assert drop_type is not None
25
+ if drop_type == "random":
26
+ # drop_prob is a float
27
+ if drop_prob != 0:
28
+ self.drop_prob = drop_prob
29
+ elif drop_type == "equal_space":
30
+ # drop_prob is a [drop, keep] list, e.g., [1, 1] means 1 frame drop, 1 frame keep
31
+ self.drop_prob = drop_prob
32
+ else:
33
+ raise ValueError(f"Unknown drop_type: {drop_type}")
34
+
35
+ if noise_scale != 0:
36
+ self.noise_scale = noise_scale
37
+ if blur_kernel != 0:
38
+ assert blur_kernel % 2 == 1, f"blur_kernel {blur_kernel} must be odd"
39
+ self.blur = nn.AvgPool1d(
40
+ kernel_size=blur_kernel, stride=1, padding=blur_kernel // 2
41
+ )
42
+
43
+ def _create_equal_space_mask(self, batch_size, seq_len, device):
44
+ """Create an equally-spaced mask cycling [drop, keep] frames."""
45
+ drop_frames, keep_frames = self.drop_prob
46
+ cycle_len = drop_frames + keep_frames
47
+
48
+ # Pattern: first drop_frames are 0 (drop), next keep_frames are 1 (keep)
49
+ pattern = torch.cat(
50
+ [
51
+ torch.zeros(drop_frames, device=device),
52
+ torch.ones(keep_frames, device=device),
53
+ ]
54
+ )
55
+
56
+ # Repeat pattern to cover the full sequence length
57
+ num_repeats = (seq_len + cycle_len - 1) // cycle_len
58
+ mask = pattern.repeat(num_repeats)[:seq_len] # [T]
59
+
60
+ # Expand to [B, T, 1]
61
+ mask = mask.view(1, seq_len, 1).expand(batch_size, -1, -1)
62
+
63
+ return mask
64
+
65
+ def forward(self, x):
66
+ # x: [B, T, D=128], pre-sigmoid logits
67
+ x = torch.sigmoid(x)
68
+
69
+ assert x.shape[-1] == self.dim, (
70
+ f"MIDIFuzzDisturb: expected dim={self.dim}, got {x.shape[-1]}"
71
+ )
72
+
73
+ if self.blur:
74
+ x = self.blur(x.transpose(1, 2)).transpose(1, 2)
75
+
76
+ if self.drop_prob:
77
+ if self.drop_type == "random":
78
+ time_mask = (
79
+ torch.rand(x.shape[0], x.shape[1], 1, device=x.device)
80
+ > self.drop_prob
81
+ )
82
+ x = x * time_mask.float()
83
+ elif self.drop_type == "equal_space":
84
+ time_mask = self._create_equal_space_mask(
85
+ x.shape[0], x.shape[1], x.device
86
+ )
87
+ x = x * time_mask.float()
88
+ else:
89
+ raise ValueError(f"Unknown drop_type: {self.drop_type}")
90
+
91
+ if self.noise_scale:
92
+ noise = torch.randn_like(x) * self.noise_scale
93
+ x = x + noise
94
+
95
+ return x
96
+
97
+
98
+ class MIDIDigitalEmbedding(nn.Module):
99
+ """Embeds continuous MIDI values into discrete token embeddings.
100
+
101
+ Continuous MIDI values in [0, 127] are quantized at a configurable
102
+ resolution (mark_distinguish_scale) and mapped to learned embeddings.
103
+ """
104
+
105
+ def __init__(self, embed_dim=128, num_classes=128, mark_distinguish_scale=2):
106
+ super().__init__()
107
+
108
+ # num_classes covers the input range [0, 127] plus 2 special tokens
109
+ self.num_classes = num_classes + 2
110
+ self.mark_distinguish_scale = mark_distinguish_scale
111
+ self.embedding_input_num_class = self.num_classes * self.mark_distinguish_scale
112
+ self.embedding = nn.Embedding(self.embedding_input_num_class, embed_dim)
113
+
114
+ def midi_to_class(self, midi_values):
115
+ """Map continuous MIDI values to discrete class indices.
116
+
117
+ Args:
118
+ midi_values: [B, T] continuous MIDI values, roughly in [0, 127]
119
+
120
+ Returns:
121
+ class_indices: [B, T] discrete class indices
122
+ """
123
+ # Round to nearest quantization step
124
+ # e.g., with scale=2: 0->0, 0.3->1, 0.5->1, 0.8->2, 1.0->2, ...
125
+ class_indices = torch.round(midi_values * self.mark_distinguish_scale).long()
126
+
127
+ # Clamp to valid range
128
+ class_indices = torch.clamp(
129
+ class_indices, 0, self.embedding_input_num_class - 1
130
+ )
131
+
132
+ return class_indices
133
+
134
+ def forward(self, midi_values):
135
+ """
136
+ Args:
137
+ midi_values: [B, T] continuous MIDI values
138
+
139
+ Returns:
140
+ embeddings: [B, T, embed_dim] embedding vectors
141
+ """
142
+ class_indices = self.midi_to_class(midi_values)
143
+ embeddings = self.embedding(class_indices)
144
+ return embeddings
src/YingMusicSinger/melody/midi_extractor.py ADDED
@@ -0,0 +1,208 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch.nn.utils.rnn import pad_sequence
5
+
6
+ from src.YingMusicSinger.melody.Gconform import Gmidi_conform
7
+
8
+ # midi decoding utils
9
+
10
+
11
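+ # Decode per-frame pitch from Gaussian-blurred probability bins: restrict to a window of
+ # +/- 3 sigma around the argmax bin, take the probability-weighted mean of the bin values,
+ # and flag frames whose peak probability falls below `threshold` as rests.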
+ def decode_gaussian_blurred_probs(probs, vmin, vmax, deviation, threshold):
12
+ num_bins = int(probs.shape[-1])
13
+ interval = (vmax - vmin) / (num_bins - 1)
14
+ width = int(3 * deviation / interval) # 3 * sigma
15
+ idx = torch.arange(num_bins, device=probs.device)[None, None, :] # [1, 1, N]
16
+ idx_values = idx * interval + vmin
17
+ center = torch.argmax(probs, dim=-1, keepdim=True) # [B, T, 1]
18
+ start = torch.clip(center - width, min=0) # [B, T, 1]
19
+ end = torch.clip(center + width + 1, max=num_bins) # [B, T, 1]
20
+ idx_masks = (idx >= start) & (idx < end) # [B, T, N]
21
+ weights = probs * idx_masks # [B, T, N]
22
+ product_sum = torch.sum(weights * idx_values, dim=2) # [B, T]
23
+ weight_sum = torch.sum(weights, dim=2) # [B, T]
24
+ values = product_sum / (
25
+ weight_sum + (weight_sum == 0)
26
+ ) # avoid dividing by zero, [B, T]
27
+ rest = probs.max(dim=-1)[0] < threshold # [B, T]
28
+ return values, rest
29
+
30
+
31
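+ # Convert frame-wise boundary probabilities into a frame-to-note index map: accumulate the
+ # probabilities, treat integer increments of the rounded cumulative sum as note onsets, and
+ # count onsets cumulatively so every frame is assigned the index of the note it belongs to.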
+ def decode_bounds_to_alignment(bounds, use_diff=True):
32
+ bounds_step = bounds.cumsum(dim=1).round().long()
33
+ if use_diff:
34
+ bounds_inc = (
35
+ torch.diff(
36
+ bounds_step,
37
+ dim=1,
38
+ prepend=torch.full(
39
+ (bounds.shape[0], 1),
40
+ fill_value=-1,
41
+ dtype=bounds_step.dtype,
42
+ device=bounds_step.device,
43
+ ),
44
+ )
45
+ > 0
46
+ )
47
+ else:
48
+ bounds_inc = F.pad(
49
+ (bounds_step[:, 1:] > bounds_step[:, :-1]), [1, 0], value=True
50
+ )
51
+ frame2item = bounds_inc.long().cumsum(dim=1)
52
+ return frame2item
53
+
54
+
55
+ def decode_note_sequence(frame2item, values, masks, threshold=0.5):
56
+ """
57
+
58
+ :param frame2item: [1, 1, 1, 1, 2, 2, 3, 3, 3]
59
+ :param values:
60
+ :param masks:
61
+ :param threshold: minimum ratio of unmasked frames required to be regarded as an unmasked item
62
+ :return: item_values, item_dur, item_masks
63
+ """
64
+ b = frame2item.shape[0]
65
+ space = frame2item.max() + 1
66
+
67
+ item_dur = frame2item.new_zeros(b, space, dtype=frame2item.dtype).scatter_add(
68
+ 1, frame2item, torch.ones_like(frame2item)
69
+ )[:, 1:]
70
+ item_unmasked_dur = frame2item.new_zeros(
71
+ b, space, dtype=frame2item.dtype
72
+ ).scatter_add(1, frame2item, masks.long())[:, 1:]
73
+ item_masks = item_unmasked_dur / item_dur >= threshold
74
+
75
+ values_quant = values.round().long()
76
+ histogram = (
77
+ frame2item.new_zeros(b, space * 128, dtype=frame2item.dtype)
78
+ .scatter_add(
79
+ 1, frame2item * 128 + values_quant, torch.ones_like(frame2item) * masks
80
+ )
81
+ .unflatten(1, [space, 128])[:, 1:, :]
82
+ )
83
+ item_values_center = histogram.float().argmax(dim=2).to(dtype=values.dtype)
84
+ values_center = torch.gather(F.pad(item_values_center, [1, 0]), 1, frame2item)
85
+ values_near_center = (
86
+ masks & (values >= values_center - 0.5) & (values <= values_center + 0.5)
87
+ )
88
+ item_valid_dur = frame2item.new_zeros(b, space, dtype=frame2item.dtype).scatter_add(
89
+ 1, frame2item, values_near_center.long()
90
+ )[:, 1:]
91
+ item_values = values.new_zeros(b, space, dtype=values.dtype).scatter_add(
92
+ 1, frame2item, values * values_near_center
93
+ )[:, 1:] / (item_valid_dur + (item_valid_dur == 0))
94
+
95
+ return item_values, item_dur, item_masks
96
+
97
+
98
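+ # Expand note-level values back to frame level: repeat each value by its duration in frames
+ # (counts_tensor), then re-pad the resulting ragged per-sample sequences into a batch tensor.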
+ def expand_batch_padded(feature_tensor, counts_tensor, padding_value=0.0):
99
+ assert feature_tensor.dim() == 2 and counts_tensor.dim() == 2
100
+
101
+ lengths = torch.sum(counts_tensor, dim=1)
102
+
103
+ feature_tensor = feature_tensor.reshape(-1)
104
+ counts_tensor = counts_tensor.reshape(-1)
105
+ expanded_flat = torch.repeat_interleave(feature_tensor, counts_tensor)
106
+
107
+ ragged_list = torch.split(expanded_flat, lengths.tolist())
108
+
109
+ padded_tensor = pad_sequence(
110
+ ragged_list, batch_first=True, padding_value=padding_value
111
+ )
112
+
113
+ return padded_tensor, lengths
114
+
115
+
116
+ class midi_loss(nn.Module):
117
+ def __init__(self):
118
+ super().__init__()
119
+ self.loss = nn.BCELoss()
120
+
121
+ def forward(self, x, target):
122
+ midiout, cutp = x
123
+ midi_target, cutp_target = target
124
+
125
+ cutploss = self.loss(cutp, cutp_target)
126
+ midiloss = self.loss(midiout, midi_target)
127
+ return midiloss, cutploss
128
+
129
+
130
+ class MIDIExtractor(nn.Module):
131
+ def __init__(self, in_dim=None, out_dim=None):
132
+ super().__init__()
133
+
134
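+ # Conformer-based pitch extractor: consumes 80-dimensional (mel-spectrogram) frames and predicts 128 MIDI pitch bins plus a note-boundary logit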
+ cfg = {
135
+ "attention_drop": 0.1,
136
+ "attention_heads": 8,
137
+ "attention_heads_dim": 64,
138
+ "conv_drop": 0.1,
139
+ "dim": 512,
140
+ "ffn_latent_drop": 0.1,
141
+ "ffn_out_drop": 0.1,
142
+ "kernel_size": 31,
143
+ "lay": 8,
144
+ "use_lay_skip": True,
145
+ "indim": 80,
146
+ "outdim": 128,
147
+ }
148
+ if in_dim is not None:
149
+ cfg["indim"] = in_dim
150
+ if out_dim is not None:
151
+ cfg["outdim"] = out_dim
152
+
153
+ self.midi_conform = Gmidi_conform(**cfg)
154
+
155
+ self.midi_min = 0
156
+ self.midi_max = 127
157
+ self.midi_deviation = 1.0
158
+ self.rest_threshold = 0.1
159
+
160
+ def _load_form_ckpt(self, ckpt_path, device="cpu"):
161
+ from collections import OrderedDict
162
+
163
+ if ckpt_path is None:
164
+ raise ValueError("midi_extractor_path is required")
165
+
166
+ state_dict = torch.load(ckpt_path, map_location="cpu")["state_dict"]
167
+ prefix_in_ckpt = "model.model"
168
+ state_dict = OrderedDict(
169
+ {
170
+ k.replace(f"{prefix_in_ckpt}.", "midi_conform."): v
171
+ for k, v in state_dict.items()
172
+ if k.startswith(f"{prefix_in_ckpt}.")
173
+ }
174
+ )
175
+ self.load_state_dict(state_dict, strict=True)
176
+ # self.to(device)
177
+
178
+ def forward(self, x, mask=None):
179
+ midi, bound = self.midi_conform(x, mask)
180
+
181
+ return midi, bound
182
+
183
+ def postprocess(self, midi, bounds, with_expand=False):
184
+ probs = torch.sigmoid(midi)
185
+
186
+ bound_probs = torch.sigmoid(bounds)
187
+ bound_probs = torch.squeeze(bound_probs, -1)
188
+
189
+ masks = torch.ones_like(bound_probs).bool()
190
+ # Avoid in-place ops on tensors needed for autograd (outputs of SigmoidBackward)
191
+ probs = probs * masks[..., None]
192
+ bound_probs = bound_probs * masks
193
+ unit2note_pred = decode_bounds_to_alignment(bound_probs) * masks
194
+ midi_pred, rest_pred = decode_gaussian_blurred_probs(
195
+ probs,
196
+ vmin=self.midi_min,
197
+ vmax=self.midi_max,
198
+ deviation=self.midi_deviation,
199
+ threshold=self.rest_threshold,
200
+ )
201
+ note_midi_pred, note_dur_pred, note_mask_pred = decode_note_sequence(
202
+ unit2note_pred, midi_pred, ~rest_pred & masks
203
+ )
204
+ if not with_expand:
205
+ return note_midi_pred, note_dur_pred
206
+
207
+ note_midi_expand, _ = expand_batch_padded(note_midi_pred, note_dur_pred)
208
+ return note_midi_expand, None
src/YingMusicSinger/models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .dit import DiT
src/YingMusicSinger/models/dit.py ADDED
@@ -0,0 +1,472 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+ from x_transformers.x_transformers import RotaryEmbedding
16
+
17
+ from src.YingMusicSinger.models.modules import (
18
+ AdaLayerNorm_Final,
19
+ ConvNeXtV2Block,
20
+ ConvPositionEmbedding,
21
+ DiTBlock,
22
+ TimestepGuidanceEmbedding,
23
+ get_pos_embed_indices,
24
+ precompute_freqs_cis,
25
+ )
26
+
27
+
28
+ # Text embedding
29
+
30
+
31
+ class TextEmbedding(nn.Module):
32
+ def __init__(
33
+ self,
34
+ text_num_embeds,
35
+ text_dim,
36
+ mask_padding=False,
37
+ average_upsampling=False,
38
+ conv_layers=0,
39
+ conv_mult=2,
40
+ ):
41
+ super().__init__()
42
+ self.text_embed = nn.Embedding(
43
+ text_num_embeds + 1, text_dim
44
+ ) # index 0 reserved as filler token
45
+
46
+ self.mask_padding = mask_padding
47
+ self.average_upsampling = average_upsampling # ZipVoice-style late average upsampling (after text encoder)
48
+ if average_upsampling:
49
+ assert mask_padding, (
50
+ "text_embedding_average_upsampling requires text_mask_padding to be True"
51
+ )
52
+
53
+ if conv_layers > 0:
54
+ self.extra_modeling = True
55
+ self.precompute_max_pos = 4096 # ~44s of 24kHz audio
56
+ self.register_buffer(
57
+ "freqs_cis",
58
+ precompute_freqs_cis(text_dim, self.precompute_max_pos),
59
+ persistent=False,
60
+ )
61
+ self.text_blocks = nn.Sequential(
62
+ *[
63
+ ConvNeXtV2Block(text_dim, text_dim * conv_mult)
64
+ for _ in range(conv_layers)
65
+ ]
66
+ )
67
+ else:
68
+ self.extra_modeling = False
69
+
70
+ print(
71
+ f"[info] TextEmbedding: mask_padding={mask_padding}, average_upsampling={average_upsampling}, conv_layers={conv_layers}"
72
+ )
73
+
74
+ def average_upsample_text_by_mask(self, text, text_mask, audio_mask):
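+ # ZipVoice-style late upsampling: evenly repeat the valid (non-padding) text embeddings of
+ # each sample so they span its audio length; trailing tokens receive one extra repeat when
+ # the audio length is not an exact multiple of the number of valid tokens.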
75
+ batch, text_len, text_dim = text.shape
76
+
77
+ if audio_mask is None:
78
+ audio_mask = torch.ones_like(text_mask, dtype=torch.bool)
79
+ valid_mask = audio_mask & text_mask
80
+ audio_lens = audio_mask.sum(dim=1) # [batch]
81
+ valid_lens = valid_mask.sum(dim=1) # [batch]
82
+
83
+ upsampled_text = torch.zeros_like(text)
84
+
85
+ for i in range(batch):
86
+ audio_len = audio_lens[i].item()
87
+ valid_len = valid_lens[i].item()
88
+
89
+ if valid_len == 0:
90
+ continue
91
+
92
+ valid_ind = torch.where(valid_mask[i])[0]
93
+ valid_data = text[i, valid_ind, :] # [valid_len, text_dim]
94
+
95
+ base_repeat = audio_len // valid_len
96
+ remainder = audio_len % valid_len
97
+
98
+ indices = []
99
+ for j in range(valid_len):
100
+ repeat_count = base_repeat + (1 if j >= valid_len - remainder else 0)
101
+ indices.extend([j] * repeat_count)
102
+
103
+ indices = torch.tensor(
104
+ indices[:audio_len], device=text.device, dtype=torch.long
105
+ )
106
+ upsampled = valid_data[indices] # [audio_len, text_dim]
107
+
108
+ upsampled_text[i, :audio_len, :] = upsampled
109
+
110
+ return upsampled_text
111
+
112
+ def forward(
113
+ self,
114
+ text: int["b nt"],
115
+ seq_len,
116
+ drop_text=False,
117
+ audio_mask: bool["b n"] | None = None,
118
+ ): # noqa: F722
119
+ # Text tokens start from 0; shift by 1 so that 0 is never a valid token
120
+ text = text + 1
121
+ # Note: 1 is used as the PAD token
122
+ text = text[
123
+ :, :seq_len
124
+ ] # Truncate if text tokens exceed mel spectrogram length
125
+ batch, text_len = text.shape[0], text.shape[1]
126
+ text = F.pad(text, (0, seq_len - text_len), value=1)
127
+
128
+ if self.mask_padding:
129
+ text_mask = text == 1
130
+ else:
131
+ text_mask = torch.zeros(
132
+ (batch, seq_len), device=text.device, dtype=torch.bool
133
+ )
134
+
135
+ if drop_text: # CFG for text
136
+ text = torch.zeros_like(text)
137
+
138
+ text = self.text_embed(text) # b n -> b n d
139
+
140
+ # Optional extra modeling
141
+ if self.extra_modeling:
142
+ # Sinusoidal positional embedding
143
+ batch_start = torch.zeros((batch,), device=text.device, dtype=torch.long)
144
+ pos_idx = get_pos_embed_indices(
145
+ batch_start, seq_len, max_pos=self.precompute_max_pos
146
+ )
147
+ text_pos_embed = self.freqs_cis[pos_idx]
148
+ text = text + text_pos_embed
149
+
150
+ # ConvNeXtV2 blocks
151
+ if self.mask_padding:
152
+ text = text.masked_fill(
153
+ text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
154
+ )
155
+ for block in self.text_blocks:
156
+ text = block(text)
157
+ text = text.masked_fill(
158
+ text_mask.unsqueeze(-1).expand(-1, -1, text.size(-1)), 0.0
159
+ )
160
+ else:
161
+ text = self.text_blocks(text)
162
+
163
+ if self.average_upsampling:
164
+ text = self.average_upsample_text_by_mask(text, ~text_mask, audio_mask)
165
+
166
+ return text, text_mask
167
+
168
+
169
+ # Noised input audio and context mixing embedding
170
+
171
+
172
+ class InputEmbedding(nn.Module):
173
+ def __init__(self, mel_dim, text_dim, out_dim, midi_dim=128):
174
+ super().__init__()
175
+ self.proj = nn.Linear(mel_dim * 2 + text_dim + midi_dim, out_dim)
176
+ self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
177
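+ # Melody feature projection; zero-initialized in DiT.initialize_weights so the melody branch starts as a no-op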
+ self.midi_proj = nn.Linear(128, 128)
178
+
179
+ def forward(
180
+ self,
181
+ x: float["b n d"], # noqa: F722
182
+ cond: float["b n d"], # noqa: F722
183
+ text_embed: float["b n d"], # noqa: F722
184
+ midi,
185
+ drop_audio_cond=False,
186
+ drop_midi=False,
187
+ ):
188
+ if drop_audio_cond: # CFG for conditioning audio
189
+ cond = torch.zeros_like(cond)
190
+
191
+ midi = self.midi_proj(midi)
192
+
193
+ if drop_midi: # CFG for melody
194
+ midi = torch.zeros_like(midi)
195
+
196
+ x = self.proj(torch.cat((x, cond, text_embed, midi), dim=-1))
197
+ x = self.conv_pos_embed(x) + x
198
+ return x
199
+
200
+
201
+ # Transformer backbone using DiT blocks
202
+
203
+
204
+ class DiT(nn.Module):
205
+ def __init__(
206
+ self,
207
+ *,
208
+ dim,
209
+ depth=8,
210
+ heads=8,
211
+ dim_head=64,
212
+ dropout=0.1,
213
+ ff_mult=4,
214
+ mel_dim=100,
215
+ text_num_embeds=256,
216
+ text_dim=None,
217
+ n_f0_bins=512,
218
+ text_mask_padding=True,
219
+ text_embedding_average_upsampling=False,
220
+ qk_norm=None,
221
+ conv_layers=0,
222
+ pe_attn_head=None,
223
+ attn_backend="torch", # "torch" | "flash_attn"
224
+ attn_mask_enabled=False,
225
+ long_skip_connection=False,
226
+ checkpoint_activations=False,
227
+ use_guidance_scale_embed: bool = False,
228
+ guidance_scale_embed_dim: int = 192,
229
+ ):
230
+ super().__init__()
231
+
232
+ self.time_embed = TimestepGuidanceEmbedding(
233
+ dim,
234
+ use_guidance_scale_embed=use_guidance_scale_embed,
235
+ guidance_scale_embed_dim=guidance_scale_embed_dim,
236
+ )
237
+ if text_dim is None:
238
+ text_dim = mel_dim
239
+ self.text_embed_p = TextEmbedding(
240
+ text_num_embeds,
241
+ text_dim,
242
+ mask_padding=text_mask_padding,
243
+ average_upsampling=text_embedding_average_upsampling,
244
+ conv_layers=conv_layers,
245
+ )
246
+ self.text_cond, self.text_uncond = None, None # text cache
247
+ self.input_embed_with_midi = InputEmbedding(mel_dim, text_dim, dim)
248
+
249
+ self.rotary_embed = RotaryEmbedding(dim_head)
250
+ self.use_guidance_scale_embed = use_guidance_scale_embed
251
+
252
+ self.dim = dim
253
+ self.depth = depth
254
+
255
+ self.transformer_blocks = nn.ModuleList(
256
+ [
257
+ DiTBlock(
258
+ dim=dim,
259
+ heads=heads,
260
+ dim_head=dim_head,
261
+ ff_mult=ff_mult,
262
+ dropout=dropout,
263
+ qk_norm=qk_norm,
264
+ pe_attn_head=pe_attn_head,
265
+ attn_backend=attn_backend,
266
+ attn_mask_enabled=attn_mask_enabled,
267
+ )
268
+ for _ in range(depth)
269
+ ]
270
+ )
271
+ self.long_skip_connection = (
272
+ nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
273
+ )
274
+
275
+ self.norm_out = AdaLayerNorm_Final(dim) # Final modulation
276
+ self.proj_out = nn.Linear(dim, mel_dim)
277
+
278
+ self.checkpoint_activations = checkpoint_activations
279
+
280
+ self.initialize_weights()
281
+
282
+ def initialize_weights(self):
283
+ # Zero-out AdaLN layers in DiT blocks
284
+ for block in self.transformer_blocks:
285
+ nn.init.constant_(block.attn_norm.linear.weight, 0)
286
+ nn.init.constant_(block.attn_norm.linear.bias, 0)
287
+
288
+ # Zero-out output layers
289
+ nn.init.constant_(self.norm_out.linear.weight, 0)
290
+ nn.init.constant_(self.norm_out.linear.bias, 0)
291
+ nn.init.constant_(self.proj_out.weight, 0)
292
+ nn.init.constant_(self.proj_out.bias, 0)
293
+
294
+ nn.init.zeros_(self.input_embed_with_midi.midi_proj.weight)
295
+ nn.init.zeros_(self.input_embed_with_midi.midi_proj.bias)
296
+
297
+ def ckpt_wrapper(self, module):
298
+ # Ref: https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
299
+ def ckpt_forward(*inputs):
300
+ outputs = module(*inputs)
301
+ return outputs
302
+
303
+ return ckpt_forward
304
+
305
+ def get_input_embed(
306
+ self,
307
+ x, # b n d
308
+ cond, # b n d
309
+ text, # b nt
310
+ midi, # b n
311
+ drop_audio_cond: bool = False,
312
+ drop_text: bool = False,
313
+ drop_midi: bool = False,
314
+ cache: bool = True,
315
+ audio_mask: bool["b n"] | None = None, # noqa: F722
316
+ ):
317
+ seq_len = x.shape[1]
318
+
319
+ if cache:
320
+ if drop_text:
321
+ if self.text_uncond is None:
322
+ self.text_uncond, _ = self.text_embed_p(
323
+ text, seq_len, drop_text=True, audio_mask=audio_mask
324
+ )
325
+ text_embed = self.text_uncond
326
+ else:
327
+ if self.text_cond is None:
328
+ self.text_cond, _ = self.text_embed_p(
329
+ text, seq_len, drop_text=False, audio_mask=audio_mask
330
+ )
331
+ text_embed = self.text_cond
332
+ else:
333
+ text_embed, text_mask = self.text_embed_p(
334
+ text, seq_len, drop_text=drop_text, audio_mask=audio_mask
335
+ )
336
+
337
+ if midi is None:
338
+ midi = torch.zeros(
339
+ (x.size(0), x.size(1)), device=x.device, dtype=torch.long
340
+ )
341
+
342
+ x = self.input_embed_with_midi(
343
+ x,
344
+ cond,
345
+ text_embed,
346
+ midi,
347
+ drop_audio_cond=drop_audio_cond,
348
+ drop_midi=drop_midi,
349
+ )
350
+
351
+ return x, None
352
+
353
+ def clear_cache(self):
354
+ self.text_cond, self.text_uncond = None, None
355
+
356
+ def forward(
357
+ self,
358
+ x: float["b n d"], # Noised input audio # noqa: F722
359
+ cond: float["b n d"], # Masked conditioning audio # noqa: F722
360
+ text: int["b nt"], # Text tokens # noqa: F722
361
+ time: float["b"] | float[""], # Timestep # noqa: F821 F722
362
+ midi: float["b n"] | None = None, # Melody latent # noqa: F722
363
+ mask: bool["b n"] | None = None, # noqa: F722
364
+ drop_audio_cond: bool = False, # CFG for conditioning audio
365
+ drop_text: bool = False, # CFG for text
366
+ drop_midi: bool = False, # CFG for melody
367
+ cfg_infer: bool = False, # CFG inference: pack cond & uncond forward
368
+ cache: bool = False,
369
+ guidance_scale=None,
370
+ cfg_infer_ids=None, # tuple(bool): (x_cond, x_uncond, x_uncond_cc, x_drop_all_cond)
371
+ ):
372
+ batch, seq_len = x.shape[0], x.shape[1]
373
+ if time.ndim == 0:
374
+ time = time.repeat(batch)
375
+
376
+ # Timestep embedding (with optional distillation guidance scale)
377
+ t = self.time_embed(time, guidance_scale=guidance_scale)
378
+
379
+ if cfg_infer: # Pack cond & uncond forward: b n d -> Kb n d
380
+ x_cond, x_uncond, x_uncond_cc, x_drop_all_cond = None, None, None, None
381
+ if cfg_infer_ids is None or cfg_infer_ids[0]:
382
+ x_cond, _ = self.get_input_embed(
383
+ x,
384
+ cond,
385
+ text,
386
+ midi,
387
+ drop_audio_cond=False,
388
+ drop_text=False,
389
+ drop_midi=False,
390
+ cache=cache,
391
+ audio_mask=mask,
392
+ )
393
+ if cfg_infer_ids is None or cfg_infer_ids[1]:
394
+ x_uncond, _ = self.get_input_embed(
395
+ x,
396
+ cond,
397
+ text,
398
+ midi,
399
+ drop_audio_cond=True,
400
+ drop_text=False,
401
+ drop_midi=False,
402
+ cache=cache,
403
+ audio_mask=mask,
404
+ )
405
+ if cfg_infer_ids is None or cfg_infer_ids[2]:
406
+ x_uncond_cc, _ = self.get_input_embed(
407
+ x,
408
+ cond,
409
+ text,
410
+ midi,
411
+ drop_audio_cond=False,
412
+ drop_text=True,
413
+ drop_midi=True,
414
+ cache=cache,
415
+ audio_mask=mask,
416
+ )
417
+ if cfg_infer_ids is None or cfg_infer_ids[3]:
418
+ x_drop_all_cond, _ = self.get_input_embed(
419
+ x,
420
+ cond,
421
+ text,
422
+ midi,
423
+ drop_audio_cond=True,
424
+ drop_text=True,
425
+ drop_midi=True,
426
+ cache=cache,
427
+ audio_mask=mask,
428
+ )
429
+
430
+ # Concatenate only non-None tensors
431
+ x_list = [
432
+ xi
433
+ for xi in [x_cond, x_uncond, x_uncond_cc, x_drop_all_cond]
434
+ if xi is not None
435
+ ]
436
+ x = torch.cat(x_list, dim=0)
437
+ t = torch.cat([t] * len(x_list), dim=0)
438
+ mask = torch.cat([mask] * len(x_list), dim=0) if mask is not None else None
439
+ else:
440
+ x, text_inner_sim_matrix = self.get_input_embed(
441
+ x,
442
+ cond,
443
+ text,
444
+ midi,
445
+ drop_audio_cond=drop_audio_cond,
446
+ drop_text=drop_text,
447
+ drop_midi=drop_midi,
448
+ cache=cache,
449
+ audio_mask=mask,
450
+ )
451
+
452
+ rope = self.rotary_embed.forward_from_seq_len(seq_len)
453
+
454
+ if self.long_skip_connection is not None:
455
+ residual = x
456
+
457
+ # Mask is all zeros during inference
458
+ for block in self.transformer_blocks:
459
+ if self.checkpoint_activations:
460
+ x = torch.utils.checkpoint.checkpoint(
461
+ self.ckpt_wrapper(block), x, t, mask, rope, use_reentrant=False
462
+ )
463
+ else:
464
+ x = block(x, t, mask=mask, rope=rope)
465
+
466
+ if self.long_skip_connection is not None:
467
+ x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
468
+
469
+ x = self.norm_out(x, t)
470
+ output = self.proj_out(x)
471
+
472
+ return output, text_inner_sim_matrix if not cfg_infer else None
src/YingMusicSinger/models/model.py ADDED
@@ -0,0 +1,423 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Callable
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from torch.nn.utils.rnn import pad_sequence
9
+ from torchdiffeq import odeint
10
+
11
+ from src.YingMusicSinger.melody.midi_extractor import MIDIExtractor
12
+ from src.YingMusicSinger.utils.common import (
13
+ default,
14
+ exists,
15
+ get_epss_timesteps,
16
+ lens_to_mask,
17
+ )
18
+
19
+
20
+ def interpolation_midi_continuous(midi_p, bound_p, total_len):
21
+ """Temporally interpolate 3D melody latent to match target length."""
22
+ if midi_p.shape[1] != total_len:
23
+ midi = (
24
+ F.interpolate(
25
+ midi_p.clone().detach().transpose(1, 2),
26
+ size=total_len,
27
+ mode="linear",
28
+ align_corners=False,
29
+ )
30
+ .transpose(1, 2)
31
+ .clone()
32
+ .detach()
33
+ )
34
+ if bound_p is not None:
35
+ midi_bound = (
36
+ F.interpolate(
37
+ bound_p.clone().detach().transpose(1, 2),
38
+ size=total_len,
39
+ mode="linear",
40
+ align_corners=False,
41
+ )
42
+ .transpose(1, 2)
43
+ .clone()
44
+ .detach()
45
+ )
46
+ else:
47
+ midi = midi_p.clone().detach()
48
+ if bound_p is not None:
49
+ midi_bound = bound_p.clone().detach()
50
+ if bound_p is not None:
51
+ return midi, midi_bound
52
+ else:
53
+ return midi
54
+
55
+
56
+ def interpolation_midi_continuous_2_dim(midi_p, bound_p, total_len):
57
+ """Temporally interpolate 2D melody latent to match target length."""
58
+ assert len(midi_p.shape) == 2
59
+
60
+ if midi_p.shape[1] != total_len:
61
+ midi = (
62
+ F.interpolate(
63
+ midi_p.unsqueeze(2).clone().detach().transpose(1, 2),
64
+ size=total_len,
65
+ mode="linear",
66
+ align_corners=False,
67
+ )
68
+ .transpose(1, 2)
69
+ .clone()
70
+ .detach()
71
+ )
72
+ if bound_p is not None:
73
+ midi_bound = (
74
+ F.interpolate(
75
+ bound_p.unsqueeze(2).clone().detach().transpose(1, 2),
76
+ size=total_len,
77
+ mode="linear",
78
+ align_corners=False,
79
+ )
80
+ .transpose(1, 2)
81
+ .clone()
82
+ .detach()
83
+ )
84
+ else:
85
+ midi = midi_p.clone().detach()
86
+ if bound_p is not None:
87
+ midi_bound = bound_p.clone().detach()
88
+ if bound_p is not None:
89
+ return midi.squeeze(2), midi_bound.squeeze(2)
90
+ else:
91
+ return midi.squeeze(2)
92
+
93
+
94
+ class Singer(nn.Module):
95
+ def __init__(
96
+ self,
97
+ transformer: nn.Module,
98
+ is_tts_pretrain,
99
+ melody_input_source,
100
+ cka_disabled,
101
+ distill_stage,
102
+ use_guidance_scale_embed,
103
+ sigma=0.0,
104
+ odeint_kwargs: dict = dict(method="euler"),
105
+ audio_drop_prob=0.3,
106
+ cond_drop_prob=0.2,
107
+ num_channels=None,
108
+ mel_spec_module: nn.Module | None = None,
109
+ mel_spec_kwargs: dict = dict(),
110
+ frac_lengths_mask: tuple[float, float] = (0.7, 1.0),
111
+ extra_parameters=None,
112
+ ):
113
+ super().__init__()
114
+
115
+ self.is_tts_pretrain = is_tts_pretrain
116
+
117
+ if distill_stage is None:
118
+ self.distill_stage = 0
119
+ else:
120
+ self.distill_stage = int(distill_stage)
121
+
122
+ self.use_guidance_scale_embed = use_guidance_scale_embed
123
+
124
+ assert melody_input_source in {
125
+ "student_model",
126
+ "some_pretrain",
127
+ "some_pretrain_fuzzdisturb",
128
+ "some_pretrain_postprocess_embedding",
129
+ "none",
130
+ }
131
+ from src.YingMusicSinger.melody.SmoothMelody import MIDIFuzzDisturb
132
+
133
+ if melody_input_source == "some_pretrain_fuzzdisturb":
134
+ self.smoothMelody_MIDIFuzzDisturb = MIDIFuzzDisturb(
135
+ dim=extra_parameters.some_pretrain_fuzzdisturb.dim,
136
+ drop_prob=extra_parameters.some_pretrain_fuzzdisturb.drop_prob,
137
+ noise_scale=extra_parameters.some_pretrain_fuzzdisturb.noise_scale,
138
+ blur_kernel=extra_parameters.some_pretrain_fuzzdisturb.blur_kernel,
139
+ drop_type=extra_parameters.some_pretrain_fuzzdisturb.drop_type,
140
+ )
141
+ from src.YingMusicSinger.melody.SmoothMelody import MIDIDigitalEmbedding
142
+
143
+ if melody_input_source == "some_pretrain_postprocess_embedding":
144
+ self.smoothMelody_MIDIDigitalEmbedding = MIDIDigitalEmbedding(
145
+ embed_dim=extra_parameters.some_pretrain_postprocess_embedding.embed_dim,
146
+ num_classes=extra_parameters.some_pretrain_postprocess_embedding.num_classes,
147
+ mark_distinguish_scale=extra_parameters.some_pretrain_postprocess_embedding.mark_distinguish_scale,
148
+ )
149
+
150
+ self.melody_input_source = melody_input_source
151
+ self.cka_disabled = cka_disabled
152
+
153
+ self.frac_lengths_mask = frac_lengths_mask
154
+
155
+ num_channels = default(num_channels, mel_spec_kwargs.n_mel_channels)
156
+ self.num_channels = num_channels
157
+
158
+ # Classifier-free guidance drop probabilities
159
+ self.audio_drop_prob = audio_drop_prob
160
+ self.cond_drop_prob = cond_drop_prob
161
+
162
+ # Transformer backbone
163
+ self.transformer = transformer
164
+ dim = transformer.dim
165
+ self.dim = dim
166
+
167
+ # Conditional flow matching
168
+ self.sigma = sigma
169
+ self.odeint_kwargs = odeint_kwargs
170
+
171
+ # Melody extractor
172
+ self.midi_extractor = MIDIExtractor(in_dim=num_channels)
173
+
174
+ @property
175
+ def device(self):
176
+ return next(self.parameters()).device
177
+
178
+ @torch.no_grad()
179
+ def sample(
180
+ self,
181
+ cond: float["b n d"] | float["b nw"], # noqa: F722
182
+ text: int["b nt"] | list[str], # noqa: F722
183
+ duration: int | int["b"] | None = None, # noqa: F821
184
+ *,
185
+ midi_in: float["b n d"] | None = None,
186
+ lens: int["b"] | None = None, # noqa: F821
187
+ steps=32,
188
+ cfg_strength=1.0,
189
+ sway_sampling_coef=None,
190
+ seed: int | None = None,
191
+ max_duration=4096, # Maximum total length (including ICL prompt), ~190s
192
+ vocoder: Callable[[float["b d n"]], float["b nw"]] | None = None, # noqa: F722
193
+ use_epss=False,
194
+ no_ref_audio=False,
195
+ duplicate_test=False,
196
+ t_inter=0.1,
197
+ t_shift=1.0, # Sampling timestep shift (ZipVoice-style)
198
+ guidance_scale=None,
199
+ edit_mask=None,
200
+ midi_p=None,
201
+ bound_p=None,
202
+ enable_melody_control=True,
203
+ ):
204
+ self.eval()
205
+
206
+ assert isinstance(cond, torch.Tensor)
207
+ assert not edit_mask, "edit_mask is not supported in this mode"
208
+ assert not duplicate_test, "duplicate_test is not supported in this mode"
209
+
210
+ if self.melody_input_source == "student_model":
211
+ assert midi_p is None and bound_p is None
212
+ elif self.melody_input_source in {
213
+ "some_pretrain",
214
+ "some_pretrain_fuzzdisturb",
215
+ "some_pretrain_postprocess_embedding",
216
+ }:
217
+ assert midi_p is not None and bound_p is not None
218
+ elif self.melody_input_source == "none":
219
+ assert midi_p is None and bound_p is None
220
+ else:
221
+ raise ValueError(
222
+ f"Unsupported melody_input_source: {self.melody_input_source}"
223
+ )
224
+
225
+ # duration is the total latent sequence length
226
+ assert duration
227
+
228
+ cond = cond.to(next(self.parameters()).dtype)
229
+
230
+ # Extract or interpolate melody representation
231
+ if self.melody_input_source == "student_model":
232
+ midi, midi_bound = self.midi_extractor(midi_in)
233
+
234
+ elif self.melody_input_source == "some_pretrain":
235
+ midi, midi_bound = interpolation_midi_continuous(
236
+ midi_p=midi_p, bound_p=bound_p, total_len=text.shape[1]
237
+ )
238
+ elif self.melody_input_source == "some_pretrain_fuzzdisturb":
239
+ midi, midi_bound = interpolation_midi_continuous(
240
+ midi_p=midi_p, bound_p=bound_p, total_len=text.shape[1]
241
+ )
242
+ midi = self.smoothMelody_MIDIFuzzDisturb(midi)
243
+
244
+ elif self.melody_input_source == "some_pretrain_postprocess_embedding":
245
+ midi_after_postprocess, _ = self.midi_extractor.postprocess(
246
+ midi=midi_p, bounds=bound_p, with_expand=True
247
+ )
248
+ midi = interpolation_midi_continuous_2_dim(
249
+ midi_p=midi_after_postprocess, bound_p=None, total_len=text.shape[1]
250
+ )
251
+ midi = self.smoothMelody_MIDIDigitalEmbedding(midi)
252
+ midi_bound = None
253
+
254
+ elif self.melody_input_source == "none":
255
+ midi = torch.zeros(
256
+ text.shape[0], text.shape[1], 128, dtype=cond.dtype, device=text.device
257
+ )
258
+ midi_bound = None
259
+ else:
260
+ raise NotImplementedError()
261
+
262
+ batch, cond_seq_len, device = *cond.shape[:2], cond.device
263
+ if not exists(lens):
264
+ lens = torch.full((batch,), cond_seq_len, device=device, dtype=torch.long)
265
+
266
+ assert isinstance(text, torch.Tensor)
267
+
268
+ cond_mask = lens_to_mask(lens)
269
+
270
+ if edit_mask is not None:
271
+ cond_mask = cond_mask & edit_mask
272
+
273
+ if isinstance(duration, int):
274
+ duration = torch.full((batch,), duration, device=device, dtype=torch.long)
275
+
276
+ # Duration must be at least max(text_len, audio_prompt_len) + 1
277
+ duration = torch.maximum(
278
+ torch.maximum((text != 0).sum(dim=-1), lens) + 1, duration
279
+ )
280
+ duration = duration.clamp(max=max_duration)
281
+
282
+ max_duration = duration.amax()
283
+
284
+ # Duplicate test: interpolate between noise and conditioning
285
+ if duplicate_test:
286
+ test_cond = F.pad(
287
+ cond, (0, 0, cond_seq_len, max_duration - 2 * cond_seq_len), value=0.0
288
+ )
289
+
290
+ # Zero-pad conditioning latent to max_duration
291
+ cond = F.pad(cond, (0, 0, 0, max_duration - cond_seq_len), value=0.0)
292
+
293
+ if no_ref_audio:
294
+ cond = torch.zeros_like(cond)
295
+
296
+ cond_mask = F.pad(
297
+ cond_mask, (0, max_duration - cond_mask.shape[-1]), value=False
298
+ )
299
+ cond_mask = cond_mask.unsqueeze(-1)
300
+ step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
301
+
302
+ assert max_duration == midi.shape[1]
303
+
304
+ # Zero out melody in prompt region; optionally disable melody control entirely
305
+ if enable_melody_control:
306
+ midi = torch.where(cond_mask, torch.zeros_like(midi), midi)
307
+ else:
308
+ midi = torch.zeros_like(midi)
309
+
310
+ if self.is_tts_pretrain:
311
+ midi = torch.zeros_like(midi)
312
+
313
+ # For batched inference, explicit mask prevents causal attention fallback
314
+ if batch > 1:
315
+ mask = lens_to_mask(duration)
316
+ else:
317
+ mask = None
318
+
319
+ # ODE velocity function
320
+ def fn(t, x):
321
+ if cfg_strength < 1e-5:
322
+ # No classifier-free guidance
323
+ pred, _ = self.transformer(
324
+ x=x,
325
+ cond=step_cond,
326
+ text=text,
327
+ midi=midi,
328
+ time=t,
329
+ mask=mask,
330
+ drop_audio_cond=False,
331
+ drop_text=False,
332
+ drop_midi=not enable_melody_control,
333
+ cache=False,
334
+ )
335
+ return pred
336
+ else:
337
+ if self.use_guidance_scale_embed:
338
+ # Distilled model with built-in CFG
339
+ assert enable_melody_control
340
+ pred_cfg, _ = self.transformer(
341
+ x=x,
342
+ cond=step_cond,
343
+ text=text,
344
+ midi=midi,
345
+ time=t,
346
+ mask=mask,
347
+ drop_audio_cond=False,
348
+ drop_text=False,
349
+ drop_midi=not enable_melody_control,
350
+ cache=False,
351
+ guidance_scale=torch.tensor([guidance_scale], device=device),
352
+ )
353
+ print(
354
+ f"CFG 参数调节无作用! 蒸馏之后的,输入CFG为 guidance_scale={guidance_scale}"
355
+ )
356
+ return pred_cfg
357
+ else:
358
+ # Standard CFG: cond + uncond forward
359
+ # BUG If enable_melody_control is False, there might be a slight issue here
360
+ assert guidance_scale is not None
361
+ pred_cfg, _ = self.transformer(
362
+ x=x,
363
+ cond=step_cond,
364
+ text=text,
365
+ midi=midi,
366
+ time=t,
367
+ mask=mask,
368
+ cfg_infer=True,
369
+ cache=False,
370
+ cfg_infer_ids=(True, False, False, True),
371
+ )
372
+
373
+ pred, pred_drop_all_cond = torch.chunk(pred_cfg, 2, dim=0)
374
+ return pred + (pred - pred_drop_all_cond) * float(guidance_scale)
375
+
376
+ # Generate initial noise (per-sample seeding for batch consistency)
377
+ y0 = []
378
+ for dur in duration:
379
+ if exists(seed):
380
+ torch.manual_seed(seed)
381
+ y0.append(
382
+ torch.randn(
383
+ dur, self.num_channels, device=self.device, dtype=step_cond.dtype
384
+ )
385
+ )
386
+ y0 = pad_sequence(y0, padding_value=0, batch_first=True)
387
+
388
+ t_start = 0
389
+
390
+ if duplicate_test:
391
+ t_start = t_inter
392
+ y0 = (1 - t_start) * y0 + t_start * test_cond
393
+ steps = int(steps * (1 - t_start))
394
+
395
+ # Build timestep schedule
396
+ assert not use_epss and sway_sampling_coef is None, (
397
+ "Use timestep shift instead of the strategy in F5"
398
+ )
399
+ if t_start == 0 and use_epss:
400
+ # Empirically Pruned Step Sampling for low NFE
401
+ t = get_epss_timesteps(steps, device=self.device, dtype=step_cond.dtype)
402
+ else:
403
+ t = torch.linspace(
404
+ t_start, 1, steps + 1, device=self.device, dtype=step_cond.dtype
405
+ )
406
+
407
+ if sway_sampling_coef is not None:
408
+ t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
409
+
410
+ # Apply timestep shift
411
+ t = t_shift * t / (1 + (t_shift - 1) * t)
412
+
413
+ trajectory = odeint(fn, y0, t, **self.odeint_kwargs)
414
+ self.transformer.clear_cache()
415
+
416
+ sampled = trajectory[-1]
417
+ out = sampled
418
+
419
+ if exists(vocoder):
420
+ out = out.permute(0, 2, 1)
421
+ out = vocoder(out)
422
+
423
+ return out, trajectory
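To make the sampling loop above easier to follow, here is a small illustrative sketch (not part of the commit) of the ZipVoice-style timestep shift and the classifier-free-guidance combination used inside `fn`; placeholder tensors stand in for the transformer outputs.

```python
import torch

steps, t_shift, guidance_scale = 32, 5.0, 2.0

# Timestep schedule: a uniform grid, then shifted so more steps are spent near t = 0 when t_shift > 1.
t = torch.linspace(0, 1, steps + 1)
t = t_shift * t / (1 + (t_shift - 1) * t)

# Classifier-free guidance: push the conditional prediction away from the drop-all-conditions one.
pred_cond = torch.randn(1, 512, 100)      # placeholder for the conditional velocity
pred_uncond = torch.randn(1, 512, 100)    # placeholder for the unconditional velocity
velocity = pred_cond + (pred_cond - pred_uncond) * guidance_scale
```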
src/YingMusicSinger/models/modules.py ADDED
@@ -0,0 +1,961 @@
1
+ """
2
+ ein notation:
3
+ b - batch
4
+ n - sequence
5
+ nt - text sequence
6
+ nw - raw wave length
7
+ d - dimension
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ from typing import Optional
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import torchaudio
19
+ from librosa.filters import mel as librosa_mel_fn
20
+ from x_transformers.x_transformers import apply_rotary_pos_emb
21
+
22
+ from src.YingMusicSinger.utils.common import is_package_available
23
+
24
+ # raw wav to mel spec
25
+
26
+
27
+ mel_basis_cache = {}
28
+ hann_window_cache = {}
29
+
30
+
31
+ def get_bigvgan_mel_spectrogram(
32
+ waveform,
33
+ n_fft=1024,
34
+ n_mel_channels=100,
35
+ target_sample_rate=24000,
36
+ hop_length=256,
37
+ win_length=1024,
38
+ fmin=0,
39
+ fmax=None,
40
+ center=False,
41
+ ): # Copy from https://github.com/NVIDIA/BigVGAN/tree/main
42
+ device = waveform.device
43
+ key = f"{n_fft}_{n_mel_channels}_{target_sample_rate}_{hop_length}_{win_length}_{fmin}_{fmax}_{device}"
44
+
45
+ if key not in mel_basis_cache:
46
+ mel = librosa_mel_fn(
47
+ sr=target_sample_rate,
48
+ n_fft=n_fft,
49
+ n_mels=n_mel_channels,
50
+ fmin=fmin,
51
+ fmax=fmax,
52
+ )
53
+ mel_basis_cache[key] = (
54
+ torch.from_numpy(mel).float().to(device)
55
+ ) # TODO: why do they need .float()?
56
+ hann_window_cache[key] = torch.hann_window(win_length).to(device)
57
+
58
+ mel_basis = mel_basis_cache[key]
59
+ hann_window = hann_window_cache[key]
60
+
61
+ padding = (n_fft - hop_length) // 2
62
+ waveform = torch.nn.functional.pad(
63
+ waveform.unsqueeze(1), (padding, padding), mode="reflect"
64
+ ).squeeze(1)
65
+
66
+ spec = torch.stft(
67
+ waveform,
68
+ n_fft,
69
+ hop_length=hop_length,
70
+ win_length=win_length,
71
+ window=hann_window,
72
+ center=center,
73
+ pad_mode="reflect",
74
+ normalized=False,
75
+ onesided=True,
76
+ return_complex=True,
77
+ )
78
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
79
+
80
+ mel_spec = torch.matmul(mel_basis, spec)
81
+ mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
82
+
83
+ return mel_spec
84
+
85
+
86
+ def get_vocos_mel_spectrogram(
87
+ waveform,
88
+ n_fft=1024,
89
+ n_mel_channels=100,
90
+ target_sample_rate=24000,
91
+ hop_length=256,
92
+ win_length=1024,
93
+ ):
94
+ mel_stft = torchaudio.transforms.MelSpectrogram(
95
+ sample_rate=target_sample_rate,
96
+ n_fft=n_fft,
97
+ win_length=win_length,
98
+ hop_length=hop_length,
99
+ n_mels=n_mel_channels,
100
+ power=1,
101
+ center=True,
102
+ normalized=False,
103
+ norm=None,
104
+ ).to(waveform.device)
105
+ if len(waveform.shape) == 3:
106
+ waveform = waveform.squeeze(1) # 'b 1 nw -> b nw'
107
+
108
+ assert len(waveform.shape) == 2
109
+
110
+ mel = mel_stft(waveform)
111
+ mel = mel.clamp(min=1e-5).log()
112
+ return mel
113
+
114
+
115
+ class MelSpec(nn.Module):
116
+ def __init__(
117
+ self,
118
+ n_fft=1024,
119
+ hop_length=256,
120
+ win_length=1024,
121
+ n_mel_channels=100,
122
+ target_sample_rate=24_000,
123
+ mel_spec_type="vocos",
124
+ ):
125
+ super().__init__()
126
+ assert mel_spec_type in ["vocos", "bigvgan"], (
127
+ "We only support two mel extraction backends: vocos or bigvgan"
128
+ )
129
+
130
+ self.n_fft = n_fft
131
+ self.hop_length = hop_length
132
+ self.win_length = win_length
133
+ self.n_mel_channels = n_mel_channels
134
+ self.target_sample_rate = target_sample_rate
135
+
136
+ if mel_spec_type == "vocos":
137
+ self.extractor = get_vocos_mel_spectrogram
138
+ elif mel_spec_type == "bigvgan":
139
+ self.extractor = get_bigvgan_mel_spectrogram
140
+
141
+ self.register_buffer("dummy", torch.tensor(0), persistent=False)
142
+
143
+ def forward(self, wav):
144
+ if self.dummy.device != wav.device:
145
+ self.to(wav.device)
146
+
147
+ mel = self.extractor(
148
+ waveform=wav,
149
+ n_fft=self.n_fft,
150
+ n_mel_channels=self.n_mel_channels,
151
+ target_sample_rate=self.target_sample_rate,
152
+ hop_length=self.hop_length,
153
+ win_length=self.win_length,
154
+ )
155
+
156
+ return mel
157
+
158
+
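A brief usage sketch of the mel front end above (not part of the commit); the import path assumes this file lands at src/YingMusicSinger/models/modules.py, as in the diff.

```python
import torch
from src.YingMusicSinger.models.modules import MelSpec

mel_spec = MelSpec(n_mel_channels=100, target_sample_rate=24_000, mel_spec_type="vocos")
wav = torch.randn(2, 24_000 * 3)   # 3 s of audio at 24 kHz, shape (b, nw)
mel = mel_spec(wav)                # log-compressed mel, shape (b, n_mel_channels, n_frames)
print(mel.shape)
```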
159
+ # sinusoidal position embedding
160
+
161
+
162
+ class SinusPositionEmbedding(nn.Module):
163
+ def __init__(self, dim):
164
+ super().__init__()
165
+ self.dim = dim
166
+
167
+ def forward(self, x, scale=1000):
168
+ device = x.device
169
+ half_dim = self.dim // 2
170
+ emb = math.log(10000) / (half_dim - 1)
171
+ emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
172
+ emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
173
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
174
+ return emb
175
+
176
+
177
+ # convolutional position embedding
178
+
179
+
180
+ class ConvPositionEmbedding(nn.Module):
181
+ def __init__(self, dim, kernel_size=31, groups=16):
182
+ super().__init__()
183
+ assert kernel_size % 2 != 0
184
+ self.conv1d = nn.Sequential(
185
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
186
+ nn.Mish(),
187
+ nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2),
188
+ nn.Mish(),
189
+ )
190
+
191
+ def forward(self, x: float["b n d"], mask: bool["b n"] | None = None):
192
+ if mask is not None:
193
+ mask = mask[..., None]
194
+ x = x.masked_fill(~mask, 0.0)
195
+
196
+ x = x.permute(0, 2, 1)
197
+ x = self.conv1d(x)
198
+ out = x.permute(0, 2, 1)
199
+
200
+ if mask is not None:
201
+ out = out.masked_fill(~mask, 0.0)
202
+
203
+ return out
204
+
205
+
206
+ # rotary positional embedding related
207
+
208
+
209
+ def precompute_freqs_cis(
210
+ dim: int, end: int, theta: float = 10000.0, theta_rescale_factor=1.0
211
+ ):
212
+ # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
213
+ # has some connection to NTK literature
214
+ # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
215
+ # https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
216
+ theta *= theta_rescale_factor ** (dim / (dim - 2))
217
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
218
+ t = torch.arange(end, device=freqs.device) # type: ignore
219
+ freqs = torch.outer(t, freqs).float() # type: ignore
220
+ freqs_cos = torch.cos(freqs) # real part
221
+ freqs_sin = torch.sin(freqs) # imaginary part
222
+ return torch.cat([freqs_cos, freqs_sin], dim=-1)
223
+
224
+
225
+ def get_pos_embed_indices(start, length, max_pos, scale=1.0):
226
+ # length = length if isinstance(length, int) else length.max()
227
+ scale = scale * torch.ones_like(
228
+ start, dtype=torch.float32
229
+ ) # in case scale is a scalar
230
+ pos = (
231
+ start.unsqueeze(1)
232
+ + (
233
+ torch.arange(length, device=start.device, dtype=torch.float32).unsqueeze(0)
234
+ * scale.unsqueeze(1)
235
+ ).long()
236
+ )
237
+ # avoid extra long error.
238
+ pos = torch.where(pos < max_pos, pos, max_pos - 1)
239
+ return pos
240
+
241
+
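For reference, a small sketch (not part of the commit) of how the rotary tables above are typically consumed: precompute once, then gather per-position rows for each batch element before rotary embedding is applied in the attention processors.

```python
import torch

from src.YingMusicSinger.models.modules import get_pos_embed_indices, precompute_freqs_cis

head_dim, max_pos = 64, 4096
freqs_cis = precompute_freqs_cis(head_dim, max_pos)               # (max_pos, head_dim): [cos | sin] halves

start = torch.zeros(2, dtype=torch.long)                          # per-sample start offsets
pos = get_pos_embed_indices(start, length=100, max_pos=max_pos)   # (2, 100), clamped to max_pos - 1
rope_freqs = freqs_cis[pos]                                       # (2, 100, head_dim) rotary table slice
print(rope_freqs.shape)
```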
242
+ # Global Response Normalization layer (Instance Normalization ?)
243
+
244
+
245
+ class GRN(nn.Module):
246
+ def __init__(self, dim):
247
+ super().__init__()
248
+ self.gamma = nn.Parameter(torch.zeros(1, 1, dim))
249
+ self.beta = nn.Parameter(torch.zeros(1, 1, dim))
250
+
251
+ def forward(self, x):
252
+ Gx = torch.norm(x, p=2, dim=1, keepdim=True)
253
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
254
+ return self.gamma * (x * Nx) + self.beta + x
255
+
256
+
257
+ # ConvNeXt-V2 Block https://github.com/facebookresearch/ConvNeXt-V2/blob/main/models/convnextv2.py
258
+ # ref: https://github.com/bfs18/e2_tts/blob/main/rfwave/modules.py#L108
259
+
260
+
261
+ class ConvNeXtV2Block(nn.Module):
262
+ def __init__(
263
+ self,
264
+ dim: int,
265
+ intermediate_dim: int,
266
+ dilation: int = 1,
267
+ ):
268
+ super().__init__()
269
+ padding = (dilation * (7 - 1)) // 2
270
+ self.dwconv = nn.Conv1d(
271
+ dim, dim, kernel_size=7, padding=padding, groups=dim, dilation=dilation
272
+ ) # depthwise conv
273
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
274
+ self.pwconv1 = nn.Linear(
275
+ dim, intermediate_dim
276
+ ) # pointwise/1x1 convs, implemented with linear layers
277
+ self.act = nn.GELU()
278
+ self.grn = GRN(intermediate_dim)
279
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
280
+
281
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
282
+ residual = x
283
+ x = x.transpose(1, 2) # b n d -> b d n
284
+ x = self.dwconv(x)
285
+ x = x.transpose(1, 2) # b d n -> b n d
286
+ x = self.norm(x)
287
+ x = self.pwconv1(x)
288
+ x = self.act(x)
289
+ x = self.grn(x)
290
+ x = self.pwconv2(x)
291
+ return residual + x
292
+
293
+
294
+ # RMSNorm
295
+
296
+
297
+ class RMSNorm(nn.Module):
298
+ def __init__(self, dim: int, eps: float):
299
+ super().__init__()
300
+ self.eps = eps
301
+ self.weight = nn.Parameter(torch.ones(dim))
302
+ self.native_rms_norm = tuple(int(v) for v in torch.__version__.split(".")[:2]) >= (2, 4)
303
+
304
+ def forward(self, x):
305
+ if self.native_rms_norm:
306
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
307
+ x = x.to(self.weight.dtype)
308
+ x = F.rms_norm(
309
+ x, normalized_shape=(x.shape[-1],), weight=self.weight, eps=self.eps
310
+ )
311
+ else:
312
+ variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True)
313
+ x = x * torch.rsqrt(variance + self.eps)
314
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
315
+ x = x.to(self.weight.dtype)
316
+ x = x * self.weight
317
+
318
+ return x
319
+
320
+
321
+ # AdaLayerNorm
322
+ # return with modulated x for attn input, and params for later mlp modulation
323
+
324
+
325
+ class AdaLayerNorm(nn.Module):
326
+ def __init__(self, dim):
327
+ super().__init__()
328
+
329
+ self.silu = nn.SiLU()
330
+ self.linear = nn.Linear(dim, dim * 6)
331
+
332
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
333
+
334
+ def forward(self, x, emb=None):
335
+ emb = self.linear(self.silu(emb))
336
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = torch.chunk(
337
+ emb, 6, dim=1
338
+ )
339
+
340
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
341
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
342
+
343
+
344
+ # AdaLayerNorm for final layer
345
+ # return only with modulated x for attn input, cuz no more mlp modulation
346
+
347
+
348
+ class AdaLayerNorm_Final(nn.Module):
349
+ def __init__(self, dim):
350
+ super().__init__()
351
+
352
+ self.silu = nn.SiLU()
353
+ self.linear = nn.Linear(dim, dim * 2)
354
+
355
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
356
+
357
+ def forward(self, x, emb):
358
+ emb = self.linear(self.silu(emb))
359
+ scale, shift = torch.chunk(emb, 2, dim=1)
360
+
361
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
362
+ return x
363
+
364
+
365
+ # FeedForward
366
+
367
+
368
+ class FeedForward(nn.Module):
369
+ def __init__(
370
+ self, dim, dim_out=None, mult=4, dropout=0.0, approximate: str = "none"
371
+ ):
372
+ super().__init__()
373
+ inner_dim = int(dim * mult)
374
+ dim_out = dim_out if dim_out is not None else dim
375
+
376
+ activation = nn.GELU(approximate=approximate)
377
+ project_in = nn.Sequential(nn.Linear(dim, inner_dim), activation)
378
+ self.ff = nn.Sequential(
379
+ project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out)
380
+ )
381
+
382
+ def forward(self, x):
383
+ return self.ff(x)
384
+
385
+
386
+ # Attention with possible joint part
387
+ # modified from diffusers/src/diffusers/models/attention_processor.py
388
+
389
+
390
+ class Attention(nn.Module):
391
+ def __init__(
392
+ self,
393
+ processor: JointAttnProcessor | AttnProcessor,
394
+ dim: int,
395
+ heads: int = 8,
396
+ dim_head: int = 64,
397
+ dropout: float = 0.0,
398
+ context_dim: Optional[int] = None, # if not None -> joint attention
399
+ context_pre_only: bool = False,
400
+ qk_norm: Optional[str] = None,
401
+ ):
402
+ super().__init__()
403
+
404
+ if not hasattr(F, "scaled_dot_product_attention"):
405
+ raise ImportError(
406
+ "Attention equires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
407
+ )
408
+
409
+ self.processor = processor
410
+
411
+ self.dim = dim
412
+ self.heads = heads
413
+ self.inner_dim = dim_head * heads
414
+ self.dropout = dropout
415
+
416
+ self.context_dim = context_dim
417
+ self.context_pre_only = context_pre_only
418
+
419
+ self.to_q = nn.Linear(dim, self.inner_dim)
420
+ self.to_k = nn.Linear(dim, self.inner_dim)
421
+ self.to_v = nn.Linear(dim, self.inner_dim)
422
+
423
+ if qk_norm is None:
424
+ self.q_norm = None
425
+ self.k_norm = None
426
+ elif qk_norm == "rms_norm":
427
+ self.q_norm = RMSNorm(dim_head, eps=1e-6)
428
+ self.k_norm = RMSNorm(dim_head, eps=1e-6)
429
+ else:
430
+ raise ValueError(f"Unimplemented qk_norm: {qk_norm}")
431
+
432
+ if self.context_dim is not None:
433
+ self.to_q_c = nn.Linear(context_dim, self.inner_dim)
434
+ self.to_k_c = nn.Linear(context_dim, self.inner_dim)
435
+ self.to_v_c = nn.Linear(context_dim, self.inner_dim)
436
+ if qk_norm is None:
437
+ self.c_q_norm = None
438
+ self.c_k_norm = None
439
+ elif qk_norm == "rms_norm":
440
+ self.c_q_norm = RMSNorm(dim_head, eps=1e-6)
441
+ self.c_k_norm = RMSNorm(dim_head, eps=1e-6)
442
+
443
+ self.to_out = nn.ModuleList([])
444
+ self.to_out.append(nn.Linear(self.inner_dim, dim))
445
+ self.to_out.append(nn.Dropout(dropout))
446
+
447
+ if self.context_dim is not None and not self.context_pre_only:
448
+ self.to_out_c = nn.Linear(self.inner_dim, context_dim)
449
+
450
+ def forward(
451
+ self,
452
+ x: float["b n d"], # noised input x
453
+ c: float["b n d"] = None, # context c
454
+ mask: bool["b n"] | None = None,
455
+ rope=None, # rotary position embedding for x
456
+ c_rope=None, # rotary position embedding for c
457
+ ) -> torch.Tensor:
458
+ if c is not None:
459
+ return self.processor(self, x, c=c, mask=mask, rope=rope, c_rope=c_rope)
460
+ else:
461
+ return self.processor(self, x, mask=mask, rope=rope)
462
+
463
+
464
+ # Attention processor
465
+
466
+ if is_package_available("flash_attn"):
467
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
468
+ from flash_attn.bert_padding import pad_input, unpad_input
469
+
470
+
471
+ class AttnProcessor:
472
+ def __init__(
473
+ self,
474
+ pe_attn_head: int
475
+ | None = None, # number of attention heads to apply rope to, None for all
476
+ attn_backend: str = "torch", # "torch" or "flash_attn"
477
+ attn_mask_enabled: bool = True,
478
+ ):
479
+ if attn_backend == "flash_attn":
480
+ assert is_package_available("flash_attn"), (
481
+ "Please install flash-attn first."
482
+ )
483
+
484
+ self.pe_attn_head = pe_attn_head
485
+ self.attn_backend = attn_backend
486
+ self.attn_mask_enabled = attn_mask_enabled
487
+
488
+ def __call__(
489
+ self,
490
+ attn: Attention,
491
+ x: float["b n d"], # noised input x
492
+ mask: bool["b n"] | None = None,
493
+ rope=None, # rotary position embedding
494
+ ) -> torch.FloatTensor:
495
+ batch_size = x.shape[0]
496
+
497
+ # `sample` projections
498
+ query = attn.to_q(x)
499
+ key = attn.to_k(x)
500
+ value = attn.to_v(x)
501
+
502
+ # attention
503
+ inner_dim = key.shape[-1]
504
+ head_dim = inner_dim // attn.heads
505
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
506
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
507
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
508
+
509
+ # qk norm
510
+ if attn.q_norm is not None:
511
+ query = attn.q_norm(query)
512
+ if attn.k_norm is not None:
513
+ key = attn.k_norm(key)
514
+
515
+ # apply rotary position embedding
516
+ if rope is not None:
517
+ freqs, xpos_scale = rope
518
+ q_xpos_scale, k_xpos_scale = (
519
+ (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
520
+ )
521
+
522
+ if self.pe_attn_head is not None:
523
+ pn = self.pe_attn_head
524
+ query[:, :pn, :, :] = apply_rotary_pos_emb(
525
+ query[:, :pn, :, :], freqs, q_xpos_scale
526
+ )
527
+ key[:, :pn, :, :] = apply_rotary_pos_emb(
528
+ key[:, :pn, :, :], freqs, k_xpos_scale
529
+ )
530
+ else:
531
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
532
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
533
+
534
+ if self.attn_backend == "torch":
535
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
536
+ if self.attn_mask_enabled and mask is not None:
537
+ attn_mask = mask
538
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
539
+ attn_mask = attn_mask.expand(
540
+ batch_size, attn.heads, query.shape[-2], key.shape[-2]
541
+ )
542
+ else:
543
+ attn_mask = None
544
+ x = F.scaled_dot_product_attention(
545
+ query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
546
+ )
547
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
548
+
549
+ elif self.attn_backend == "flash_attn":
550
+ query = query.transpose(1, 2) # [b, h, n, d] -> [b, n, h, d]
551
+ key = key.transpose(1, 2)
552
+ value = value.transpose(1, 2)
553
+ if self.attn_mask_enabled and mask is not None:
554
+ query, indices, q_cu_seqlens, q_max_seqlen_in_batch, _ = unpad_input(
555
+ query, mask
556
+ )
557
+ key, _, k_cu_seqlens, k_max_seqlen_in_batch, _ = unpad_input(key, mask)
558
+ value, _, _, _, _ = unpad_input(value, mask)
559
+ x = flash_attn_varlen_func(
560
+ query,
561
+ key,
562
+ value,
563
+ q_cu_seqlens,
564
+ k_cu_seqlens,
565
+ q_max_seqlen_in_batch,
566
+ k_max_seqlen_in_batch,
567
+ )
568
+ x = pad_input(x, indices, batch_size, q_max_seqlen_in_batch)
569
+ x = x.reshape(batch_size, -1, attn.heads * head_dim)
570
+ else:
571
+ x = flash_attn_func(query, key, value, dropout_p=0.0, causal=False)
572
+ x = x.reshape(batch_size, -1, attn.heads * head_dim)
573
+
574
+ x = x.to(query.dtype)
575
+
576
+ # linear proj
577
+ x = attn.to_out[0](x)
578
+ # dropout
579
+ x = attn.to_out[1](x)
580
+
581
+ if mask is not None:
582
+ mask = mask.unsqueeze(-1)
583
+ x = x.masked_fill(~mask, 0.0)
584
+
585
+ return x
586
+
587
+
588
+ # Joint Attention processor for MM-DiT
589
+ # modified from diffusers/src/diffusers/models/attention_processor.py
590
+
591
+
592
+ class JointAttnProcessor:
593
+ def __init__(self):
594
+ pass
595
+
596
+ def __call__(
597
+ self,
598
+ attn: Attention,
599
+ x: float["b n d"], # noised input x
600
+ c: float["b nt d"] = None, # context c, here text
601
+ mask: bool["b n"] | None = None,
602
+ rope=None, # rotary position embedding for x
603
+ c_rope=None, # rotary position embedding for c
604
+ ) -> torch.FloatTensor:
605
+ residual = x
606
+
607
+ batch_size = c.shape[0]
608
+
609
+ # `sample` projections
610
+ query = attn.to_q(x)
611
+ key = attn.to_k(x)
612
+ value = attn.to_v(x)
613
+
614
+ # `context` projections
615
+ c_query = attn.to_q_c(c)
616
+ c_key = attn.to_k_c(c)
617
+ c_value = attn.to_v_c(c)
618
+
619
+ # attention
620
+ inner_dim = key.shape[-1]
621
+ head_dim = inner_dim // attn.heads
622
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
623
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
624
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
625
+ c_query = c_query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
626
+ c_key = c_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
627
+ c_value = c_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
628
+
629
+ # qk norm
630
+ if attn.q_norm is not None:
631
+ query = attn.q_norm(query)
632
+ if attn.k_norm is not None:
633
+ key = attn.k_norm(key)
634
+ if attn.c_q_norm is not None:
635
+ c_query = attn.c_q_norm(c_query)
636
+ if attn.c_k_norm is not None:
637
+ c_key = attn.c_k_norm(c_key)
638
+
639
+ # apply rope for context and noised input independently
640
+ if rope is not None:
641
+ freqs, xpos_scale = rope
642
+ q_xpos_scale, k_xpos_scale = (
643
+ (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
644
+ )
645
+ query = apply_rotary_pos_emb(query, freqs, q_xpos_scale)
646
+ key = apply_rotary_pos_emb(key, freqs, k_xpos_scale)
647
+ if c_rope is not None:
648
+ freqs, xpos_scale = c_rope
649
+ q_xpos_scale, k_xpos_scale = (
650
+ (xpos_scale, xpos_scale**-1.0) if xpos_scale is not None else (1.0, 1.0)
651
+ )
652
+ c_query = apply_rotary_pos_emb(c_query, freqs, q_xpos_scale)
653
+ c_key = apply_rotary_pos_emb(c_key, freqs, k_xpos_scale)
654
+
655
+ # joint attention
656
+ query = torch.cat([query, c_query], dim=2)
657
+ key = torch.cat([key, c_key], dim=2)
658
+ value = torch.cat([value, c_value], dim=2)
659
+
660
+ # mask. e.g. inference got a batch with different target durations, mask out the padding
661
+ if mask is not None:
662
+ attn_mask = F.pad(mask, (0, c.shape[1]), value=True) # no mask for c (text)
663
+ attn_mask = attn_mask.unsqueeze(1).unsqueeze(1) # 'b n -> b 1 1 n'
664
+ attn_mask = attn_mask.expand(
665
+ batch_size, attn.heads, query.shape[-2], key.shape[-2]
666
+ )
667
+ else:
668
+ attn_mask = None
669
+
670
+ x = F.scaled_dot_product_attention(
671
+ query, key, value, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
672
+ )
673
+ x = x.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
674
+ x = x.to(query.dtype)
675
+
676
+ # Split the attention outputs.
677
+ x, c = (
678
+ x[:, : residual.shape[1]],
679
+ x[:, residual.shape[1] :],
680
+ )
681
+
682
+ # linear proj
683
+ x = attn.to_out[0](x)
684
+ # dropout
685
+ x = attn.to_out[1](x)
686
+ if not attn.context_pre_only:
687
+ c = attn.to_out_c(c)
688
+
689
+ if mask is not None:
690
+ mask = mask.unsqueeze(-1)
691
+ x = x.masked_fill(~mask, 0.0)
692
+ # c = c.masked_fill(~mask, 0.) # no mask for c (text)
693
+
694
+ return x, c
695
+
696
+
697
+ # DiT Block
698
+
699
+
700
+ class DiTBlock(nn.Module):
701
+ def __init__(
702
+ self,
703
+ dim,
704
+ heads,
705
+ dim_head,
706
+ ff_mult=4,
707
+ dropout=0.1,
708
+ qk_norm=None,
709
+ pe_attn_head=None,
710
+ attn_backend="torch", # "torch" or "flash_attn"
711
+ attn_mask_enabled=True,
712
+ ):
713
+ super().__init__()
714
+
715
+ self.attn_norm = AdaLayerNorm(dim)
716
+ self.attn = Attention(
717
+ processor=AttnProcessor(
718
+ pe_attn_head=pe_attn_head,
719
+ attn_backend=attn_backend,
720
+ attn_mask_enabled=attn_mask_enabled,
721
+ ),
722
+ dim=dim,
723
+ heads=heads,
724
+ dim_head=dim_head,
725
+ dropout=dropout,
726
+ qk_norm=qk_norm,
727
+ )
728
+
729
+ self.ff_norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
730
+ self.ff = FeedForward(
731
+ dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
732
+ )
733
+
734
+ def forward(self, x, t, mask=None, rope=None): # x: noised input, t: time embedding
735
+ # pre-norm & modulation for attention input
736
+ norm, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.attn_norm(x, emb=t)
737
+
738
+ # attention
739
+ attn_output = self.attn(x=norm, mask=mask, rope=rope)
740
+
741
+ # process attention output for input x
742
+ x = x + gate_msa.unsqueeze(1) * attn_output
743
+
744
+ norm = self.ff_norm(x) * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
745
+ ff_output = self.ff(norm)
746
+ x = x + gate_mlp.unsqueeze(1) * ff_output
747
+
748
+ return x
749
+
750
+
751
+ # MMDiT Block https://arxiv.org/abs/2403.03206
752
+
753
+
754
+ class MMDiTBlock(nn.Module):
755
+ r"""
756
+ modified from diffusers/src/diffusers/models/attention.py
757
+
758
+ notes.
759
+ _c: context related. text, cond, etc. (left part in sd3 fig2.b)
760
+ _x: noised input related. (right part)
761
+ context_pre_only: last layer only do prenorm + modulation cuz no more ffn
762
+ """
763
+
764
+ def __init__(
765
+ self,
766
+ dim,
767
+ heads,
768
+ dim_head,
769
+ ff_mult=4,
770
+ dropout=0.1,
771
+ context_dim=None,
772
+ context_pre_only=False,
773
+ qk_norm=None,
774
+ ):
775
+ super().__init__()
776
+ if context_dim is None:
777
+ context_dim = dim
778
+ self.context_pre_only = context_pre_only
779
+
780
+ self.attn_norm_c = (
781
+ AdaLayerNorm_Final(context_dim)
782
+ if context_pre_only
783
+ else AdaLayerNorm(context_dim)
784
+ )
785
+ self.attn_norm_x = AdaLayerNorm(dim)
786
+ self.attn = Attention(
787
+ processor=JointAttnProcessor(),
788
+ dim=dim,
789
+ heads=heads,
790
+ dim_head=dim_head,
791
+ dropout=dropout,
792
+ context_dim=context_dim,
793
+ context_pre_only=context_pre_only,
794
+ qk_norm=qk_norm,
795
+ )
796
+
797
+ if not context_pre_only:
798
+ self.ff_norm_c = nn.LayerNorm(
799
+ context_dim, elementwise_affine=False, eps=1e-6
800
+ )
801
+ self.ff_c = FeedForward(
802
+ dim=context_dim, mult=ff_mult, dropout=dropout, approximate="tanh"
803
+ )
804
+ else:
805
+ self.ff_norm_c = None
806
+ self.ff_c = None
807
+ self.ff_norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
808
+ self.ff_x = FeedForward(
809
+ dim=dim, mult=ff_mult, dropout=dropout, approximate="tanh"
810
+ )
811
+
812
+ def forward(
813
+ self, x, c, t, mask=None, rope=None, c_rope=None
814
+ ): # x: noised input, c: context, t: time embedding
815
+ # pre-norm & modulation for attention input
816
+ if self.context_pre_only:
817
+ norm_c = self.attn_norm_c(c, t)
818
+ else:
819
+ norm_c, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.attn_norm_c(
820
+ c, emb=t
821
+ )
822
+ norm_x, x_gate_msa, x_shift_mlp, x_scale_mlp, x_gate_mlp = self.attn_norm_x(
823
+ x, emb=t
824
+ )
825
+
826
+ # attention
827
+ x_attn_output, c_attn_output = self.attn(
828
+ x=norm_x, c=norm_c, mask=mask, rope=rope, c_rope=c_rope
829
+ )
830
+
831
+ # process attention output for context c
832
+ if self.context_pre_only:
833
+ c = None
834
+ else: # if not last layer
835
+ c = c + c_gate_msa.unsqueeze(1) * c_attn_output
836
+
837
+ norm_c = (
838
+ self.ff_norm_c(c) * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
839
+ )
840
+ c_ff_output = self.ff_c(norm_c)
841
+ c = c + c_gate_mlp.unsqueeze(1) * c_ff_output
842
+
843
+ # process attention output for input x
844
+ x = x + x_gate_msa.unsqueeze(1) * x_attn_output
845
+
846
+ norm_x = self.ff_norm_x(x) * (1 + x_scale_mlp[:, None]) + x_shift_mlp[:, None]
847
+ x_ff_output = self.ff_x(norm_x)
848
+ x = x + x_gate_mlp.unsqueeze(1) * x_ff_output
849
+
850
+ return c, x
851
+
852
+
853
+ # time step conditioning embedding
854
+
855
+
856
+ # class TimestepEmbedding(nn.Module):
857
+ # def __init__(self, dim, freq_embed_dim=256):
858
+ # super().__init__()
859
+ # self.time_embed = SinusPositionEmbedding(freq_embed_dim)
860
+ # self.time_mlp = nn.Sequential(nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
861
+
862
+ # def forward(self, timestep: float["b"]):
863
+ # time_hidden = self.time_embed(timestep)
864
+ # time_hidden = time_hidden.to(timestep.dtype)
865
+ # time = self.time_mlp(time_hidden) # b d
866
+ # return time
867
+
868
+
869
+ def zipvoice_timestep_embedding(timesteps, dim, max_period=10000):
870
+ """Create sinusoidal timestep embeddings.
871
+
872
+ :param timesteps: shape of (N) or (N, T)
873
+ :param dim: the dimension of the output.
874
+ :param max_period: controls the minimum frequency of the embeddings.
875
+ :return: an Tensor of positional embeddings. shape of (N, dim) or (T, N, dim)
876
+ """
877
+ half = dim // 2
878
+ freqs = torch.exp(
879
+ -math.log(max_period)
880
+ * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device)
881
+ / half
882
+ )
883
+
884
+ if timesteps.dim() == 2:
885
+ timesteps = timesteps.transpose(0, 1) # (N, T) -> (T, N)
886
+
887
+ args = timesteps[..., None].float() * freqs[None]
888
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
889
+ if dim % 2:
890
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[..., :1])], dim=-1)
891
+ return embedding
892
+
893
+
894
+ def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear:
895
+ """
896
+ Behaves like a constructor of a modified version of nn.Linear
897
+ that gives an easy way to set the default initial parameter scale.
898
+
899
+ Args:
900
+ Accepts the standard args and kwargs that nn.Linear accepts
901
+ e.g. in_features, out_features, bias=False.
902
+
903
+ initial_scale: you can override this if you want to increase
904
+ or decrease the initial magnitude of the module's output
905
+ (affects the initialization of weight_scale and bias_scale).
906
+ Another option, if you want to do something like this, is
907
+ to re-initialize the parameters.
908
+ """
909
+ ans = nn.Linear(*args, **kwargs)
910
+ with torch.no_grad():
911
+ ans.weight[:] *= initial_scale
912
+ if ans.bias is not None:
913
+ torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, 0.1 * initial_scale)
914
+ return ans
915
+
916
+
917
+ # Used during distillation!
918
+ class TimestepGuidanceEmbedding(nn.Module):
919
+ def __init__(
920
+ self,
921
+ dim,
922
+ freq_embed_dim=256,
923
+ use_guidance_scale_embed=False,
924
+ guidance_scale_embed_dim=192,
925
+ ):
926
+ super().__init__()
927
+ self.time_embed = SinusPositionEmbedding(freq_embed_dim)
928
+ self.time_mlp = nn.Sequential(
929
+ nn.Linear(freq_embed_dim, dim), nn.SiLU(), nn.Linear(dim, dim)
930
+ )
931
+ if use_guidance_scale_embed:
932
+ self.guidance_scale_embed = ScaledLinear(
933
+ guidance_scale_embed_dim,
934
+ freq_embed_dim,
935
+ bias=False,
936
+ initial_scale=0.1,
937
+ )
938
+ self.guidance_scale_embed_dim = guidance_scale_embed_dim
939
+ else:
940
+ self.guidance_scale_embed = None
941
+
942
+ def forward(self, timestep: float["b"], guidance_scale=None):
943
+ # import pdb
944
+
945
+ # pdb.set_trace()
946
+ time_hidden = self.time_embed(timestep)
947
+
948
+ if self.guidance_scale_embed:
949
+ assert guidance_scale is not None
950
+ guidance_scale_emb = self.guidance_scale_embed(
951
+ zipvoice_timestep_embedding(
952
+ guidance_scale, self.guidance_scale_embed_dim
953
+ )
954
+ )
955
+ time_hidden = time_hidden + guidance_scale_emb
956
+ else:
957
+ assert guidance_scale is None
958
+
959
+ time_hidden = time_hidden.to(timestep.dtype)
960
+ time = self.time_mlp(time_hidden) # b d
961
+ return time
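An illustrative sketch (not part of the commit) of the conditioning embedding above; it assumes the module is importable from src/YingMusicSinger/models/modules.py as added in this commit.

```python
import torch
from src.YingMusicSinger.models.modules import TimestepGuidanceEmbedding

# Distilled path: the guidance scale is embedded together with the flow-matching timestep.
embed = TimestepGuidanceEmbedding(dim=1024, use_guidance_scale_embed=True)
t = torch.rand(4)                          # one timestep per batch element
g = torch.full((4,), 2.0)                  # guidance scale baked into the conditioning
cond = embed(t, guidance_scale=g)          # (4, 1024)

# Non-distilled path: guidance_scale must stay None.
plain = TimestepGuidanceEmbedding(dim=1024)
cond_plain = plain(torch.rand(4))          # (4, 1024)
```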
src/YingMusicSinger/utils/f5_tts/g2p/g2p/__init__.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import re
8
+
9
+ from tokenizers import Tokenizer
10
+
11
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p import cleaners
12
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.text_tokenizers import TextTokenizer
13
+
14
+ # import LangSegment
15
+ from src.YingMusicSinger.utils.f5_tts.thirdparty.LangSegment import LangSegment
16
+
17
+
18
+ class PhonemeBpeTokenizer:
19
+ def __init__(
20
+ self, vacab_path="./src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json"
21
+ ):
22
+ self.lang2backend = {
23
+ "zh": "cmn",
24
+ "ja": "ja",
25
+ "en": "en-us",
26
+ "fr": "fr-fr",
27
+ "ko": "ko",
28
+ "de": "de",
29
+ }
30
+ self.text_tokenizers = {}
31
+ self.int_text_tokenizers()
32
+
33
+ with open(vacab_path, "r") as f:
34
+ json_data = f.read()
35
+ data = json.loads(json_data)
36
+ self.vocab = data["vocab"]
37
+ LangSegment.setfilters(["en", "zh", "ja", "ko", "fr", "de"])
38
+
39
+ def int_text_tokenizers(self):
40
+ for key, value in self.lang2backend.items():
41
+ self.text_tokenizers[key] = TextTokenizer(language=value)
42
+
43
+ def tokenize(self, text, sentence, language):
44
+ # 1. convert text to phoneme
45
+ phonemes = []
46
+ if language == "auto":
47
+ seglist = LangSegment.getTexts(text)
48
+ tmp_ph = []
49
+ for seg in seglist:
50
+ tmp_ph.append(
51
+ self._clean_text(
52
+ seg["text"], sentence, seg["lang"], ["cjekfd_cleaners"]
53
+ )
54
+ )
55
+ phonemes = "|_|".join(tmp_ph)
56
+ else:
57
+ phonemes = self._clean_text(text, sentence, language, ["cjekfd_cleaners"])
58
+ # print('clean text: ', phonemes)
59
+
60
+ # 2. tokenize phonemes
61
+ phoneme_tokens = self.phoneme2token(phonemes)
62
+ # print('encode: ', phoneme_tokens)
63
+
64
+ # # 3. decode tokens [optional]
65
+ # decoded_text = self.tokenizer.decode(phoneme_tokens)
66
+ # print('decoded: ', decoded_text)
67
+
68
+ return phonemes, phoneme_tokens
69
+
70
+ def _clean_text(self, text, sentence, language, cleaner_names):
71
+ for name in cleaner_names:
72
+ cleaner = getattr(cleaners, name)
73
+ if not cleaner:
74
+ raise Exception("Unknown cleaner: %s" % name)
75
+ text = cleaner(text, sentence, language, self.text_tokenizers)
76
+ return text
77
+
78
+ def phoneme2token(self, phonemes):
79
+ tokens = []
80
+ if isinstance(phonemes, list):
81
+ for phone in phonemes:
82
+ phone = phone.split("\t")[0]
83
+ phonemes_split = phone.split("|")
84
+ tokens.append(
85
+ [self.vocab[p] for p in phonemes_split if p in self.vocab]
86
+ )
87
+ else:
88
+ phonemes = phonemes.split("\t")[0]
89
+ phonemes_split = phonemes.split("|")
90
+ tokens = [self.vocab[p] for p in phonemes_split if p in self.vocab]
91
+ return tokens
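A hedged usage sketch of the tokenizer above (not part of the commit): it assumes espeak-ng is installed for the phonemizer backends and that the script runs from the repo root so the default vocab.json path resolves.

```python
from src.YingMusicSinger.utils.f5_tts.g2p.g2p import PhonemeBpeTokenizer

tokenizer = PhonemeBpeTokenizer()   # loads vocab.json and one TextTokenizer per supported language
text = "hello world"
phonemes, phoneme_tokens = tokenizer.tokenize(text, sentence=text, language="en")
print(phonemes)         # "|"-separated IPA symbols
print(phoneme_tokens)   # integer ids looked up in vocab.json
```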
src/YingMusicSinger/utils/f5_tts/g2p/g2p/chinese_model_g2p.py ADDED
@@ -0,0 +1,209 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ import torch
11
+ from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
12
+ from torch.utils.data import DataLoader, Dataset
13
+ from transformers import BertTokenizer
14
+ from transformers.models.bert.modeling_bert import *
15
+
16
+
17
+ class PolyDataset(Dataset):
18
+ def __init__(self, words, labels, word_pad_idx=0, label_pad_idx=-1):
19
+ self.dataset = self.preprocess(words, labels)
20
+ self.word_pad_idx = word_pad_idx
21
+ self.label_pad_idx = label_pad_idx
22
+
23
+ def preprocess(self, origin_sentences, origin_labels):
24
+ """
25
+ Maps tokens and tags to their indices and stores them in the dict data.
26
+ examples:
27
+ word:['[CLS]', '浙', '商', '银', '行', '企', '业', '信', '贷', '部']
28
+ sentence:([101, 3851, 1555, 7213, 6121, 821, 689, 928, 6587, 6956],
29
+ array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
30
+ label:[3, 13, 13, 13, 0, 0, 0, 0, 0]
31
+ """
32
+ data = []
33
+ labels = []
34
+ sentences = []
35
+ # tokenize
36
+ for line in origin_sentences:
37
+ # replace each token by its index
38
+ # we can not use encode_plus because our sentences are aligned to labels in list type
39
+ words = []
40
+ word_lens = []
41
+ for token in line:
42
+ words.append(token)
43
+ word_lens.append(1)
44
+ token_start_idxs = 1 + np.cumsum([0] + word_lens[:-1])
45
+ sentences.append(((words, token_start_idxs), 0))
46
+ ###
47
+ for tag in origin_labels:
48
+ labels.append(tag)
49
+
50
+ for sentence, label in zip(sentences, labels):
51
+ data.append((sentence, label))
52
+ return data
53
+
54
+ def __getitem__(self, idx):
55
+ """sample data to get batch"""
56
+ word = self.dataset[idx][0]
57
+ label = self.dataset[idx][1]
58
+ return [word, label]
59
+
60
+ def __len__(self):
61
+ """get dataset size"""
62
+ return len(self.dataset)
63
+
64
+ def collate_fn(self, batch):
65
+ sentences = [x[0][0] for x in batch]
66
+ ori_sents = [x[0][1] for x in batch]
67
+ labels = [x[1] for x in batch]
68
+ batch_len = len(sentences)
69
+
70
+ # compute length of longest sentence in batch
71
+ max_len = max([len(s[0]) for s in sentences])
72
+ max_label_len = 0
73
+ batch_data = np.ones((batch_len, max_len))
74
+ batch_label_starts = []
75
+
76
+ # padding and aligning
77
+ for j in range(batch_len):
78
+ cur_len = len(sentences[j][0])
79
+ batch_data[j][:cur_len] = sentences[j][0]
80
+ label_start_idx = sentences[j][-1]
81
+ label_starts = np.zeros(max_len)
82
+ label_starts[[idx for idx in label_start_idx if idx < max_len]] = 1
83
+ batch_label_starts.append(label_starts)
84
+ max_label_len = max(int(sum(label_starts)), max_label_len)
85
+
86
+ # padding label
87
+ batch_labels = self.label_pad_idx * np.ones((batch_len, max_label_len))
88
+ batch_pmasks = self.label_pad_idx * np.ones((batch_len, max_label_len))
89
+ for j in range(batch_len):
90
+ cur_tags_len = len(labels[j])
91
+ batch_labels[j][:cur_tags_len] = labels[j]
92
+ batch_pmasks[j][:cur_tags_len] = [
93
+ 1 if item > 0 else 0 for item in labels[j]
94
+ ]
95
+
96
+ # convert data to torch LongTensors
97
+ batch_data = torch.tensor(batch_data, dtype=torch.long)
98
+ batch_label_starts = torch.tensor(batch_label_starts, dtype=torch.long)
99
+ batch_labels = torch.tensor(batch_labels, dtype=torch.long)
100
+ batch_pmasks = torch.tensor(batch_pmasks, dtype=torch.long)
101
+ return [batch_data, batch_label_starts, batch_labels, batch_pmasks, ori_sents]
102
+
103
+
104
+ class BertPolyPredict:
105
+ def __init__(self, bert_model, jsonr_file, json_file):
106
+ self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
107
+ with open(jsonr_file, "r", encoding="utf8") as fp:
108
+ self.pron_dict = json.load(fp)
109
+ with open(json_file, "r", encoding="utf8") as fp:
110
+ self.pron_dict_id_2_pinyin = json.load(fp)
111
+ self.num_polyphone = len(self.pron_dict)
112
+ self.device = "cpu"
113
+ self.polydataset = PolyDataset
114
+ options = SessionOptions() # initialize session options
115
+ options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
116
+ print(os.path.join(bert_model, "poly_bert_model.onnx"))
117
+ self.session = InferenceSession(
118
+ os.path.join(bert_model, "poly_bert_model.onnx"),
119
+ sess_options=options,
120
+ providers=[
121
+ "CPUExecutionProvider",
122
+ "CUDAExecutionProvider",
123
+ ], # CPUExecutionProvider #CUDAExecutionProvider
124
+ )
125
+ # self.session.set_providers(['CUDAExecutionProvider', "CPUExecutionProvider"], [ {'device_id': 0}])
126
+
127
+ # disable session.run() fallback mechanism, it prevents for a reset of the execution provider
128
+ self.session.disable_fallback()
129
+
130
+ def predict_process(self, txt_list):
131
+ word_test, label_test, texts_test = self.get_examples_po(txt_list)
132
+ data = self.polydataset(word_test, label_test)
133
+ predict_loader = DataLoader(
134
+ data, batch_size=1, shuffle=False, collate_fn=data.collate_fn
135
+ )
136
+ pred_tags = self.predict_onnx(predict_loader)
137
+ return pred_tags
138
+
139
+ def predict_onnx(self, dev_loader):
140
+ pred_tags = []
141
+ with torch.no_grad():
142
+ for idx, batch_samples in enumerate(dev_loader):
143
+ # [batch_data, batch_label_starts, batch_labels, batch_pmasks, ori_sents]
144
+ batch_data, batch_label_starts, batch_labels, batch_pmasks, _ = (
145
+ batch_samples
146
+ )
147
+ # shift tensors to GPU if available
148
+ batch_data = batch_data.to(self.device)
149
+ batch_label_starts = batch_label_starts.to(self.device)
150
+ batch_labels = batch_labels.to(self.device)
151
+ batch_pmasks = batch_pmasks.to(self.device)
152
+ batch_data = np.asarray(batch_data, dtype=np.int32)
153
+ batch_pmasks = np.asarray(batch_pmasks, dtype=np.int32)
154
+ # batch_output = self.session.run(output_names=['outputs'], input_feed={"input_ids":batch_data, "input_pmasks": batch_pmasks})[0][0]
155
+ batch_output = self.session.run(
156
+ output_names=["outputs"], input_feed={"input_ids": batch_data}
157
+ )[0]
158
+ label_masks = batch_pmasks == 1
159
+ batch_labels = batch_labels.to("cpu").numpy()
160
+ for i, indices in enumerate(np.argmax(batch_output, axis=2)):
161
+ for j, idx in enumerate(indices):
162
+ if label_masks[i][j]:
163
+ # pred_tag.append(idx)
164
+ pred_tags.append(self.pron_dict_id_2_pinyin[str(idx + 1)])
165
+ return pred_tags
166
+
167
+ def get_examples_po(self, text_list):
168
+ word_list = []
169
+ label_list = []
170
+ sentence_list = []
171
+ id = 0
172
+ for line in [text_list]:
173
+ sentence = line[0]
174
+ words = []
175
+ tokens = line[0]
176
+ index = line[-1]
177
+ front = index
178
+ back = len(tokens) - index - 1
179
+ labels = [0] * front + [1] + [0] * back
180
+ words = ["[CLS]"] + [item for item in sentence]
181
+ words = self.tokenizer.convert_tokens_to_ids(words)
182
+ word_list.append(words)
183
+ label_list.append(labels)
184
+ sentence_list.append(sentence)
185
+
186
+ id += 1
187
+ # mask_list.append(masks)
188
+ assert len(labels) + 1 == len(words), print(
189
+ (
190
+ poly,
191
+ sentence,
192
+ words,
193
+ labels,
194
+ sentence,
195
+ len(sentence),
196
+ len(words),
197
+ len(labels),
198
+ )
199
+ )
200
+ assert len(labels) + 1 == len(words), (
201
+ "Number of labels does not match number of words"
202
+ )
203
+ assert len(labels) == len(sentence), (
204
+ "Number of labels does not match number of sentences"
205
+ )
206
+ assert len(word_list) == len(label_list), (
207
+ "Number of label sentences does not match number of word sentences"
208
+ )
209
+ return word_list, label_list, text_list
src/YingMusicSinger/utils/f5_tts/g2p/g2p/cleaners.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.english import english_to_ipa
7
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.french import french_to_ipa
8
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.german import german_to_ipa
9
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.korean import korean_to_ipa
10
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.mandarin import chinese_to_ipa
11
+
12
+
13
+ def cjekfd_cleaners(text, sentence, language, text_tokenizers):
14
+ if language == "zh":
15
+ return chinese_to_ipa(text, sentence, text_tokenizers["zh"])
16
+ elif language == "ja":
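+ # NOTE: japanese_to_ipa is not imported above, so language == "ja" will raise a NameError unless it is provided elsewhere.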
17
+ return japanese_to_ipa(text, text_tokenizers["ja"])
18
+ elif language == "en":
19
+ return english_to_ipa(text, text_tokenizers["en"])
20
+ elif language == "fr":
21
+ return french_to_ipa(text, text_tokenizers["fr"])
22
+ elif language == "ko":
23
+ return korean_to_ipa(text, text_tokenizers["ko"])
24
+ elif language == "de":
25
+ return german_to_ipa(text, text_tokenizers["de"])
26
+ else:
27
+ raise Exception("Unknown language: %s" % language)
28
+ return None
src/YingMusicSinger/utils/f5_tts/g2p/g2p/english.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+
8
+ import inflect
9
+
10
+ """
11
+ Text clean time
12
+ """
13
+ _inflect = inflect.engine()
14
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
15
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
16
+ _percent_number_re = re.compile(r"([0-9\.\,]*[0-9]+%)")
17
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
18
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
19
+ _fraction_re = re.compile(r"([0-9]+)/([0-9]+)")
20
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
21
+ _number_re = re.compile(r"[0-9]+")
22
+
23
+ # List of (regular expression, replacement) pairs for abbreviations:
24
+ _abbreviations = [
25
+ (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
26
+ for x in [
27
+ ("mrs", "misess"),
28
+ ("mr", "mister"),
29
+ ("dr", "doctor"),
30
+ ("st", "saint"),
31
+ ("co", "company"),
32
+ ("jr", "junior"),
33
+ ("maj", "major"),
34
+ ("gen", "general"),
35
+ ("drs", "doctors"),
36
+ ("rev", "reverend"),
37
+ ("lt", "lieutenant"),
38
+ ("hon", "honorable"),
39
+ ("sgt", "sergeant"),
40
+ ("capt", "captain"),
41
+ ("esq", "esquire"),
42
+ ("ltd", "limited"),
43
+ ("col", "colonel"),
44
+ ("ft", "fort"),
45
+ ("etc", "et cetera"),
46
+ ("btw", "by the way"),
47
+ ]
48
+ ]
49
+
50
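+ # Post-processing map applied to espeak output: merges phone pairs split by the "|" separator (e.g. "t|ɹ" -> "tɹ") and folds rare symbols onto phones present in vocab.json.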
+ _special_map = [
51
+ ("t|ɹ", "tɹ"),
52
+ ("d|ɹ", "dɹ"),
53
+ ("t|s", "ts"),
54
+ ("d|z", "dz"),
55
+ ("ɪ|ɹ", "ɪɹ"),
56
+ ("ɐ", "ɚ"),
57
+ ("ᵻ", "ɪ"),
58
+ ("əl", "l"),
59
+ ("x", "k"),
60
+ ("ɬ", "l"),
61
+ ("ʔ", "t"),
62
+ ("n̩", "n"),
63
+ ("oː|ɹ", "oːɹ"),
64
+ ]
65
+
66
+
67
+ def expand_abbreviations(text):
68
+ for regex, replacement in _abbreviations:
69
+ text = re.sub(regex, replacement, text)
70
+ return text
71
+
72
+
73
+ def _remove_commas(m):
74
+ return m.group(1).replace(",", "")
75
+
76
+
77
+ def _expand_decimal_point(m):
78
+ return m.group(1).replace(".", " point ")
79
+
80
+
81
+ def _expand_percent(m):
82
+ return m.group(1).replace("%", " percent ")
83
+
84
+
85
+ def _expand_dollars(m):
86
+ match = m.group(1)
87
+ parts = match.split(".")
88
+ if len(parts) > 2:
89
+ return " " + match + " dollars " # Unexpected format
90
+ dollars = int(parts[0]) if parts[0] else 0
91
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
92
+ if dollars and cents:
93
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
94
+ cent_unit = "cent" if cents == 1 else "cents"
95
+ return " %s %s, %s %s " % (dollars, dollar_unit, cents, cent_unit)
96
+ elif dollars:
97
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
98
+ return " %s %s " % (dollars, dollar_unit)
99
+ elif cents:
100
+ cent_unit = "cent" if cents == 1 else "cents"
101
+ return " %s %s " % (cents, cent_unit)
102
+ else:
103
+ return " zero dollars "
104
+
105
+
106
+ def fraction_to_words(numerator, denominator):
107
+ if numerator == 1 and denominator == 2:
108
+ return " one half "
109
+ if numerator == 1 and denominator == 4:
110
+ return " one quarter "
111
+ if denominator == 2:
112
+ return " " + _inflect.number_to_words(numerator) + " halves "
113
+ if denominator == 4:
114
+ return " " + _inflect.number_to_words(numerator) + " quarters "
115
+ return (
116
+ " "
117
+ + _inflect.number_to_words(numerator)
118
+ + " "
119
+ + _inflect.ordinal(_inflect.number_to_words(denominator))
120
+ + " "
121
+ )
122
+
123
+
124
+ def _expand_fraction(m):
125
+ numerator = int(m.group(1))
126
+ denominator = int(m.group(2))
127
+ return fraction_to_words(numerator, denominator)
128
+
129
+
130
+ def _expand_ordinal(m):
131
+ return " " + _inflect.number_to_words(m.group(0)) + " "
132
+
133
+
134
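+ # Integers between 1000 and 3000 are read year-style (e.g. 1999 -> "nineteen ninety nine"); other integers fall back to inflect's standard wording.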
+ def _expand_number(m):
135
+ num = int(m.group(0))
136
+ if num > 1000 and num < 3000:
137
+ if num == 2000:
138
+ return " two thousand "
139
+ elif num > 2000 and num < 2010:
140
+ return " two thousand " + _inflect.number_to_words(num % 100) + " "
141
+ elif num % 100 == 0:
142
+ return " " + _inflect.number_to_words(num // 100) + " hundred "
143
+ else:
144
+ return (
145
+ " "
146
+ + _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(
147
+ ", ", " "
148
+ )
149
+ + " "
150
+ )
151
+ else:
152
+ return " " + _inflect.number_to_words(num, andword="") + " "
153
+
154
+
155
+ # Normalize numbers pronunciation
156
+ def normalize_numbers(text):
157
+ text = re.sub(_comma_number_re, _remove_commas, text)
158
+ text = re.sub(_pounds_re, r"\1 pounds", text)
159
+ text = re.sub(_dollars_re, _expand_dollars, text)
160
+ text = re.sub(_fraction_re, _expand_fraction, text)
161
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
162
+ text = re.sub(_percent_number_re, _expand_percent, text)
163
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
164
+ text = re.sub(_number_re, _expand_number, text)
165
+ return text
166
+
167
+
168
+ def _english_to_ipa(text):
169
+ # text = unidecode(text).lower()
170
+ text = expand_abbreviations(text)
171
+ text = normalize_numbers(text)
172
+ return text
173
+
174
+
175
+ # special map
176
+ def special_map(text):
177
+ for regex, replacement in _special_map:
178
+ regex = regex.replace("|", r"\|")
179
+ while re.search(r"(^|[_|]){}([_|]|$)".format(regex), text):
180
+ text = re.sub(
181
+ r"(^|[_|]){}([_|]|$)".format(regex), r"\1{}\2".format(replacement), text
182
+ )
183
+ # text = re.sub(r'([,.!?])', r'|\1', text)
184
+ return text
185
+
186
+
187
+ # Add some special operation
188
+ def english_to_ipa(text, text_tokenizer):
189
+ if type(text) == str:
190
+ text = _english_to_ipa(text)
191
+ else:
192
+ text = [_english_to_ipa(t) for t in text]
193
+ phonemes = text_tokenizer(text)
194
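+ # If the phoneme string ends in a phone rather than punctuation, append a trailing word-boundary marker "_".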
+ if phonemes[-1] in "p⁼ʰmftnlkxʃs`ɹaoəɛɪeɑʊŋiuɥwæjː":
195
+ phonemes += "|_"
196
+ if type(text) == str:
197
+ return special_map(phonemes)
198
+ else:
199
+ result_ph = []
200
+ for phone in phonemes:
201
+ result_ph.append(special_map(phone))
202
+ return result_ph
src/YingMusicSinger/utils/f5_tts/g2p/g2p/french.py ADDED
@@ -0,0 +1,149 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+
8
+ """
9
+ Text clean time
10
+ """
11
+ # List of (regular expression, replacement) pairs for abbreviations in french:
12
+ _abbreviations = [
13
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
14
+ for x in [
15
+ ("M", "monsieur"),
16
+ ("Mlle", "mademoiselle"),
17
+ ("Mlles", "mesdemoiselles"),
18
+ ("Mme", "Madame"),
19
+ ("Mmes", "Mesdames"),
20
+ ("N.B", "nota bene"),
21
+ ("M", "monsieur"),
22
+ ("p.c.q", "parce que"),
23
+ ("Pr", "professeur"),
24
+ ("qqch", "quelque chose"),
25
+ ("rdv", "rendez-vous"),
26
+ ("max", "maximum"),
27
+ ("min", "minimum"),
28
+ ("no", "numéro"),
29
+ ("adr", "adresse"),
30
+ ("dr", "docteur"),
31
+ ("st", "saint"),
32
+ ("co", "companie"),
33
+ ("jr", "junior"),
34
+ ("sgt", "sergent"),
35
+ ("capt", "capitain"),
36
+ ("col", "colonel"),
37
+ ("av", "avenue"),
38
+ ("av. J.-C", "avant Jésus-Christ"),
39
+ ("apr. J.-C", "après Jésus-Christ"),
40
+ ("art", "article"),
41
+ ("boul", "boulevard"),
42
+ ("c.-à-d", "c’est-à-dire"),
43
+ ("etc", "et cetera"),
44
+ ("ex", "exemple"),
45
+ ("excl", "exclusivement"),
46
+ ("boul", "boulevard"),
47
+ ]
48
+ ] + [
49
+ (re.compile("\\b%s" % x[0]), x[1])
50
+ for x in [
51
+ ("Mlle", "mademoiselle"),
52
+ ("Mlles", "mesdemoiselles"),
53
+ ("Mme", "Madame"),
54
+ ("Mmes", "Mesdames"),
55
+ ]
56
+ ]
57
+
58
+ rep_map = {
59
+ ":": ",",
60
+ ";": ",",
61
+ ",": ",",
62
+ "。": ".",
63
+ "!": "!",
64
+ "?": "?",
65
+ "\n": ".",
66
+ "·": ",",
67
+ "、": ",",
68
+ "...": ".",
69
+ "…": ".",
70
+ "$": ".",
71
+ "“": "",
72
+ "”": "",
73
+ "‘": "",
74
+ "’": "",
75
+ "(": "",
76
+ ")": "",
77
+ "(": "",
78
+ ")": "",
79
+ "《": "",
80
+ "》": "",
81
+ "【": "",
82
+ "】": "",
83
+ "[": "",
84
+ "]": "",
85
+ "—": "",
86
+ "~": "-",
87
+ "~": "-",
88
+ "「": "",
89
+ "」": "",
90
+ "¿": "",
91
+ "¡": "",
92
+ }
93
+
94
+
95
+ def collapse_whitespace(text):
96
+ # Regular expression matching whitespace:
97
+ _whitespace_re = re.compile(r"\s+")
98
+ return re.sub(_whitespace_re, " ", text).strip()
99
+
100
+
101
+ def remove_punctuation_at_begin(text):
102
+ return re.sub(r"^[,.!?]+", "", text)
103
+
104
+
105
+ def remove_aux_symbols(text):
106
+ text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
107
+ return text
108
+
109
+
110
+ def replace_symbols(text):
111
+ text = text.replace(";", ",")
112
+ text = text.replace("-", " ")
113
+ text = text.replace(":", ",")
114
+ text = text.replace("&", " et ")
115
+ return text
116
+
117
+
118
+ def expand_abbreviations(text):
119
+ for regex, replacement in _abbreviations:
120
+ text = re.sub(regex, replacement, text)
121
+ return text
122
+
123
+
124
+ def replace_punctuation(text):
125
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
126
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
127
+ return replaced_text
128
+
129
+
130
+ def text_normalize(text):
131
+ text = expand_abbreviations(text)
132
+ text = replace_punctuation(text)
133
+ text = replace_symbols(text)
134
+ text = remove_aux_symbols(text)
135
+ text = remove_punctuation_at_begin(text)
136
+ text = collapse_whitespace(text)
137
+ text = re.sub(r"([^\.,!\?\-…])$", r"\1", text)
138
+ return text
139
+
140
+
141
+ def french_to_ipa(text, text_tokenizer):
142
+ if type(text) == str:
143
+ text = text_normalize(text)
144
+ phonemes = text_tokenizer(text)
145
+ return phonemes
146
+ else:
147
+ for i, t in enumerate(text):
148
+ text[i] = text_normalize(t)
149
+ return text_tokenizer(text)
src/YingMusicSinger/utils/f5_tts/g2p/g2p/german.py ADDED
@@ -0,0 +1,94 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+
8
+ """
9
+ Text clean time
10
+ """
11
+ rep_map = {
12
+ ":": ",",
13
+ ";": ",",
14
+ ",": ",",
15
+ "。": ".",
16
+ "!": "!",
17
+ "?": "?",
18
+ "\n": ".",
19
+ "·": ",",
20
+ "、": ",",
21
+ "...": ".",
22
+ "…": ".",
23
+ "$": ".",
24
+ "“": "",
25
+ "”": "",
26
+ "‘": "",
27
+ "’": "",
28
+ "(": "",
29
+ ")": "",
30
+ "(": "",
31
+ ")": "",
32
+ "《": "",
33
+ "》": "",
34
+ "【": "",
35
+ "】": "",
36
+ "[": "",
37
+ "]": "",
38
+ "—": "",
39
+ "~": "-",
40
+ "~": "-",
41
+ "「": "",
42
+ "」": "",
43
+ "¿": "",
44
+ "¡": "",
45
+ }
46
+
47
+
48
+ def collapse_whitespace(text):
49
+ # Regular expression matching whitespace:
50
+ _whitespace_re = re.compile(r"\s+")
51
+ return re.sub(_whitespace_re, " ", text).strip()
52
+
53
+
54
+ def remove_punctuation_at_begin(text):
55
+ return re.sub(r"^[,.!?]+", "", text)
56
+
57
+
58
+ def remove_aux_symbols(text):
59
+ text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
60
+ return text
61
+
62
+
63
+ def replace_symbols(text):
64
+ text = text.replace(";", ",")
65
+ text = text.replace("-", " ")
66
+ text = text.replace(":", ",")
67
+ return text
68
+
69
+
70
+ def replace_punctuation(text):
71
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
72
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
73
+ return replaced_text
74
+
75
+
76
+ def text_normalize(text):
77
+ text = replace_punctuation(text)
78
+ text = replace_symbols(text)
79
+ text = remove_aux_symbols(text)
80
+ text = remove_punctuation_at_begin(text)
81
+ text = collapse_whitespace(text)
82
+ text = re.sub(r"([^\.,!\?\-…])$", r"\1", text)
83
+ return text
84
+
85
+
86
+ def german_to_ipa(text, text_tokenizer):
87
+ if type(text) == str:
88
+ text = text_normalize(text)
89
+ phonemes = text_tokenizer(text)
90
+ return phonemes
91
+ else:
92
+ for i, t in enumerate(text):
93
+ text[i] = text_normalize(t)
94
+ return text_tokenizer(text)
src/YingMusicSinger/utils/f5_tts/g2p/g2p/korean.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+
8
+ """
9
+ Text clean time
10
+ """
11
+ english_dictionary = {
12
+ "KOREA": "코리아",
13
+ "IDOL": "아이돌",
14
+ "IT": "아이티",
15
+ "IQ": "아이큐",
16
+ "UP": "업",
17
+ "DOWN": "다운",
18
+ "PC": "피씨",
19
+ "CCTV": "씨씨티비",
20
+ "SNS": "에스엔에스",
21
+ "AI": "에이아이",
22
+ "CEO": "씨이오",
23
+ "A": "에이",
24
+ "B": "비",
25
+ "C": "씨",
26
+ "D": "디",
27
+ "E": "이",
28
+ "F": "에프",
29
+ "G": "지",
30
+ "H": "에이치",
31
+ "I": "아이",
32
+ "J": "제이",
33
+ "K": "케이",
34
+ "L": "엘",
35
+ "M": "엠",
36
+ "N": "엔",
37
+ "O": "오",
38
+ "P": "피",
39
+ "Q": "큐",
40
+ "R": "알",
41
+ "S": "에스",
42
+ "T": "티",
43
+ "U": "유",
44
+ "V": "브이",
45
+ "W": "더블유",
46
+ "X": "엑스",
47
+ "Y": "와이",
48
+ "Z": "제트",
49
+ }
50
+
51
+
52
+ def normalize(text):
53
+ text = text.strip()
54
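+ # Remove CJK ideographs (hanja and radicals) before mapping Latin words and letters to their hangul readings.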
+ text = re.sub(
55
+ "[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text
56
+ )
57
+ text = normalize_english(text)
58
+ text = text.lower()
59
+ return text
60
+
61
+
62
+ def normalize_english(text):
63
+ def fn(m):
64
+ word = m.group()
65
+ if word in english_dictionary:
66
+ return english_dictionary.get(word)
67
+ return word
68
+
69
+ text = re.sub("([A-Za-z]+)", fn, text)
70
+ return text
71
+
72
+
73
+ def korean_to_ipa(text, text_tokenizer):
74
+ if type(text) == str:
75
+ text = normalize(text)
76
+ phonemes = text_tokenizer(text)
77
+ return phonemes
78
+ else:
79
+ for i, t in enumerate(text):
80
+ text[i] = normalize(t)
81
+ return text_tokenizer(text)
src/YingMusicSinger/utils/f5_tts/g2p/g2p/mandarin.py ADDED
@@ -0,0 +1,603 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import re
8
+ from typing import List
9
+
10
+ import cn2an
11
+ import jieba
12
+ from pypinyin import BOPOMOFO, lazy_pinyin
13
+
14
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p.chinese_model_g2p import BertPolyPredict
15
+ from src.YingMusicSinger.utils.f5_tts.g2p.utils.front_utils import *
16
+
17
+ # from g2pw import G2PWConverter
18
+
19
+
20
+ # set blank level, {0:"none",1:"char", 2:"word"}
21
+ BLANK_LEVEL = 0
22
+
23
+ # conv = G2PWConverter(style='pinyin', enable_non_tradional_chinese=True)
24
+ resource_path = r"./src/YingMusicSinger/utils/f5_tts/g2p"
25
+ poly_all_class_path = os.path.join(
26
+ resource_path, "sources", "g2p_chinese_model", "polychar.txt"
27
+ )
28
+ if not os.path.exists(poly_all_class_path):
29
+ print(
30
+ "Incorrect path for polyphonic character class dictionary: {}, please check...".format(
31
+ poly_all_class_path
32
+ )
33
+ )
34
+ exit()
35
+ poly_dict = generate_poly_lexicon(poly_all_class_path)
36
+
37
+ # Set up G2PW model parameters
38
+ g2pw_poly_model_path = os.path.join(resource_path, "sources", "g2p_chinese_model")
39
+ if not os.path.exists(g2pw_poly_model_path):
40
+ print(
41
+ "Incorrect path for g2pw polyphonic character model: {}, please check...".format(
42
+ g2pw_poly_model_path
43
+ )
44
+ )
45
+ exit()
46
+
47
+ json_file_path = os.path.join(
48
+ resource_path, "sources", "g2p_chinese_model", "polydict.json"
49
+ )
50
+ if not os.path.exists(json_file_path):
51
+ print(
52
+ "Incorrect path for g2pw id to pinyin dictionary: {}, please check...".format(
53
+ json_file_path
54
+ )
55
+ )
56
+ exit()
57
+
58
+ jsonr_file_path = os.path.join(
59
+ resource_path, "sources", "g2p_chinese_model", "polydict_r.json"
60
+ )
61
+ if not os.path.exists(jsonr_file_path):
62
+ print(
63
+ "Incorrect path for g2pw pinyin to id dictionary: {}, please check...".format(
64
+ jsonr_file_path
65
+ )
66
+ )
67
+ exit()
68
+
69
+ g2pw_poly_predict = BertPolyPredict(
70
+ g2pw_poly_model_path, jsonr_file_path, json_file_path
71
+ )
72
+
73
+
74
+ """
75
+ Text clean time
76
+ """
77
+ # List of (Latin alphabet, bopomofo) pairs:
78
+ _latin_to_bopomofo = [
79
+ (re.compile("%s" % x[0], re.IGNORECASE), x[1])
80
+ for x in [
81
+ ("a", "ㄟˉ"),
82
+ ("b", "ㄅㄧˋ"),
83
+ ("c", "ㄙㄧˉ"),
84
+ ("d", "ㄉㄧˋ"),
85
+ ("e", "ㄧˋ"),
86
+ ("f", "ㄝˊㄈㄨˋ"),
87
+ ("g", "ㄐㄧˋ"),
88
+ ("h", "ㄝˇㄑㄩˋ"),
89
+ ("i", "ㄞˋ"),
90
+ ("j", "ㄐㄟˋ"),
91
+ ("k", "ㄎㄟˋ"),
92
+ ("l", "ㄝˊㄛˋ"),
93
+ ("m", "ㄝˊㄇㄨˋ"),
94
+ ("n", "ㄣˉ"),
95
+ ("o", "ㄡˉ"),
96
+ ("p", "ㄆㄧˉ"),
97
+ ("q", "ㄎㄧㄡˉ"),
98
+ ("r", "ㄚˋ"),
99
+ ("s", "ㄝˊㄙˋ"),
100
+ ("t", "ㄊㄧˋ"),
101
+ ("u", "ㄧㄡˉ"),
102
+ ("v", "ㄨㄧˉ"),
103
+ ("w", "ㄉㄚˋㄅㄨˋㄌㄧㄡˋ"),
104
+ ("x", "ㄝˉㄎㄨˋㄙˋ"),
105
+ ("y", "ㄨㄞˋ"),
106
+ ("z", "ㄗㄟˋ"),
107
+ ]
108
+ ]
109
+
110
+ # List of (bopomofo, ipa) pairs:
111
+ _bopomofo_to_ipa = [
112
+ (re.compile("%s" % x[0]), x[1])
113
+ for x in [
114
+ ("ㄅㄛ", "p⁼wo"),
115
+ ("ㄆㄛ", "pʰwo"),
116
+ ("ㄇㄛ", "mwo"),
117
+ ("ㄈㄛ", "fwo"),
118
+ ("ㄧㄢ", "|jɛn"),
119
+ ("ㄩㄢ", "|ɥæn"),
120
+ ("ㄧㄣ", "|in"),
121
+ ("ㄩㄣ", "|ɥn"),
122
+ ("ㄧㄥ", "|iŋ"),
123
+ ("ㄨㄥ", "|ʊŋ"),
124
+ ("ㄩㄥ", "|jʊŋ"),
125
+ # Add
126
+ ("ㄧㄚ", "|ia"),
127
+ ("ㄧㄝ", "|iɛ"),
128
+ ("ㄧㄠ", "|iɑʊ"),
129
+ ("ㄧㄡ", "|ioʊ"),
130
+ ("ㄧㄤ", "|iɑŋ"),
131
+ ("ㄨㄚ", "|ua"),
132
+ ("ㄨㄛ", "|uo"),
133
+ ("ㄨㄞ", "|uaɪ"),
134
+ ("ㄨㄟ", "|ueɪ"),
135
+ ("ㄨㄢ", "|uan"),
136
+ ("ㄨㄣ", "|uən"),
137
+ ("ㄨㄤ", "|uɑŋ"),
138
+ ("ㄩㄝ", "|ɥɛ"),
139
+ # End
140
+ ("ㄅ", "p⁼"),
141
+ ("ㄆ", "pʰ"),
142
+ ("ㄇ", "m"),
143
+ ("ㄈ", "f"),
144
+ ("ㄉ", "t⁼"),
145
+ ("ㄊ", "tʰ"),
146
+ ("ㄋ", "n"),
147
+ ("ㄌ", "l"),
148
+ ("ㄍ", "k⁼"),
149
+ ("ㄎ", "kʰ"),
150
+ ("ㄏ", "x"),
151
+ ("ㄐ", "tʃ⁼"),
152
+ ("ㄑ", "tʃʰ"),
153
+ ("ㄒ", "ʃ"),
154
+ ("ㄓ", "ts`⁼"),
155
+ ("ㄔ", "ts`ʰ"),
156
+ ("ㄕ", "s`"),
157
+ ("ㄖ", "ɹ`"),
158
+ ("ㄗ", "ts⁼"),
159
+ ("ㄘ", "tsʰ"),
160
+ ("ㄙ", "|s"),
161
+ ("ㄚ", "|a"),
162
+ ("ㄛ", "|o"),
163
+ ("ㄜ", "|ə"),
164
+ ("ㄝ", "|ɛ"),
165
+ ("ㄞ", "|aɪ"),
166
+ ("ㄟ", "|eɪ"),
167
+ ("ㄠ", "|ɑʊ"),
168
+ ("ㄡ", "|oʊ"),
169
+ ("ㄢ", "|an"),
170
+ ("ㄣ", "|ən"),
171
+ ("ㄤ", "|ɑŋ"),
172
+ ("ㄥ", "|əŋ"),
173
+ ("ㄦ", "əɹ"),
174
+ ("ㄧ", "|i"),
175
+ ("ㄨ", "|u"),
176
+ ("ㄩ", "|ɥ"),
177
+ ("ˉ", "→|"),
178
+ ("ˊ", "↑|"),
179
+ ("ˇ", "↓↑|"),
180
+ ("ˋ", "↓|"),
181
+ ("˙", "|"),
182
+ ]
183
+ ]
184
+ must_not_er_words = {"女儿", "老儿", "男儿", "少儿", "小儿"}
185
+
186
+ word_pinyin_dict = {}
187
+ with open(
188
+ r"src/YingMusicSinger/utils/f5_tts/g2p/sources/chinese_lexicon.txt",
189
+ "r",
190
+ encoding="utf-8",
191
+ ) as fread:
192
+ txt_list = fread.readlines()
193
+ for txt in txt_list:
194
+ word, pinyin = txt.strip().split("\t")
195
+ word_pinyin_dict[word] = pinyin
196
+ fread.close()
197
+
198
+ pinyin_2_bopomofo_dict = {}
199
+ with open(
200
+ r"./src/YingMusicSinger/utils/f5_tts/g2p/sources/pinyin_2_bpmf.txt",
201
+ "r",
202
+ encoding="utf-8",
203
+ ) as fread:
204
+ txt_list = fread.readlines()
205
+ for txt in txt_list:
206
+ pinyin, bopomofo = txt.strip().split("\t")
207
+ pinyin_2_bopomofo_dict[pinyin] = bopomofo
208
+ fread.close()
209
+
210
+ tone_dict = {
211
+ "0": "˙",
212
+ "5": "˙",
213
+ "1": "",
214
+ "2": "ˊ",
215
+ "3": "ˇ",
216
+ "4": "ˋ",
217
+ }
218
+
219
+ bopomofos2pinyin_dict = {}
220
+ with open(
221
+ r"./src/YingMusicSinger/utils/f5_tts/g2p/sources/bpmf_2_pinyin.txt",
222
+ "r",
223
+ encoding="utf-8",
224
+ ) as fread:
225
+ txt_list = fread.readlines()
226
+ for txt in txt_list:
227
+ v, k = txt.strip().split("\t")
228
+ bopomofos2pinyin_dict[k] = v
229
+ fread.close()
230
+
231
+
232
+ def bpmf_to_pinyin(text):
233
+ bopomofo_list = text.split("|")
234
+ pinyin_list = []
235
+ for info in bopomofo_list:
236
+ pinyin = ""
237
+ for c in info:
238
+ if c in bopomofos2pinyin_dict:
239
+ pinyin += bopomofos2pinyin_dict[c]
240
+ if len(pinyin) == 0:
241
+ continue
242
+ if pinyin[-1] not in "01234":
243
+ pinyin += "1"
244
+ if pinyin[:-1] == "ve":
245
+ pinyin = "y" + pinyin
246
+ if pinyin[:-1] == "sh":
247
+ pinyin = pinyin[:-1] + "i" + pinyin[-1]
248
+ if pinyin == "sh":
249
+ pinyin = pinyin[:-1] + "i"
250
+ if pinyin[:-1] == "s":
251
+ pinyin = "si" + pinyin[-1]
252
+ if pinyin[:-1] == "c":
253
+ pinyin = "ci" + pinyin[-1]
254
+ if pinyin[:-1] == "i":
255
+ pinyin = "yi" + pinyin[-1]
256
+ if pinyin[:-1] == "iou":
257
+ pinyin = "you" + pinyin[-1]
258
+ if pinyin[:-1] == "ien":
259
+ pinyin = "yin" + pinyin[-1]
260
+ if "iou" in pinyin and pinyin[-4:-1] == "iou":
261
+ pinyin = pinyin[:-4] + "iu" + pinyin[-1]
262
+ if "uei" in pinyin:
263
+ if pinyin[:-1] == "uei":
264
+ pinyin = "wei" + pinyin[-1]
265
+ elif pinyin[-4:-1] == "uei":
266
+ pinyin = pinyin[:-4] + "ui" + pinyin[-1]
267
+ if "uen" in pinyin and pinyin[-4:-1] == "uen":
268
+ if pinyin[:-1] == "uen":
269
+ pinyin = "wen" + pinyin[-1]
270
+ elif pinyin[-4:-1] == "uei":
271
+ pinyin = pinyin[:-4] + "un" + pinyin[-1]
272
+ if "van" in pinyin and pinyin[-4:-1] == "van":
273
+ if pinyin[:-1] == "van":
274
+ pinyin = "yuan" + pinyin[-1]
275
+ elif pinyin[-4:-1] == "van":
276
+ pinyin = pinyin[:-4] + "uan" + pinyin[-1]
277
+ if "ueng" in pinyin and pinyin[-5:-1] == "ueng":
278
+ pinyin = pinyin[:-5] + "ong" + pinyin[-1]
279
+ if pinyin[:-1] == "veng":
280
+ pinyin = "yong" + pinyin[-1]
281
+ if "veng" in pinyin and pinyin[-5:-1] == "veng":
282
+ pinyin = pinyin[:-5] + "iong" + pinyin[-1]
283
+ if pinyin[:-1] == "ieng":
284
+ pinyin = "ying" + pinyin[-1]
285
+ if pinyin[:-1] == "u":
286
+ pinyin = "wu" + pinyin[-1]
287
+ if pinyin[:-1] == "v":
288
+ pinyin = "yv" + pinyin[-1]
289
+ if pinyin[:-1] == "ing":
290
+ pinyin = "ying" + pinyin[-1]
291
+ if pinyin[:-1] == "z":
292
+ pinyin = "zi" + pinyin[-1]
293
+ if pinyin[:-1] == "zh":
294
+ pinyin = "zhi" + pinyin[-1]
295
+ if pinyin[0] == "u":
296
+ pinyin = "w" + pinyin[1:]
297
+ if pinyin[0] == "i":
298
+ pinyin = "y" + pinyin[1:]
299
+ pinyin = pinyin.replace("ien", "in")
300
+
301
+ pinyin_list.append(pinyin)
302
+ return " ".join(pinyin_list)
303
+
304
+
305
+ # Convert numbers to Chinese pronunciation
306
+ def number_to_chinese(text):
307
+ # numbers = re.findall(r'\d+(?:\.?\d+)?', text)
308
+ # for number in numbers:
309
+ # text = text.replace(number, cn2an.an2cn(number), 1)
310
+ text = cn2an.transform(text, "an2cn")
311
+ return text
312
+
313
+
314
+ def normalization(text):
315
+ text = text.replace(",", ",")
316
+ text = text.replace("。", ".")
317
+ text = text.replace("!", "!")
318
+ text = text.replace("?", "?")
319
+ text = text.replace(";", ";")
320
+ text = text.replace(":", ":")
321
+ text = text.replace("、", ",")
322
+ text = text.replace("‘", "'")
323
+ text = text.replace("’", "'")
324
+ text = text.replace("⋯", "…")
325
+ text = text.replace("···", "…")
326
+ text = text.replace("・・・", "…")
327
+ text = text.replace("...", "…")
328
+ text = re.sub(r"\s+", "", text)
329
+ text = re.sub(r"[^\u4e00-\u9fff\s_,\.\?!;:\'…]", "", text)
330
+ text = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", text)
331
+ return text
332
+
333
+
334
+ def change_tone(bopomofo: str, tone: str) -> str:
335
+ if bopomofo[-1] not in "˙ˊˇˋ":
336
+ bopomofo = bopomofo + tone
337
+ else:
338
+ bopomofo = bopomofo[:-1] + tone
339
+ return bopomofo
340
+
341
+
342
+ def er_sandhi(word: str, bopomofos: List[str]) -> List[str]:
343
+ if len(word) > 1 and word[-1] == "儿" and word not in must_not_er_words:
344
+ bopomofos[-1] = change_tone(bopomofos[-1], "˙")
345
+ return bopomofos
346
+
347
+
348
+ def bu_sandhi(word: str, bopomofos: List[str]) -> List[str]:
349
+ valid_char = set(word)
350
+ if len(valid_char) == 1 and "不" in valid_char:
351
+ pass
352
+ elif word in ["不字"]:
353
+ pass
354
+ elif len(word) == 3 and word[1] == "不" and bopomofos[1][:-1] == "ㄅㄨ":
355
+ bopomofos[1] = bopomofos[1][:-1] + "˙"
356
+ else:
357
+ for i, char in enumerate(word):
358
+ if (
359
+ i + 1 < len(bopomofos)
360
+ and char == "不"
361
+ and i + 1 < len(word)
362
+ and 0 < len(bopomofos[i + 1])
363
+ and bopomofos[i + 1][-1] == "ˋ"
364
+ ):
365
+ bopomofos[i] = bopomofos[i][:-1] + "ˊ"
366
+ return bopomofos
367
+
368
+
369
+ def yi_sandhi(word: str, bopomofos: List[str]) -> List[str]:
370
+ punc = ":,;。?!“”‘’':,;.?!()(){}【】[]-~`、 "
371
+ if word.find("一") != -1 and any(
372
+ [item.isnumeric() for item in word if item != "一"]
373
+ ):
374
+ for i in range(len(word)):
375
+ if (
376
+ i == 0
377
+ and word[0] == "一"
378
+ and len(word) > 1
379
+ and word[1]
380
+ not in [
381
+ "零",
382
+ "一",
383
+ "二",
384
+ "三",
385
+ "四",
386
+ "五",
387
+ "六",
388
+ "七",
389
+ "八",
390
+ "九",
391
+ "十",
392
+ ]
393
+ ):
394
+ if len(bopomofos[0]) > 0 and bopomofos[1][-1] in ["ˋ", "˙"]:
395
+ bopomofos[0] = change_tone(bopomofos[0], "ˊ")
396
+ else:
397
+ bopomofos[0] = change_tone(bopomofos[0], "ˋ")
398
+ elif word[i] == "一":
399
+ bopomofos[i] = change_tone(bopomofos[i], "")
400
+ return bopomofos
401
+ elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
402
+ bopomofos[1] = change_tone(bopomofos[1], "˙")
403
+ elif word.startswith("第一"):
404
+ bopomofos[1] = change_tone(bopomofos[1], "")
405
+ elif word.startswith("一月") or word.startswith("一日") or word.startswith("一号"):
406
+ bopomofos[0] = change_tone(bopomofos[0], "")
407
+ else:
408
+ for i, char in enumerate(word):
409
+ if char == "一" and i + 1 < len(word):
410
+ if (
411
+ len(bopomofos) > i + 1
412
+ and len(bopomofos[i + 1]) > 0
413
+ and bopomofos[i + 1][-1] in {"ˋ"}
414
+ ):
415
+ bopomofos[i] = change_tone(bopomofos[i], "ˊ")
416
+ else:
417
+ if word[i + 1] not in punc:
418
+ bopomofos[i] = change_tone(bopomofos[i], "ˋ")
419
+ else:
420
+ pass
421
+ return bopomofos
422
+
423
+
424
+ def merge_bu(seg: List) -> List:
425
+ new_seg = []
426
+ last_word = ""
427
+ for word in seg:
428
+ if word != "不":
429
+ if last_word == "不":
430
+ word = last_word + word
431
+ new_seg.append(word)
432
+ last_word = word
433
+ return new_seg
434
+
435
+
436
+ def merge_er(seg: List) -> List:
437
+ new_seg = []
438
+ for i, word in enumerate(seg):
439
+ if i - 1 >= 0 and word == "儿":
440
+ new_seg[-1] = new_seg[-1] + seg[i]
441
+ else:
442
+ new_seg.append(word)
443
+ return new_seg
444
+
445
+
446
+ def merge_yi(seg: List) -> List:
447
+ new_seg = []
448
+ # function 1
449
+ for i, word in enumerate(seg):
450
+ if (
451
+ i - 1 >= 0
452
+ and word == "一"
453
+ and i + 1 < len(seg)
454
+ and seg[i - 1] == seg[i + 1]
455
+ ):
456
+ if i - 1 < len(new_seg):
457
+ new_seg[i - 1] = new_seg[i - 1] + "一" + new_seg[i - 1]
458
+ else:
459
+ new_seg.append(word)
460
+ new_seg.append(seg[i + 1])
461
+ else:
462
+ if i - 2 >= 0 and seg[i - 1] == "一" and seg[i - 2] == word:
463
+ continue
464
+ else:
465
+ new_seg.append(word)
466
+ seg = new_seg
467
+ new_seg = []
468
+ isnumeric_flag = False
469
+ for i, word in enumerate(seg):
470
+ if all([item.isnumeric() for item in word]) and not isnumeric_flag:
471
+ isnumeric_flag = True
472
+ new_seg.append(word)
473
+ else:
474
+ new_seg.append(word)
475
+ seg = new_seg
476
+ new_seg = []
477
+ # function 2
478
+ for i, word in enumerate(seg):
479
+ if new_seg and new_seg[-1] == "一":
480
+ new_seg[-1] = new_seg[-1] + word
481
+ else:
482
+ new_seg.append(word)
483
+ return new_seg
484
+
485
+
486
+ # Word Segmentation, and convert Chinese pronunciation to pinyin (bopomofo)
487
+ def chinese_to_bopomofo(text_short, sentence):
488
+ # bopomofos = conv(text_short)
489
+ words = jieba.lcut(text_short, cut_all=False)
490
+ words = merge_yi(words)
491
+ words = merge_bu(words)
492
+ words = merge_er(words)
493
+ text = ""
494
+
495
+ char_index = 0
496
+ for word in words:
497
+ bopomofos = []
498
+ if word in word_pinyin_dict and word not in poly_dict:
499
+ pinyin = word_pinyin_dict[word]
500
+ for py in pinyin.split(" "):
501
+ if py[:-1] in pinyin_2_bopomofo_dict and py[-1] in tone_dict:
502
+ bopomofos.append(
503
+ pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
504
+ )
505
+ if BLANK_LEVEL == 1:
506
+ bopomofos.append("_")
507
+ else:
508
+ bopomofos_lazy = lazy_pinyin(word, BOPOMOFO)
509
+ bopomofos += bopomofos_lazy
510
+ if BLANK_LEVEL == 1:
511
+ bopomofos.append("_")
512
+ else:
513
+ for i in range(len(word)):
514
+ c = word[i]
515
+ if c in poly_dict:
516
+ poly_pinyin = g2pw_poly_predict.predict_process(
517
+ [text_short, char_index + i]
518
+ )[0]
519
+ py = poly_pinyin[2:-1]
520
+ bopomofos.append(
521
+ pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
522
+ )
523
+ if BLANK_LEVEL == 1:
524
+ bopomofos.append("_")
525
+ elif c in word_pinyin_dict:
526
+ py = word_pinyin_dict[c]
527
+ bopomofos.append(
528
+ pinyin_2_bopomofo_dict[py[:-1]] + tone_dict[py[-1]]
529
+ )
530
+ if BLANK_LEVEL == 1:
531
+ bopomofos.append("_")
532
+ else:
533
+ bopomofos.append(c)
534
+ if BLANK_LEVEL == 1:
535
+ bopomofos.append("_")
536
+ if BLANK_LEVEL == 2:
537
+ bopomofos.append("_")
538
+ char_index += len(word)
539
+
540
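+ # Third-tone sandhi: when two or three consecutive syllables all carry the dipping tone "ˇ", the leading syllables are raised to the rising tone "ˊ".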
+ if (
541
+ len(word) == 3
542
+ and bopomofos[0][-1] == "ˇ"
543
+ and bopomofos[1][-1] == "ˇ"
544
+ and bopomofos[-1][-1] == "ˇ"
545
+ ):
546
+ bopomofos[0] = bopomofos[0] + "ˊ"
547
+ bopomofos[1] = bopomofos[1] + "ˊ"
548
+ if len(word) == 2 and bopomofos[0][-1] == "ˇ" and bopomofos[-1][-1] == "ˇ":
549
+ bopomofos[0] = bopomofos[0][:-1] + "ˊ"
550
+ bopomofos = bu_sandhi(word, bopomofos)
551
+ bopomofos = yi_sandhi(word, bopomofos)
552
+ bopomofos = er_sandhi(word, bopomofos)
553
+ if not re.search("[\u4e00-\u9fff]", word):
554
+ text += "|" + word
555
+ continue
556
+ for i in range(len(bopomofos)):
557
+ bopomofos[i] = re.sub(r"([\u3105-\u3129])$", r"\1ˉ", bopomofos[i])
558
+ if text != "":
559
+ text += "|"
560
+ text += "|".join(bopomofos)
561
+ return text
562
+
563
+
564
+ # Convert latin pronunciation to pinyin (bopomofo)
565
+ def latin_to_bopomofo(text):
566
+ for regex, replacement in _latin_to_bopomofo:
567
+ text = re.sub(regex, replacement, text)
568
+ return text
569
+
570
+
571
+ # Convert pinyin (bopomofo) to IPA
572
+ def bopomofo_to_ipa(text):
573
+ for regex, replacement in _bopomofo_to_ipa:
574
+ text = re.sub(regex, replacement, text)
575
+ return text
576
+
577
+
578
+ def _chinese_to_ipa(text, sentence):
579
+ text = number_to_chinese(text.strip())
580
+ text = normalization(text)
581
+ text = chinese_to_bopomofo(text, sentence)
582
+ # pinyin = bpmf_to_pinyin(text)
583
+ text = latin_to_bopomofo(text)
584
+ text = bopomofo_to_ipa(text)
585
+ text = re.sub("([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
586
+ text = re.sub("([s][⁼ʰ]?)([→↓↑ ]+|$)", r"\1ɹ\2", text)
587
+ text = re.sub(r"^\||[^\w\s_,\.\?!;:\'…\|→↓↑⁼ʰ`]", "", text)
588
+ text = re.sub(r"([,\.\?!;:\'…])", r"|\1|", text)
589
+ text = re.sub(r"\|+", "|", text)
590
+ text = text.rstrip("|")
591
+ return text
592
+
593
+
594
+ # Convert Chinese to IPA
595
+ def chinese_to_ipa(text, sentence, text_tokenizer):
596
+ # phonemes = text_tokenizer(text.strip())
597
+ if type(text) == str:
598
+ return _chinese_to_ipa(text, sentence)
599
+ else:
600
+ result_ph = []
601
+ for t in text:
602
+ result_ph.append(_chinese_to_ipa(t, sentence))
603
+ return result_ph
src/YingMusicSinger/utils/f5_tts/g2p/g2p/text_tokenizers.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+ from typing import List, Union
8
+
9
+ from phonemizer.backend import EspeakBackend
10
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
11
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
12
+ from phonemizer.separator import Separator
13
+ from phonemizer.utils import list2str, str2list
14
+
15
+
16
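+ # Thin wrapper around phonemizer's espeak backend: phones are joined with "|" and word boundaries marked "|_|", the separators the rest of the g2p pipeline splits on.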
+ class TextTokenizer:
17
+ """Phonemize Text."""
18
+
19
+ def __init__(
20
+ self,
21
+ language="en-us",
22
+ backend="espeak",
23
+ separator=Separator(word="|_|", syllable="-", phone="|"),
24
+ preserve_punctuation=True,
25
+ with_stress: bool = False,
26
+ tie: Union[bool, str] = False,
27
+ language_switch: LanguageSwitch = "remove-flags",
28
+ words_mismatch: WordMismatch = "ignore",
29
+ ) -> None:
30
+ self.preserve_punctuation_marks = ",.?!;:'…"
31
+ self.backend = EspeakBackend(
32
+ language,
33
+ punctuation_marks=self.preserve_punctuation_marks,
34
+ preserve_punctuation=preserve_punctuation,
35
+ with_stress=with_stress,
36
+ tie=tie,
37
+ language_switch=language_switch,
38
+ words_mismatch=words_mismatch,
39
+ )
40
+
41
+ self.separator = separator
42
+
43
+ # convert chinese punctuation to english punctuation
44
+ def convert_chinese_punctuation(self, text: str) -> str:
45
+ text = text.replace(",", ",")
46
+ text = text.replace("。", ".")
47
+ text = text.replace("!", "!")
48
+ text = text.replace("?", "?")
49
+ text = text.replace(";", ";")
50
+ text = text.replace(":", ":")
51
+ text = text.replace("、", ",")
52
+ text = text.replace("‘", "'")
53
+ text = text.replace("’", "'")
54
+ text = text.replace("⋯", "…")
55
+ text = text.replace("···", "…")
56
+ text = text.replace("・・・", "…")
57
+ text = text.replace("...", "…")
58
+ return text
59
+
60
+ def __call__(self, text, strip=True) -> List[str]:
61
+ text_type = type(text)
62
+ normalized_text = []
63
+ for line in str2list(text):
64
+ line = self.convert_chinese_punctuation(line.strip())
65
+ line = re.sub(r"[^\w\s_,\.\?!;:\'…]", "", line)
66
+ line = re.sub(r"\s*([,\.\?!;:\'…])\s*", r"\1", line)
67
+ line = re.sub(r"\s+", " ", line)
68
+ normalized_text.append(line)
69
+ # print("Normalized test: ", normalized_text[0])
70
+ phonemized = self.backend.phonemize(
71
+ normalized_text, separator=self.separator, strip=strip, njobs=1
72
+ )
73
+ if text_type == str:
74
+ phonemized = re.sub(r"([,\.\?!;:\'…])", r"|\1|", list2str(phonemized))
75
+ phonemized = re.sub(r"\|+", "|", phonemized)
76
+ phonemized = phonemized.rstrip("|")
77
+ else:
78
+ for i in range(len(phonemized)):
79
+ phonemized[i] = re.sub(r"([,\.\?!;:\'…])", r"|\1|", phonemized[i])
80
+ phonemized[i] = re.sub(r"\|+", "|", phonemized[i])
81
+ phonemized[i] = phonemized[i].rstrip("|")
82
+ return phonemized
src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json ADDED
@@ -0,0 +1,372 @@
1
+ {
2
+ "vocab": {
3
+ ",": 0,
4
+ ".": 1,
5
+ "?": 2,
6
+ "!": 3,
7
+ "_": 4,
8
+ "iː": 5,
9
+ "ɪ": 6,
10
+ "ɜː": 7,
11
+ "ɚ": 8,
12
+ "oːɹ": 9,
13
+ "ɔː": 10,
14
+ "ɔːɹ": 11,
15
+ "ɑː": 12,
16
+ "uː": 13,
17
+ "ʊ": 14,
18
+ "ɑːɹ": 15,
19
+ "ʌ": 16,
20
+ "ɛ": 17,
21
+ "æ": 18,
22
+ "eɪ": 19,
23
+ "aɪ": 20,
24
+ "ɔɪ": 21,
25
+ "aʊ": 22,
26
+ "oʊ": 23,
27
+ "ɪɹ": 24,
28
+ "ɛɹ": 25,
29
+ "ʊɹ": 26,
30
+ "p": 27,
31
+ "b": 28,
32
+ "t": 29,
33
+ "d": 30,
34
+ "k": 31,
35
+ "ɡ": 32,
36
+ "f": 33,
37
+ "v": 34,
38
+ "θ": 35,
39
+ "ð": 36,
40
+ "s": 37,
41
+ "z": 38,
42
+ "ʃ": 39,
43
+ "ʒ": 40,
44
+ "h": 41,
45
+ "tʃ": 42,
46
+ "dʒ": 43,
47
+ "m": 44,
48
+ "n": 45,
49
+ "ŋ": 46,
50
+ "j": 47,
51
+ "w": 48,
52
+ "ɹ": 49,
53
+ "l": 50,
54
+ "tɹ": 51,
55
+ "dɹ": 52,
56
+ "ts": 53,
57
+ "dz": 54,
58
+ "i": 55,
59
+ "ɔ": 56,
60
+ "ə": 57,
61
+ "ɾ": 58,
62
+ "iə": 59,
63
+ "r": 60,
64
+ "u": 61,
65
+ "oː": 62,
66
+ "ɛː": 63,
67
+ "ɪː": 64,
68
+ "aɪə": 65,
69
+ "aɪɚ": 66,
70
+ "ɑ̃": 67,
71
+ "ç": 68,
72
+ "ɔ̃": 69,
73
+ "ææ": 70,
74
+ "ɐɐ": 71,
75
+ "ɡʲ": 72,
76
+ "nʲ": 73,
77
+ "iːː": 74,
78
+
79
+ "p⁼": 75,
80
+ "pʰ": 76,
81
+ "t⁼": 77,
82
+ "tʰ": 78,
83
+ "k⁼": 79,
84
+ "kʰ": 80,
85
+ "x": 81,
86
+ "tʃ⁼": 82,
87
+ "tʃʰ": 83,
88
+ "ts`⁼": 84,
89
+ "ts`ʰ": 85,
90
+ "s`": 86,
91
+ "ɹ`": 87,
92
+ "ts⁼": 88,
93
+ "tsʰ": 89,
94
+ "p⁼wo": 90,
95
+ "p⁼wo→": 91,
96
+ "p⁼wo↑": 92,
97
+ "p⁼wo↓↑": 93,
98
+ "p⁼wo↓": 94,
99
+ "pʰwo": 95,
100
+ "pʰwo→": 96,
101
+ "pʰwo↑": 97,
102
+ "pʰwo↓↑": 98,
103
+ "pʰwo↓": 99,
104
+ "mwo": 100,
105
+ "mwo→": 101,
106
+ "mwo↑": 102,
107
+ "mwo↓↑": 103,
108
+ "mwo↓": 104,
109
+ "fwo": 105,
110
+ "fwo→": 106,
111
+ "fwo↑": 107,
112
+ "fwo↓↑": 108,
113
+ "fwo↓": 109,
114
+ "jɛn": 110,
115
+ "jɛn→": 111,
116
+ "jɛn↑": 112,
117
+ "jɛn↓↑": 113,
118
+ "jɛn↓": 114,
119
+ "ɥæn": 115,
120
+ "ɥæn→": 116,
121
+ "ɥæn↑": 117,
122
+ "ɥæn↓↑": 118,
123
+ "ɥæn↓": 119,
124
+ "in": 120,
125
+ "in→": 121,
126
+ "in↑": 122,
127
+ "in↓↑": 123,
128
+ "in↓": 124,
129
+ "ɥn": 125,
130
+ "ɥn→": 126,
131
+ "ɥn↑": 127,
132
+ "ɥn↓↑": 128,
133
+ "ɥn↓": 129,
134
+ "iŋ": 130,
135
+ "iŋ→": 131,
136
+ "iŋ↑": 132,
137
+ "iŋ↓↑": 133,
138
+ "iŋ↓": 134,
139
+ "ʊŋ": 135,
140
+ "ʊŋ→": 136,
141
+ "ʊŋ↑": 137,
142
+ "ʊŋ↓↑": 138,
143
+ "ʊŋ↓": 139,
144
+ "jʊŋ": 140,
145
+ "jʊŋ→": 141,
146
+ "jʊŋ↑": 142,
147
+ "jʊŋ↓↑": 143,
148
+ "jʊŋ↓": 144,
149
+ "ia": 145,
150
+ "ia→": 146,
151
+ "ia↑": 147,
152
+ "ia↓↑": 148,
153
+ "ia↓": 149,
154
+ "iɛ": 150,
155
+ "iɛ→": 151,
156
+ "iɛ↑": 152,
157
+ "iɛ↓↑": 153,
158
+ "iɛ↓": 154,
159
+ "iɑʊ": 155,
160
+ "iɑʊ→": 156,
161
+ "iɑʊ↑": 157,
162
+ "iɑʊ↓↑": 158,
163
+ "iɑʊ↓": 159,
164
+ "ioʊ": 160,
165
+ "ioʊ→": 161,
166
+ "ioʊ↑": 162,
167
+ "ioʊ↓↑": 163,
168
+ "ioʊ↓": 164,
169
+ "iɑŋ": 165,
170
+ "iɑŋ→": 166,
171
+ "iɑŋ↑": 167,
172
+ "iɑŋ↓↑": 168,
173
+ "iɑŋ↓": 169,
174
+ "ua": 170,
175
+ "ua→": 171,
176
+ "ua↑": 172,
177
+ "ua↓↑": 173,
178
+ "ua↓": 174,
179
+ "uo": 175,
180
+ "uo→": 176,
181
+ "uo↑": 177,
182
+ "uo↓↑": 178,
183
+ "uo↓": 179,
184
+ "uaɪ": 180,
185
+ "uaɪ→": 181,
186
+ "uaɪ↑": 182,
187
+ "uaɪ↓↑": 183,
188
+ "uaɪ↓": 184,
189
+ "ueɪ": 185,
190
+ "ueɪ→": 186,
191
+ "ueɪ↑": 187,
192
+ "ueɪ↓↑": 188,
193
+ "ueɪ↓": 189,
194
+ "uan": 190,
195
+ "uan→": 191,
196
+ "uan↑": 192,
197
+ "uan↓↑": 193,
198
+ "uan↓": 194,
199
+ "uən": 195,
200
+ "uən→": 196,
201
+ "uən↑": 197,
202
+ "uən↓↑": 198,
203
+ "uən↓": 199,
204
+ "uɑŋ": 200,
205
+ "uɑŋ→": 201,
206
+ "uɑŋ↑": 202,
207
+ "uɑŋ↓↑": 203,
208
+ "uɑŋ↓": 204,
209
+ "ɥɛ": 205,
210
+ "ɥɛ→": 206,
211
+ "ɥɛ↑": 207,
212
+ "ɥɛ↓↑": 208,
213
+ "ɥɛ↓": 209,
214
+ "a": 210,
215
+ "a→": 211,
216
+ "a↑": 212,
217
+ "a↓↑": 213,
218
+ "a↓": 214,
219
+ "o": 215,
220
+ "o→": 216,
221
+ "o↑": 217,
222
+ "o↓↑": 218,
223
+ "o↓": 219,
224
+ "ə→": 220,
225
+ "ə↑": 221,
226
+ "ə↓↑": 222,
227
+ "ə↓": 223,
228
+ "ɛ→": 224,
229
+ "ɛ↑": 225,
230
+ "ɛ↓↑": 226,
231
+ "ɛ↓": 227,
232
+ "aɪ→": 228,
233
+ "aɪ↑": 229,
234
+ "aɪ↓↑": 230,
235
+ "aɪ↓": 231,
236
+ "eɪ→": 232,
237
+ "eɪ↑": 233,
238
+ "eɪ↓↑": 234,
239
+ "eɪ↓": 235,
240
+ "ɑʊ": 236,
241
+ "ɑʊ→": 237,
242
+ "ɑʊ↑": 238,
243
+ "ɑʊ↓↑": 239,
244
+ "ɑʊ↓": 240,
245
+ "oʊ→": 241,
246
+ "oʊ↑": 242,
247
+ "oʊ↓↑": 243,
248
+ "oʊ↓": 244,
249
+ "an": 245,
250
+ "an→": 246,
251
+ "an↑": 247,
252
+ "an↓↑": 248,
253
+ "an↓": 249,
254
+ "ən": 250,
255
+ "ən→": 251,
256
+ "ən↑": 252,
257
+ "ən↓↑": 253,
258
+ "ən↓": 254,
259
+ "ɑŋ": 255,
260
+ "ɑŋ→": 256,
261
+ "ɑŋ↑": 257,
262
+ "ɑŋ↓↑": 258,
263
+ "ɑŋ↓": 259,
264
+ "əŋ": 260,
265
+ "əŋ→": 261,
266
+ "əŋ↑": 262,
267
+ "əŋ↓↑": 263,
268
+ "əŋ↓": 264,
269
+ "əɹ": 265,
270
+ "əɹ→": 266,
271
+ "əɹ↑": 267,
272
+ "əɹ↓↑": 268,
273
+ "əɹ↓": 269,
274
+ "i→": 270,
275
+ "i↑": 271,
276
+ "i↓↑": 272,
277
+ "i↓": 273,
278
+ "u→": 274,
279
+ "u↑": 275,
280
+ "u↓↑": 276,
281
+ "u↓": 277,
282
+ "ɥ": 278,
283
+ "ɥ→": 279,
284
+ "ɥ↑": 280,
285
+ "ɥ↓↑": 281,
286
+ "ɥ↓": 282,
287
+ "ts`⁼ɹ": 283,
288
+ "ts`⁼ɹ→": 284,
289
+ "ts`⁼ɹ↑": 285,
290
+ "ts`⁼ɹ↓↑": 286,
291
+ "ts`⁼ɹ↓": 287,
292
+ "ts`ʰɹ": 288,
293
+ "ts`ʰɹ→": 289,
294
+ "ts`ʰɹ↑": 290,
295
+ "ts`ʰɹ↓↑": 291,
296
+ "ts`ʰɹ↓": 292,
297
+ "s`ɹ": 293,
298
+ "s`ɹ→": 294,
299
+ "s`ɹ↑": 295,
300
+ "s`ɹ↓↑": 296,
301
+ "s`ɹ↓": 297,
302
+ "ɹ`ɹ": 298,
303
+ "ɹ`ɹ→": 299,
304
+ "ɹ`ɹ↑": 300,
305
+ "ɹ`ɹ↓↑": 301,
306
+ "ɹ`ɹ↓": 302,
307
+ "ts⁼ɹ": 303,
308
+ "ts⁼ɹ→": 304,
309
+ "ts⁼ɹ↑": 305,
310
+ "ts⁼ɹ↓↑": 306,
311
+ "ts⁼ɹ↓": 307,
312
+ "tsʰɹ": 308,
313
+ "tsʰɹ→": 309,
314
+ "tsʰɹ↑": 310,
315
+ "tsʰɹ↓↑": 311,
316
+ "tsʰɹ↓": 312,
317
+ "sɹ": 313,
318
+ "sɹ→": 314,
319
+ "sɹ↑": 315,
320
+ "sɹ↓↑": 316,
321
+ "sɹ↓": 317,
322
+
323
+ "ɯ": 318,
324
+ "e": 319,
325
+ "aː": 320,
326
+ "ɯː": 321,
327
+ "eː": 322,
328
+ "ç": 323,
329
+ "ɸ": 324,
330
+ "ɰᵝ": 325,
331
+ "ɴ": 326,
332
+ "g": 327,
333
+ "dʑ": 328,
334
+ "q": 329,
335
+ "ː": 330,
336
+ "bj": 331,
337
+ "tɕ": 332,
338
+ "dej": 333,
339
+ "tej": 334,
340
+ "gj": 335,
341
+ "gɯ": 336,
342
+ "çj": 337,
343
+ "kj": 338,
344
+ "kɯ": 339,
345
+ "mj": 340,
346
+ "nj": 341,
347
+ "pj": 342,
348
+ "ɾj": 343,
349
+ "ɕ": 344,
350
+ "tsɯ": 345,
351
+
352
+ "ɐ": 346,
353
+ "ɑ": 347,
354
+ "ɒ": 348,
355
+ "ɜ": 349,
356
+ "ɫ": 350,
357
+ "ʑ": 351,
358
+ "ʲ": 352,
359
+
360
+ "y": 353,
361
+ "ø": 354,
362
+ "œ": 355,
363
+ "ʁ": 356,
364
+ "̃": 357,
365
+ "ɲ": 358,
366
+
367
+ ":": 359,
368
+ ";": 360,
369
+ "'": 361,
370
+ "…": 362
371
+ }
372
+ }
src/YingMusicSinger/utils/f5_tts/g2p/g2p_generation.py ADDED
@@ -0,0 +1,129 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import json
8
+ from typing import List
9
+
10
+ from src.YingMusicSinger.utils.f5_tts.g2p.g2p import PhonemeBpeTokenizer
11
+ from src.YingMusicSinger.utils.f5_tts.g2p.utils.g2p import phonemizer_g2p
12
+
13
+
14
+ def ph_g2p(text, language):
15
+ return phonemizer_g2p(text=text, language=language)
16
+
17
+
18
+ def g2p(text, sentence, language):
19
+ return text_tokenizer.tokenize(text=text, sentence=sentence, language=language)
20
+
21
+
22
+ def is_chinese(char):
23
+ if char >= "\u4e00" and char <= "\u9fa5":
24
+ return True
25
+ else:
26
+ return False
27
+
28
+
29
+ def is_alphabet(char):
30
+ if (char >= "\u0041" and char <= "\u005a") or (
31
+ char >= "\u0061" and char <= "\u007a"
32
+ ):
33
+ return True
34
+ else:
35
+ return False
36
+
37
+
38
+ def is_other(char):
39
+ if not (is_chinese(char) or is_alphabet(char)):
40
+ return True
41
+ else:
42
+ return False
43
+
44
+
45
+ def get_segment(text: str) -> List[str]:
46
+ # sentence --> [ch_part, en_part, ch_part, ...]
47
+ segments = []
48
+ types = []
49
+ flag = 0
50
+ temp_seg = ""
51
+ temp_lang = ""
52
+
53
+ # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
54
+ for i, ch in enumerate(text):
55
+ if is_chinese(ch):
56
+ types.append("zh")
57
+ elif is_alphabet(ch):
58
+ types.append("en")
59
+ else:
60
+ types.append("other")
61
+
62
+ assert len(types) == len(text)
63
+
64
+ for i in range(len(types)):
65
+ # find the first char of the seg
66
+ if flag == 0:
67
+ temp_seg += text[i]
68
+ temp_lang = types[i]
69
+ flag = 1
70
+ else:
71
+ if temp_lang == "other":
72
+ if types[i] == temp_lang:
73
+ temp_seg += text[i]
74
+ else:
75
+ temp_seg += text[i]
76
+ temp_lang = types[i]
77
+ else:
78
+ if types[i] == temp_lang:
79
+ temp_seg += text[i]
80
+ elif types[i] == "other":
81
+ temp_seg += text[i]
82
+ else:
83
+ segments.append((temp_seg, temp_lang))
84
+ temp_seg = text[i]
85
+ temp_lang = types[i]
86
+ flag = 1
87
+
88
+ segments.append((temp_seg, temp_lang))
89
+ return segments
90
+
91
+
92
+ def chn_eng_g2p(text: str):
93
+ # now only en and ch
94
+ segments = get_segment(text)
95
+ all_phoneme = ""
96
+ all_tokens = []
97
+
98
+ for index in range(len(segments)):
99
+ seg = segments[index]
100
+ phoneme, token = g2p(seg[0], text, seg[1])
101
+ all_phoneme += phoneme + "|"
102
+ all_tokens += token
103
+
104
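+ # If the final segment is English, drop the trailing word-boundary "_" (and its token) that espeak leaves at the end of the phoneme string.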
+ if seg[1] == "en" and index == len(segments) - 1 and all_phoneme[-2] == "_":
105
+ all_phoneme = all_phoneme[:-2]
106
+ all_tokens = all_tokens[:-1]
107
+ return all_phoneme, all_tokens
108
+
109
+
110
+ text_tokenizer = PhonemeBpeTokenizer()
111
+ with open("./src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json", "r") as f:
112
+ json_data = f.read()
113
+ data = json.loads(json_data)
114
+ vocab = data["vocab"]
115
+
116
+ if __name__ == "__main__":
117
+ phone, token = chn_eng_g2p("你好,hello world")
118
+ phone, token = chn_eng_g2p(
119
+ "你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑"
120
+ )
121
+ print(phone)
122
+ print(token)
123
+
124
+ # phone, token = text_tokenizer.tokenize("你好,hello world, Bonjour, 테스트 해 보겠습니다, 五月雨緑", "", "auto")
125
+ phone, token = text_tokenizer.tokenize("緑", "", "auto")
126
+ # phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "auto")
127
+ # phone, token = text_tokenizer.tokenize("आइए इसका परीक्षण करें", "", "other")
128
+ print(phone)
129
+ print(token)
src/YingMusicSinger/utils/f5_tts/g2p/infer_dpo.py ADDED
@@ -0,0 +1,277 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import random
5
+
6
+ import numpy as np
7
+ import torch
8
+ from f5_tts.infer.utils_infer import load_checkpoint
9
+ from f5_tts.model import CFM, DiT
10
+ from f5_tts.model.alsp_lance.data.npydata import FloatData
11
+ from f5_tts.model.alsp_lance.tools import LanceReader, LanceWriter
12
+ from tqdm import tqdm
13
+
14
+ filter_keyword_list = [
15
+ "纯音乐",
16
+ "编曲",
17
+ "作词",
18
+ "作曲",
19
+ "调音",
20
+ "制作人",
21
+ "录音师",
22
+ ]
23
+
24
+ filter_full_list = ["music", "end"]
25
+
26
+
27
+ def check_lyric(time: float, lyric: str):
28
+ if time < 0.1:
29
+ return False
30
+ for filter_keyword in filter_keyword_list:
31
+ if filter_keyword in lyric:
32
+ return False
33
+ for filter_full in filter_full_list:
34
+ if filter_full == lyric.strip().lower():
35
+ return False
36
+ if len(lyric) == 0:
37
+ return False
38
+ return True
39
+
40
+
41
+ def parse_lyrics(lyrics: str):
42
+ lyrics_with_time = []
43
+ lyrics = lyrics.strip()
44
+ for line in lyrics.split("\n"):
45
+ try:
46
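+ # Expect standard LRC lines of the form "[mm:ss.xx]lyric": characters 1-8 are the timestamp, everything from position 10 on is the lyric text.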
+ time, lyric = line[1:9], line[10:]
47
+ lyric = lyric.strip()
48
+ mins, secs = time.split(":")
49
+ secs = int(mins) * 60 + float(secs)
50
+ # print(lyric, check_lyric(secs, lyric))
51
+ if not check_lyric(secs, lyric):
52
+ continue
53
+ lyrics_with_time.append((secs, lyric))
54
+ except:
55
+ # traceback.print_exc()
56
+ continue
57
+ # print("error", line)
58
+ return lyrics_with_time
59
+
60
+
61
+ class CNENTokenizer:
62
+ def __init__(self):
63
+ with open("./src/YingMusicSinger/utils/f5_tts/g2p/g2p/vocab.json", "r") as file:
64
+ self.phone2id: dict = json.load(file)["vocab"]
65
+ self.id2phone = {v: k for (k, v) in self.phone2id.items()}
66
+ from f5_tts.g2p.g2p_generation import chn_eng_g2p
67
+
68
+ self.tokenizer = chn_eng_g2p
69
+
70
+ def encode(self, text):
71
+ phone, token = self.tokenizer(text)
72
+ token = [x + 1 for x in token]
73
+ return token
74
+
75
+ def decode(self, token):
76
+ return "|".join([self.id2phone[x - 1] for x in token])
77
+
78
+
79
+ def inference(
+ model,
+ cond,
+ text,
+ duration,
+ style_prompt,
+ output_dir,
+ start_time,
+ latent_pred_start_frame,
+ latent_pred_end_frame,
+ cfg_strength,
+ # the metadata arguments below are unused here and optional, so the call in __main__ (which omits them) still runs
+ style=None,
+ song_name=None,
+ ckpt_step=None,
+ epoch=None,
+ ):
95
+ # import pdb; pdb.set_trace()
96
+ with torch.inference_mode():
97
+ generated, _ = model.sample(
98
+ cond=cond,
99
+ text=text,
100
+ duration=duration,
101
+ style_prompt=style_prompt,
102
+ steps=32,
103
+ cfg_strength=cfg_strength,
104
+ sway_sampling_coef=None,
105
+ start_time=start_time,
106
+ latent_pred_start_frame=latent_pred_start_frame,
107
+ latent_pred_end_frame=latent_pred_end_frame,
108
+ )
109
+
110
+ generated = generated.to(torch.float32) # [b t d]
111
+ latent = generated.transpose(1, 2) # [b d t]
112
+ latent = latent.detach().cpu().numpy()
113
+
114
+ return latent
115
+
116
+
117
+ def get_style_prompt(device, song_name, song_name2ref_npy):
118
+ mulan_style_path = song_name2ref_npy[song_name]
119
+ mulan_style = np.load(mulan_style_path)
120
+
121
+ style_prompt = torch.from_numpy(mulan_style).to(device) # [1, 512]
122
+ style_prompt = style_prompt.half()
123
+
124
+ return style_prompt
125
+
126
+
127
+ def get_lrc_prompt(text, tokenizer, dit_model, max_secs):
128
+ max_frames = 2048
129
+ lyrics_shift = 2
130
+ sampling_rate = 44100
131
+ downsample_rate = 2048
132
+
133
+ pad_token_id = 0
134
+ comma_token_id = 1
135
+ period_token_id = 2
136
+
137
+ fsmin = -10
138
+ fsmax = 10
139
+
140
+ lrc_with_time = parse_lyrics(text)
141
+
142
+ modified_lrc_with_time = []
143
+ for i in range(len(lrc_with_time)):
144
+ time, line = lrc_with_time[i]
145
+ # line_token = self.tokenizer.encode(line)
146
+ line_token = tokenizer.encode(line)
147
+ modified_lrc_with_time.append((time, line_token))
148
+
149
+ lrc_with_time = modified_lrc_with_time
150
+
151
+ lrc_with_time = [
152
+ (time_start, line)
153
+ for (time_start, line) in lrc_with_time
154
+ if time_start < max_secs
155
+ ]
156
+ # latent_end_time = lrc_with_time[-1][0] if len(lrc_with_time) >= 1 else -1
157
+ lrc_with_time = lrc_with_time[:-1] if len(lrc_with_time) >= 1 else lrc_with_time
158
+
159
+ normalized_start_time = 0.0
160
+
161
+ lrc = torch.zeros((max_frames,), dtype=torch.long)
162
+
163
+ tokens_count = 0
164
+ last_end_pos = 0
165
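+ # Write each lyric line's tokens at the latent frame matching its timestamp (44100 Hz audio, 2048-sample hop), jittered by a small random shift and clamped so lines never overlap the previous one.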
+ for time_start, line in lrc_with_time:
166
+ tokens = [
167
+ token if token != period_token_id else comma_token_id for token in line
168
+ ] + [period_token_id]
169
+ tokens = torch.tensor(tokens, dtype=torch.long)
170
+ num_tokens = tokens.shape[0]
171
+
172
+ gt_frame_start = int(time_start * sampling_rate / downsample_rate)
173
+
174
+ frame_shift = random.randint(int(fsmin), int(fsmax))
175
+
176
+ frame_start = max(gt_frame_start - frame_shift, last_end_pos)
177
+ frame_len = min(num_tokens, max_frames - frame_start)
178
+
179
+ # print(gt_frame_start, frame_shift, frame_start, frame_len, tokens_count, last_end_pos, full_pos_emb.shape)
180
+
181
+ lrc[frame_start : frame_start + frame_len] = tokens[:frame_len]
182
+
183
+ tokens_count += num_tokens
184
+ last_end_pos = frame_start + frame_len
185
+
186
+ lrc_emb = lrc.unsqueeze(0).to(dit_model.device)
187
+
188
+ normalized_start_time = (
189
+ torch.tensor(normalized_start_time).unsqueeze(0).to(dit_model.device)
190
+ )
191
+
192
+ return lrc_emb, normalized_start_time
193
+
194
+
195
+ if __name__ == "__main__":
196
+ parser = argparse.ArgumentParser()
197
+
198
+ parser.add_argument("--model-config", type=str, default=None)
199
+ parser.add_argument("--ckpt-path", type=str, default=None)
200
+ parser.add_argument("--output-dir", type=str, default=None) # lance
201
+ parser.add_argument("--lrc-path", type=str, default=None)
202
+ parser.add_argument("--mulan-style-path", type=str, default=None) # lance
203
+ parser.add_argument("--cfg-strength", type=float, default=None)
204
+
205
+ args = parser.parse_args()
206
+
207
+ lrc_path = args.lrc_path
208
+ cfg_strength = args.cfg_strength
209
+ style_path = args.mulan_style_path
210
+
211
+ with open(args.model_config) as f:
212
+ model_config = json.load(f)
213
+
214
+ model_cls = DiT
215
+ ckpt_path = args.ckpt_path
216
+ device = "cuda"
217
+ use_style_prompt = True
218
+ dit_model = CFM(
219
+ transformer=model_cls(
220
+ **model_config["model"], use_style_prompt=use_style_prompt
221
+ ),
222
+ num_channels=model_config["model"]["mel_dim"],
223
+ use_style_prompt=use_style_prompt,
224
+ )
225
+ dit_model = dit_model.to(device)
226
+ dit_model = load_checkpoint(dit_model, ckpt_path, device=device, use_ema=True)
227
+
228
+ lrc_tokenizer = CNENTokenizer()
229
+
230
+ sampling_rate = 44100
231
+ downsample_rate = 2048
232
+ max_frames = 2048
233
+ max_secs = max_frames / (sampling_rate / downsample_rate)
234
+
235
+ output_dir = args.output_dir
236
+ writer = LanceWriter(output_dir, target_cls=FloatData)
237
+
238
+ reader = LanceReader(style_path, target_cls=FloatData)
239
+
240
+ WRITE_INTERVAL = 500
241
+
242
+ latent_data = []
243
+ for id in tqdm(reader.get_ids()):
244
+ item = reader.get_datas_by_rowids(row_ids=[id._rowid])[0]
245
+ data_id = item.data_id
246
+ style_prompt = torch.from_numpy(item.data).to(device)
247
+ style_prompt = style_prompt.half()
248
+
249
+ item_lrc_path = os.path.join(args.lrc_path, f"{data_id}.lrc")
250
+ with open(item_lrc_path, "r") as f:
251
+ lrc = f.read()
252
+ lrc_prompt, start_time = get_lrc_prompt(lrc, lrc_tokenizer, dit_model, max_secs)
253
+
254
+ latent_prompt = torch.zeros(1, max_frames, 64).to(device)
255
+ sf = 0
256
+ ef = max_frames
257
+
258
+ generated_latent = inference(
259
+ model=dit_model,
260
+ cond=latent_prompt,
261
+ text=lrc_prompt,
262
+ duration=max_frames,
263
+ style_prompt=style_prompt,
264
+ output_dir=output_dir,
265
+ start_time=start_time,
266
+ latent_pred_start_frame=sf,
267
+ latent_pred_end_frame=ef,
268
+ cfg_strength=cfg_strength,
269
+ ) # [b d t] numpy
270
+
271
+ latent_data.append(generated_latent)
272
+
273
+ if len(latent_data) > WRITE_INTERVAL:
274
+ writer.write_parallel(latent_data)
275
+ latent_data = []
276
+
277
+ writer.write_parallel(latent_data)
src/YingMusicSinger/utils/f5_tts/g2p/sources/bpmf_2_pinyin.txt ADDED
@@ -0,0 +1,41 @@
1
+ b ㄅ
2
+ p ㄆ
3
+ m ㄇ
4
+ f ㄈ
5
+ d ㄉ
6
+ t ㄊ
7
+ n ㄋ
8
+ l ㄌ
9
+ g ㄍ
10
+ k ㄎ
11
+ h ㄏ
12
+ j ㄐ
13
+ q ㄑ
14
+ x ㄒ
15
+ zh ㄓ
16
+ ch ㄔ
17
+ sh ㄕ
18
+ r ㄖ
19
+ z ㄗ
20
+ c ㄘ
21
+ s ㄙ
22
+ i ㄧ
23
+ u ㄨ
24
+ v ㄩ
25
+ a ㄚ
26
+ o ㄛ
27
+ e ㄜ
28
+ e ㄝ
29
+ ai ㄞ
30
+ ei ㄟ
31
+ ao ㄠ
32
+ ou ㄡ
33
+ an ㄢ
34
+ en ㄣ
35
+ ang ㄤ
36
+ eng ㄥ
37
+ er ㄦ
38
+ 2 ˊ
39
+ 3 ˇ
40
+ 4 ˋ
41
+ 0 ˙
src/YingMusicSinger/utils/f5_tts/g2p/sources/chinese_lexicon.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3a7685d1c3e68eb2fa304bfc63e90c90c3c1a1948839a5b1b507b2131b3e2fb
3
+ size 14779443
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/config.json ADDED
@@ -0,0 +1,819 @@
1
+ {
2
+ "_name_or_path": "/BERT-POLY-v2/pretrained_models/mini_bert",
3
+ "architectures": [
4
+ "BertPoly"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 384,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2",
17
+ "3": "LABEL_3",
18
+ "4": "LABEL_4",
19
+ "5": "LABEL_5",
20
+ "6": "LABEL_6",
21
+ "7": "LABEL_7",
22
+ "8": "LABEL_8",
23
+ "9": "LABEL_9",
24
+ "10": "LABEL_10",
25
+ "11": "LABEL_11",
26
+ "12": "LABEL_12",
27
+ "13": "LABEL_13",
28
+ "14": "LABEL_14",
29
+ "15": "LABEL_15",
30
+ "16": "LABEL_16",
31
+ "17": "LABEL_17",
32
+ "18": "LABEL_18",
33
+ "19": "LABEL_19",
34
+ "20": "LABEL_20",
35
+ "21": "LABEL_21",
36
+ "22": "LABEL_22",
37
+ "23": "LABEL_23",
38
+ "24": "LABEL_24",
39
+ "25": "LABEL_25",
40
+ "26": "LABEL_26",
41
+ "27": "LABEL_27",
42
+ "28": "LABEL_28",
43
+ "29": "LABEL_29",
44
+ "30": "LABEL_30",
45
+ "31": "LABEL_31",
46
+ "32": "LABEL_32",
47
+ "33": "LABEL_33",
48
+ "34": "LABEL_34",
49
+ "35": "LABEL_35",
50
+ "36": "LABEL_36",
51
+ "37": "LABEL_37",
52
+ "38": "LABEL_38",
53
+ "39": "LABEL_39",
54
+ "40": "LABEL_40",
55
+ "41": "LABEL_41",
56
+ "42": "LABEL_42",
57
+ "43": "LABEL_43",
58
+ "44": "LABEL_44",
59
+ "45": "LABEL_45",
60
+ "46": "LABEL_46",
61
+ "47": "LABEL_47",
62
+ "48": "LABEL_48",
63
+ "49": "LABEL_49",
64
+ "50": "LABEL_50",
65
+ "51": "LABEL_51",
66
+ "52": "LABEL_52",
67
+ "53": "LABEL_53",
68
+ "54": "LABEL_54",
69
+ "55": "LABEL_55",
70
+ "56": "LABEL_56",
71
+ "57": "LABEL_57",
72
+ "58": "LABEL_58",
73
+ "59": "LABEL_59",
74
+ "60": "LABEL_60",
75
+ "61": "LABEL_61",
76
+ "62": "LABEL_62",
77
+ "63": "LABEL_63",
78
+ "64": "LABEL_64",
79
+ "65": "LABEL_65",
80
+ "66": "LABEL_66",
81
+ "67": "LABEL_67",
82
+ "68": "LABEL_68",
83
+ "69": "LABEL_69",
84
+ "70": "LABEL_70",
85
+ "71": "LABEL_71",
86
+ "72": "LABEL_72",
87
+ "73": "LABEL_73",
88
+ "74": "LABEL_74",
89
+ "75": "LABEL_75",
90
+ "76": "LABEL_76",
91
+ "77": "LABEL_77",
92
+ "78": "LABEL_78",
93
+ "79": "LABEL_79",
94
+ "80": "LABEL_80",
95
+ "81": "LABEL_81",
96
+ "82": "LABEL_82",
97
+ "83": "LABEL_83",
98
+ "84": "LABEL_84",
99
+ "85": "LABEL_85",
100
+ "86": "LABEL_86",
101
+ "87": "LABEL_87",
102
+ "88": "LABEL_88",
103
+ "89": "LABEL_89",
104
+ "90": "LABEL_90",
105
+ "91": "LABEL_91",
106
+ "92": "LABEL_92",
107
+ "93": "LABEL_93",
108
+ "94": "LABEL_94",
109
+ "95": "LABEL_95",
110
+ "96": "LABEL_96",
111
+ "97": "LABEL_97",
112
+ "98": "LABEL_98",
113
+ "99": "LABEL_99",
114
+ "100": "LABEL_100",
115
+ "101": "LABEL_101",
116
+ "102": "LABEL_102",
117
+ "103": "LABEL_103",
118
+ "104": "LABEL_104",
119
+ "105": "LABEL_105",
120
+ "106": "LABEL_106",
121
+ "107": "LABEL_107",
122
+ "108": "LABEL_108",
123
+ "109": "LABEL_109",
124
+ "110": "LABEL_110",
125
+ "111": "LABEL_111",
126
+ "112": "LABEL_112",
127
+ "113": "LABEL_113",
128
+ "114": "LABEL_114",
129
+ "115": "LABEL_115",
130
+ "116": "LABEL_116",
131
+ "117": "LABEL_117",
132
+ "118": "LABEL_118",
133
+ "119": "LABEL_119",
134
+ "120": "LABEL_120",
135
+ "121": "LABEL_121",
136
+ "122": "LABEL_122",
137
+ "123": "LABEL_123",
138
+ "124": "LABEL_124",
139
+ "125": "LABEL_125",
140
+ "126": "LABEL_126",
141
+ "127": "LABEL_127",
142
+ "128": "LABEL_128",
143
+ "129": "LABEL_129",
144
+ "130": "LABEL_130",
145
+ "131": "LABEL_131",
146
+ "132": "LABEL_132",
147
+ "133": "LABEL_133",
148
+ "134": "LABEL_134",
149
+ "135": "LABEL_135",
150
+ "136": "LABEL_136",
151
+ "137": "LABEL_137",
152
+ "138": "LABEL_138",
153
+ "139": "LABEL_139",
154
+ "140": "LABEL_140",
155
+ "141": "LABEL_141",
156
+ "142": "LABEL_142",
157
+ "143": "LABEL_143",
158
+ "144": "LABEL_144",
159
+ "145": "LABEL_145",
160
+ "146": "LABEL_146",
161
+ "147": "LABEL_147",
162
+ "148": "LABEL_148",
163
+ "149": "LABEL_149",
164
+ "150": "LABEL_150",
165
+ "151": "LABEL_151",
166
+ "152": "LABEL_152",
167
+ "153": "LABEL_153",
168
+ "154": "LABEL_154",
169
+ "155": "LABEL_155",
170
+ "156": "LABEL_156",
171
+ "157": "LABEL_157",
172
+ "158": "LABEL_158",
173
+ "159": "LABEL_159",
174
+ "160": "LABEL_160",
175
+ "161": "LABEL_161",
176
+ "162": "LABEL_162",
177
+ "163": "LABEL_163",
178
+ "164": "LABEL_164",
179
+ "165": "LABEL_165",
180
+ "166": "LABEL_166",
181
+ "167": "LABEL_167",
182
+ "168": "LABEL_168",
183
+ "169": "LABEL_169",
184
+ "170": "LABEL_170",
185
+ "171": "LABEL_171",
186
+ "172": "LABEL_172",
187
+ "173": "LABEL_173",
188
+ "174": "LABEL_174",
189
+ "175": "LABEL_175",
190
+ "176": "LABEL_176",
191
+ "177": "LABEL_177",
192
+ "178": "LABEL_178",
193
+ "179": "LABEL_179",
194
+ "180": "LABEL_180",
195
+ "181": "LABEL_181",
196
+ "182": "LABEL_182",
197
+ "183": "LABEL_183",
198
+ "184": "LABEL_184",
199
+ "185": "LABEL_185",
200
+ "186": "LABEL_186",
201
+ "187": "LABEL_187",
202
+ "188": "LABEL_188",
203
+ "189": "LABEL_189",
204
+ "190": "LABEL_190",
205
+ "191": "LABEL_191",
206
+ "192": "LABEL_192",
207
+ "193": "LABEL_193",
208
+ "194": "LABEL_194",
209
+ "195": "LABEL_195",
210
+ "196": "LABEL_196",
211
+ "197": "LABEL_197",
212
+ "198": "LABEL_198",
213
+ "199": "LABEL_199",
214
+ "200": "LABEL_200",
215
+ "201": "LABEL_201",
216
+ "202": "LABEL_202",
217
+ "203": "LABEL_203",
218
+ "204": "LABEL_204",
219
+ "205": "LABEL_205",
220
+ "206": "LABEL_206",
221
+ "207": "LABEL_207",
222
+ "208": "LABEL_208",
223
+ "209": "LABEL_209",
224
+ "210": "LABEL_210",
225
+ "211": "LABEL_211",
226
+ "212": "LABEL_212",
227
+ "213": "LABEL_213",
228
+ "214": "LABEL_214",
229
+ "215": "LABEL_215",
230
+ "216": "LABEL_216",
231
+ "217": "LABEL_217",
232
+ "218": "LABEL_218",
233
+ "219": "LABEL_219",
234
+ "220": "LABEL_220",
235
+ "221": "LABEL_221",
236
+ "222": "LABEL_222",
237
+ "223": "LABEL_223",
238
+ "224": "LABEL_224",
239
+ "225": "LABEL_225",
240
+ "226": "LABEL_226",
241
+ "227": "LABEL_227",
242
+ "228": "LABEL_228",
243
+ "229": "LABEL_229",
244
+ "230": "LABEL_230",
245
+ "231": "LABEL_231",
246
+ "232": "LABEL_232",
247
+ "233": "LABEL_233",
248
+ "234": "LABEL_234",
249
+ "235": "LABEL_235",
250
+ "236": "LABEL_236",
251
+ "237": "LABEL_237",
252
+ "238": "LABEL_238",
253
+ "239": "LABEL_239",
254
+ "240": "LABEL_240",
255
+ "241": "LABEL_241",
256
+ "242": "LABEL_242",
257
+ "243": "LABEL_243",
258
+ "244": "LABEL_244",
259
+ "245": "LABEL_245",
260
+ "246": "LABEL_246",
261
+ "247": "LABEL_247",
262
+ "248": "LABEL_248",
263
+ "249": "LABEL_249",
264
+ "250": "LABEL_250",
265
+ "251": "LABEL_251",
266
+ "252": "LABEL_252",
267
+ "253": "LABEL_253",
268
+ "254": "LABEL_254",
269
+ "255": "LABEL_255",
270
+ "256": "LABEL_256",
271
+ "257": "LABEL_257",
272
+ "258": "LABEL_258",
273
+ "259": "LABEL_259",
274
+ "260": "LABEL_260",
275
+ "261": "LABEL_261",
276
+ "262": "LABEL_262",
277
+ "263": "LABEL_263",
278
+ "264": "LABEL_264",
279
+ "265": "LABEL_265",
280
+ "266": "LABEL_266",
281
+ "267": "LABEL_267",
282
+ "268": "LABEL_268",
283
+ "269": "LABEL_269",
284
+ "270": "LABEL_270",
285
+ "271": "LABEL_271",
286
+ "272": "LABEL_272",
287
+ "273": "LABEL_273",
288
+ "274": "LABEL_274",
289
+ "275": "LABEL_275",
290
+ "276": "LABEL_276",
291
+ "277": "LABEL_277",
292
+ "278": "LABEL_278",
293
+ "279": "LABEL_279",
294
+ "280": "LABEL_280",
295
+ "281": "LABEL_281",
296
+ "282": "LABEL_282",
297
+ "283": "LABEL_283",
298
+ "284": "LABEL_284",
299
+ "285": "LABEL_285",
300
+ "286": "LABEL_286",
301
+ "287": "LABEL_287",
302
+ "288": "LABEL_288",
303
+ "289": "LABEL_289",
304
+ "290": "LABEL_290",
305
+ "291": "LABEL_291",
306
+ "292": "LABEL_292",
307
+ "293": "LABEL_293",
308
+ "294": "LABEL_294",
309
+ "295": "LABEL_295",
310
+ "296": "LABEL_296",
311
+ "297": "LABEL_297",
312
+ "298": "LABEL_298",
313
+ "299": "LABEL_299",
314
+ "300": "LABEL_300",
315
+ "301": "LABEL_301",
316
+ "302": "LABEL_302",
317
+ "303": "LABEL_303",
318
+ "304": "LABEL_304",
319
+ "305": "LABEL_305",
320
+ "306": "LABEL_306",
321
+ "307": "LABEL_307",
322
+ "308": "LABEL_308",
323
+ "309": "LABEL_309",
324
+ "310": "LABEL_310",
325
+ "311": "LABEL_311",
326
+ "312": "LABEL_312",
327
+ "313": "LABEL_313",
328
+ "314": "LABEL_314",
329
+ "315": "LABEL_315",
330
+ "316": "LABEL_316",
331
+ "317": "LABEL_317",
332
+ "318": "LABEL_318",
333
+ "319": "LABEL_319",
334
+ "320": "LABEL_320",
335
+ "321": "LABEL_321",
336
+ "322": "LABEL_322",
337
+ "323": "LABEL_323",
338
+ "324": "LABEL_324",
339
+ "325": "LABEL_325",
340
+ "326": "LABEL_326",
341
+ "327": "LABEL_327",
342
+ "328": "LABEL_328",
343
+ "329": "LABEL_329",
344
+ "330": "LABEL_330",
345
+ "331": "LABEL_331",
346
+ "332": "LABEL_332",
347
+ "333": "LABEL_333",
348
+ "334": "LABEL_334",
349
+ "335": "LABEL_335",
350
+ "336": "LABEL_336",
351
+ "337": "LABEL_337",
352
+ "338": "LABEL_338",
353
+ "339": "LABEL_339",
354
+ "340": "LABEL_340",
355
+ "341": "LABEL_341",
356
+ "342": "LABEL_342",
357
+ "343": "LABEL_343",
358
+ "344": "LABEL_344",
359
+ "345": "LABEL_345",
360
+ "346": "LABEL_346",
361
+ "347": "LABEL_347",
362
+ "348": "LABEL_348",
363
+ "349": "LABEL_349",
364
+ "350": "LABEL_350",
365
+ "351": "LABEL_351",
366
+ "352": "LABEL_352",
367
+ "353": "LABEL_353",
368
+ "354": "LABEL_354",
369
+ "355": "LABEL_355",
370
+ "356": "LABEL_356",
371
+ "357": "LABEL_357",
372
+ "358": "LABEL_358",
373
+ "359": "LABEL_359",
374
+ "360": "LABEL_360",
375
+ "361": "LABEL_361",
376
+ "362": "LABEL_362",
377
+ "363": "LABEL_363",
378
+ "364": "LABEL_364",
379
+ "365": "LABEL_365",
380
+ "366": "LABEL_366",
381
+ "367": "LABEL_367",
382
+ "368": "LABEL_368",
383
+ "369": "LABEL_369",
384
+ "370": "LABEL_370",
385
+ "371": "LABEL_371",
386
+ "372": "LABEL_372",
387
+ "373": "LABEL_373",
388
+ "374": "LABEL_374",
389
+ "375": "LABEL_375",
390
+ "376": "LABEL_376",
391
+ "377": "LABEL_377",
392
+ "378": "LABEL_378",
393
+ "379": "LABEL_379",
394
+ "380": "LABEL_380",
395
+ "381": "LABEL_381",
396
+ "382": "LABEL_382",
397
+ "383": "LABEL_383",
398
+ "384": "LABEL_384",
399
+ "385": "LABEL_385",
400
+ "386": "LABEL_386",
401
+ "387": "LABEL_387",
402
+ "388": "LABEL_388",
403
+ "389": "LABEL_389",
404
+ "390": "LABEL_390"
405
+ },
406
+ "initializer_range": 0.02,
407
+ "intermediate_size": 1536,
408
+ "label2id": {
409
+ "LABEL_0": 0,
410
+ "LABEL_1": 1,
411
+ "LABEL_10": 10,
412
+ "LABEL_100": 100,
413
+ "LABEL_101": 101,
414
+ "LABEL_102": 102,
415
+ "LABEL_103": 103,
416
+ "LABEL_104": 104,
417
+ "LABEL_105": 105,
418
+ "LABEL_106": 106,
419
+ "LABEL_107": 107,
420
+ "LABEL_108": 108,
421
+ "LABEL_109": 109,
422
+ "LABEL_11": 11,
423
+ "LABEL_110": 110,
424
+ "LABEL_111": 111,
425
+ "LABEL_112": 112,
426
+ "LABEL_113": 113,
427
+ "LABEL_114": 114,
428
+ "LABEL_115": 115,
429
+ "LABEL_116": 116,
430
+ "LABEL_117": 117,
431
+ "LABEL_118": 118,
432
+ "LABEL_119": 119,
433
+ "LABEL_12": 12,
434
+ "LABEL_120": 120,
435
+ "LABEL_121": 121,
436
+ "LABEL_122": 122,
437
+ "LABEL_123": 123,
438
+ "LABEL_124": 124,
439
+ "LABEL_125": 125,
440
+ "LABEL_126": 126,
441
+ "LABEL_127": 127,
442
+ "LABEL_128": 128,
443
+ "LABEL_129": 129,
444
+ "LABEL_13": 13,
445
+ "LABEL_130": 130,
446
+ "LABEL_131": 131,
447
+ "LABEL_132": 132,
448
+ "LABEL_133": 133,
449
+ "LABEL_134": 134,
450
+ "LABEL_135": 135,
451
+ "LABEL_136": 136,
452
+ "LABEL_137": 137,
453
+ "LABEL_138": 138,
454
+ "LABEL_139": 139,
455
+ "LABEL_14": 14,
456
+ "LABEL_140": 140,
457
+ "LABEL_141": 141,
458
+ "LABEL_142": 142,
459
+ "LABEL_143": 143,
460
+ "LABEL_144": 144,
461
+ "LABEL_145": 145,
462
+ "LABEL_146": 146,
463
+ "LABEL_147": 147,
464
+ "LABEL_148": 148,
465
+ "LABEL_149": 149,
466
+ "LABEL_15": 15,
467
+ "LABEL_150": 150,
468
+ "LABEL_151": 151,
469
+ "LABEL_152": 152,
470
+ "LABEL_153": 153,
471
+ "LABEL_154": 154,
472
+ "LABEL_155": 155,
473
+ "LABEL_156": 156,
474
+ "LABEL_157": 157,
475
+ "LABEL_158": 158,
476
+ "LABEL_159": 159,
477
+ "LABEL_16": 16,
478
+ "LABEL_160": 160,
479
+ "LABEL_161": 161,
480
+ "LABEL_162": 162,
481
+ "LABEL_163": 163,
482
+ "LABEL_164": 164,
483
+ "LABEL_165": 165,
484
+ "LABEL_166": 166,
485
+ "LABEL_167": 167,
486
+ "LABEL_168": 168,
487
+ "LABEL_169": 169,
488
+ "LABEL_17": 17,
489
+ "LABEL_170": 170,
490
+ "LABEL_171": 171,
491
+ "LABEL_172": 172,
492
+ "LABEL_173": 173,
493
+ "LABEL_174": 174,
494
+ "LABEL_175": 175,
495
+ "LABEL_176": 176,
496
+ "LABEL_177": 177,
497
+ "LABEL_178": 178,
498
+ "LABEL_179": 179,
499
+ "LABEL_18": 18,
500
+ "LABEL_180": 180,
501
+ "LABEL_181": 181,
502
+ "LABEL_182": 182,
503
+ "LABEL_183": 183,
504
+ "LABEL_184": 184,
505
+ "LABEL_185": 185,
506
+ "LABEL_186": 186,
507
+ "LABEL_187": 187,
508
+ "LABEL_188": 188,
509
+ "LABEL_189": 189,
510
+ "LABEL_19": 19,
511
+ "LABEL_190": 190,
512
+ "LABEL_191": 191,
513
+ "LABEL_192": 192,
514
+ "LABEL_193": 193,
515
+ "LABEL_194": 194,
516
+ "LABEL_195": 195,
517
+ "LABEL_196": 196,
518
+ "LABEL_197": 197,
519
+ "LABEL_198": 198,
520
+ "LABEL_199": 199,
521
+ "LABEL_2": 2,
522
+ "LABEL_20": 20,
523
+ "LABEL_200": 200,
524
+ "LABEL_201": 201,
525
+ "LABEL_202": 202,
526
+ "LABEL_203": 203,
527
+ "LABEL_204": 204,
528
+ "LABEL_205": 205,
529
+ "LABEL_206": 206,
530
+ "LABEL_207": 207,
531
+ "LABEL_208": 208,
532
+ "LABEL_209": 209,
533
+ "LABEL_21": 21,
534
+ "LABEL_210": 210,
535
+ "LABEL_211": 211,
536
+ "LABEL_212": 212,
537
+ "LABEL_213": 213,
538
+ "LABEL_214": 214,
539
+ "LABEL_215": 215,
540
+ "LABEL_216": 216,
541
+ "LABEL_217": 217,
542
+ "LABEL_218": 218,
543
+ "LABEL_219": 219,
544
+ "LABEL_22": 22,
545
+ "LABEL_220": 220,
546
+ "LABEL_221": 221,
547
+ "LABEL_222": 222,
548
+ "LABEL_223": 223,
549
+ "LABEL_224": 224,
550
+ "LABEL_225": 225,
551
+ "LABEL_226": 226,
552
+ "LABEL_227": 227,
553
+ "LABEL_228": 228,
554
+ "LABEL_229": 229,
555
+ "LABEL_23": 23,
556
+ "LABEL_230": 230,
557
+ "LABEL_231": 231,
558
+ "LABEL_232": 232,
559
+ "LABEL_233": 233,
560
+ "LABEL_234": 234,
561
+ "LABEL_235": 235,
562
+ "LABEL_236": 236,
563
+ "LABEL_237": 237,
564
+ "LABEL_238": 238,
565
+ "LABEL_239": 239,
566
+ "LABEL_24": 24,
567
+ "LABEL_240": 240,
568
+ "LABEL_241": 241,
569
+ "LABEL_242": 242,
570
+ "LABEL_243": 243,
571
+ "LABEL_244": 244,
572
+ "LABEL_245": 245,
573
+ "LABEL_246": 246,
574
+ "LABEL_247": 247,
575
+ "LABEL_248": 248,
576
+ "LABEL_249": 249,
577
+ "LABEL_25": 25,
578
+ "LABEL_250": 250,
579
+ "LABEL_251": 251,
580
+ "LABEL_252": 252,
581
+ "LABEL_253": 253,
582
+ "LABEL_254": 254,
583
+ "LABEL_255": 255,
584
+ "LABEL_256": 256,
585
+ "LABEL_257": 257,
586
+ "LABEL_258": 258,
587
+ "LABEL_259": 259,
588
+ "LABEL_26": 26,
589
+ "LABEL_260": 260,
590
+ "LABEL_261": 261,
591
+ "LABEL_262": 262,
592
+ "LABEL_263": 263,
593
+ "LABEL_264": 264,
594
+ "LABEL_265": 265,
595
+ "LABEL_266": 266,
596
+ "LABEL_267": 267,
597
+ "LABEL_268": 268,
598
+ "LABEL_269": 269,
599
+ "LABEL_27": 27,
600
+ "LABEL_270": 270,
601
+ "LABEL_271": 271,
602
+ "LABEL_272": 272,
603
+ "LABEL_273": 273,
604
+ "LABEL_274": 274,
605
+ "LABEL_275": 275,
606
+ "LABEL_276": 276,
607
+ "LABEL_277": 277,
608
+ "LABEL_278": 278,
609
+ "LABEL_279": 279,
610
+ "LABEL_28": 28,
611
+ "LABEL_280": 280,
612
+ "LABEL_281": 281,
613
+ "LABEL_282": 282,
614
+ "LABEL_283": 283,
615
+ "LABEL_284": 284,
616
+ "LABEL_285": 285,
617
+ "LABEL_286": 286,
618
+ "LABEL_287": 287,
619
+ "LABEL_288": 288,
620
+ "LABEL_289": 289,
621
+ "LABEL_29": 29,
622
+ "LABEL_290": 290,
623
+ "LABEL_291": 291,
624
+ "LABEL_292": 292,
625
+ "LABEL_293": 293,
626
+ "LABEL_294": 294,
627
+ "LABEL_295": 295,
628
+ "LABEL_296": 296,
629
+ "LABEL_297": 297,
630
+ "LABEL_298": 298,
631
+ "LABEL_299": 299,
632
+ "LABEL_3": 3,
633
+ "LABEL_30": 30,
634
+ "LABEL_300": 300,
635
+ "LABEL_301": 301,
636
+ "LABEL_302": 302,
637
+ "LABEL_303": 303,
638
+ "LABEL_304": 304,
639
+ "LABEL_305": 305,
640
+ "LABEL_306": 306,
641
+ "LABEL_307": 307,
642
+ "LABEL_308": 308,
643
+ "LABEL_309": 309,
644
+ "LABEL_31": 31,
645
+ "LABEL_310": 310,
646
+ "LABEL_311": 311,
647
+ "LABEL_312": 312,
648
+ "LABEL_313": 313,
649
+ "LABEL_314": 314,
650
+ "LABEL_315": 315,
651
+ "LABEL_316": 316,
652
+ "LABEL_317": 317,
653
+ "LABEL_318": 318,
654
+ "LABEL_319": 319,
655
+ "LABEL_32": 32,
656
+ "LABEL_320": 320,
657
+ "LABEL_321": 321,
658
+ "LABEL_322": 322,
659
+ "LABEL_323": 323,
660
+ "LABEL_324": 324,
661
+ "LABEL_325": 325,
662
+ "LABEL_326": 326,
663
+ "LABEL_327": 327,
664
+ "LABEL_328": 328,
665
+ "LABEL_329": 329,
666
+ "LABEL_33": 33,
667
+ "LABEL_330": 330,
668
+ "LABEL_331": 331,
669
+ "LABEL_332": 332,
670
+ "LABEL_333": 333,
671
+ "LABEL_334": 334,
672
+ "LABEL_335": 335,
673
+ "LABEL_336": 336,
674
+ "LABEL_337": 337,
675
+ "LABEL_338": 338,
676
+ "LABEL_339": 339,
677
+ "LABEL_34": 34,
678
+ "LABEL_340": 340,
679
+ "LABEL_341": 341,
680
+ "LABEL_342": 342,
681
+ "LABEL_343": 343,
682
+ "LABEL_344": 344,
683
+ "LABEL_345": 345,
684
+ "LABEL_346": 346,
685
+ "LABEL_347": 347,
686
+ "LABEL_348": 348,
687
+ "LABEL_349": 349,
688
+ "LABEL_35": 35,
689
+ "LABEL_350": 350,
690
+ "LABEL_351": 351,
691
+ "LABEL_352": 352,
692
+ "LABEL_353": 353,
693
+ "LABEL_354": 354,
694
+ "LABEL_355": 355,
695
+ "LABEL_356": 356,
696
+ "LABEL_357": 357,
697
+ "LABEL_358": 358,
698
+ "LABEL_359": 359,
699
+ "LABEL_36": 36,
700
+ "LABEL_360": 360,
701
+ "LABEL_361": 361,
702
+ "LABEL_362": 362,
703
+ "LABEL_363": 363,
704
+ "LABEL_364": 364,
705
+ "LABEL_365": 365,
706
+ "LABEL_366": 366,
707
+ "LABEL_367": 367,
708
+ "LABEL_368": 368,
709
+ "LABEL_369": 369,
710
+ "LABEL_37": 37,
711
+ "LABEL_370": 370,
712
+ "LABEL_371": 371,
713
+ "LABEL_372": 372,
714
+ "LABEL_373": 373,
715
+ "LABEL_374": 374,
716
+ "LABEL_375": 375,
717
+ "LABEL_376": 376,
718
+ "LABEL_377": 377,
719
+ "LABEL_378": 378,
720
+ "LABEL_379": 379,
721
+ "LABEL_38": 38,
722
+ "LABEL_380": 380,
723
+ "LABEL_381": 381,
724
+ "LABEL_382": 382,
725
+ "LABEL_383": 383,
726
+ "LABEL_384": 384,
727
+ "LABEL_385": 385,
728
+ "LABEL_386": 386,
729
+ "LABEL_387": 387,
730
+ "LABEL_388": 388,
731
+ "LABEL_389": 389,
732
+ "LABEL_39": 39,
733
+ "LABEL_390": 390,
734
+ "LABEL_4": 4,
735
+ "LABEL_40": 40,
736
+ "LABEL_41": 41,
737
+ "LABEL_42": 42,
738
+ "LABEL_43": 43,
739
+ "LABEL_44": 44,
740
+ "LABEL_45": 45,
741
+ "LABEL_46": 46,
742
+ "LABEL_47": 47,
743
+ "LABEL_48": 48,
744
+ "LABEL_49": 49,
745
+ "LABEL_5": 5,
746
+ "LABEL_50": 50,
747
+ "LABEL_51": 51,
748
+ "LABEL_52": 52,
749
+ "LABEL_53": 53,
750
+ "LABEL_54": 54,
751
+ "LABEL_55": 55,
752
+ "LABEL_56": 56,
753
+ "LABEL_57": 57,
754
+ "LABEL_58": 58,
755
+ "LABEL_59": 59,
756
+ "LABEL_6": 6,
757
+ "LABEL_60": 60,
758
+ "LABEL_61": 61,
759
+ "LABEL_62": 62,
760
+ "LABEL_63": 63,
761
+ "LABEL_64": 64,
762
+ "LABEL_65": 65,
763
+ "LABEL_66": 66,
764
+ "LABEL_67": 67,
765
+ "LABEL_68": 68,
766
+ "LABEL_69": 69,
767
+ "LABEL_7": 7,
768
+ "LABEL_70": 70,
769
+ "LABEL_71": 71,
770
+ "LABEL_72": 72,
771
+ "LABEL_73": 73,
772
+ "LABEL_74": 74,
773
+ "LABEL_75": 75,
774
+ "LABEL_76": 76,
775
+ "LABEL_77": 77,
776
+ "LABEL_78": 78,
777
+ "LABEL_79": 79,
778
+ "LABEL_8": 8,
779
+ "LABEL_80": 80,
780
+ "LABEL_81": 81,
781
+ "LABEL_82": 82,
782
+ "LABEL_83": 83,
783
+ "LABEL_84": 84,
784
+ "LABEL_85": 85,
785
+ "LABEL_86": 86,
786
+ "LABEL_87": 87,
787
+ "LABEL_88": 88,
788
+ "LABEL_89": 89,
789
+ "LABEL_9": 9,
790
+ "LABEL_90": 90,
791
+ "LABEL_91": 91,
792
+ "LABEL_92": 92,
793
+ "LABEL_93": 93,
794
+ "LABEL_94": 94,
795
+ "LABEL_95": 95,
796
+ "LABEL_96": 96,
797
+ "LABEL_97": 97,
798
+ "LABEL_98": 98,
799
+ "LABEL_99": 99
800
+ },
801
+ "layer_norm_eps": 1e-12,
802
+ "max_position_embeddings": 512,
803
+ "model_type": "bert",
804
+ "num_attention_heads": 12,
805
+ "num_hidden_layers": 6,
806
+ "num_relation_heads": 32,
807
+ "pad_token_id": 0,
808
+ "pooler_fc_size": 768,
809
+ "pooler_num_attention_heads": 12,
810
+ "pooler_num_fc_layers": 3,
811
+ "pooler_size_per_head": 128,
812
+ "pooler_type": "first_token_transform",
813
+ "position_embedding_type": "absolute",
814
+ "torch_dtype": "float32",
815
+ "transformers_version": "4.44.1",
816
+ "type_vocab_size": 2,
817
+ "use_cache": true,
818
+ "vocab_size": 21128
819
+ }
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/poly_bert_model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8765d835ffdf9811c832d4dc7b6a552757aa8615c01d1184db716a50c20aebbc
3
+ size 76583333
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polychar.txt ADDED
@@ -0,0 +1,159 @@
1
+ 丧
2
+ 中
3
+ 为
4
+ 乌
5
+ 乐
6
+ 了
7
+ 什
8
+ 仔
9
+ 令
10
+ 任
11
+ 会
12
+ 传
13
+ 佛
14
+ 供
15
+ 便
16
+ 倒
17
+ 假
18
+ 兴
19
+ 冠
20
+ 冲
21
+ 几
22
+ 分
23
+ 切
24
+ 划
25
+ 创
26
+ 剥
27
+ 勒
28
+ 区
29
+ 华
30
+ 单
31
+ 卜
32
+ 占
33
+ 卡
34
+ 卷
35
+ 厦
36
+ 参
37
+ 发
38
+ 只
39
+ 号
40
+ 同
41
+ 吐
42
+ 和
43
+ 喝
44
+ 圈
45
+ 地
46
+ 塞
47
+ 壳
48
+ 处
49
+ 奇
50
+ 奔
51
+ 好
52
+ 宁
53
+ 宿
54
+ 将
55
+ 少
56
+ 尽
57
+ 岗
58
+ 差
59
+ 巷
60
+ 帖
61
+ 干
62
+ 应
63
+ 度
64
+ 弹
65
+ 强
66
+ 当
67
+ 待
68
+ 得
69
+ 恶
70
+ 扁
71
+ 扇
72
+ 扎
73
+ 扫
74
+ 担
75
+ 挑
76
+ 据
77
+ 撒
78
+ 教
79
+ 散
80
+ 数
81
+ 斗
82
+ 晃
83
+ 曝
84
+ 曲
85
+ 更
86
+ 曾
87
+ 朝
88
+ 朴
89
+ 杆
90
+ 查
91
+ 校
92
+ 模
93
+ 横
94
+ 没
95
+ 泡
96
+ 济
97
+ 混
98
+ 漂
99
+ 炸
100
+ 熟
101
+ 燕
102
+ 片
103
+ 率
104
+ 畜
105
+ 的
106
+ 盛
107
+ 相
108
+ 省
109
+ 看
110
+ 着
111
+ 矫
112
+ 禁
113
+ 种
114
+ 称
115
+ 空
116
+ 答
117
+ 粘
118
+ 糊
119
+ 系
120
+ 累
121
+ 纤
122
+ 结
123
+ 给
124
+ 缝
125
+ 肖
126
+ 背
127
+ 脏
128
+ 舍
129
+ 色
130
+ 落
131
+ 蒙
132
+ 薄
133
+ 藏
134
+ 血
135
+ 行
136
+ 要
137
+ 观
138
+ 觉
139
+ 角
140
+ 解
141
+ 说
142
+ 调
143
+ 踏
144
+ 车
145
+ 转
146
+ 载
147
+ 还
148
+ 遂
149
+ 都
150
+ 重
151
+ 量
152
+ 钻
153
+ 铺
154
+ 长
155
+ 间
156
+ 降
157
+ 难
158
+ 露
159
+ 鲜
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polydict.json ADDED
@@ -0,0 +1,393 @@
1
+ {
2
+ "1": "丧{sang1}",
3
+ "2": "丧{sang4}",
4
+ "3": "中{zhong1}",
5
+ "4": "中{zhong4}",
6
+ "5": "为{wei2}",
7
+ "6": "为{wei4}",
8
+ "7": "乌{wu1}",
9
+ "8": "乌{wu4}",
10
+ "9": "乐{lao4}",
11
+ "10": "乐{le4}",
12
+ "11": "乐{le5}",
13
+ "12": "乐{yao4}",
14
+ "13": "乐{yve4}",
15
+ "14": "了{le5}",
16
+ "15": "了{liao3}",
17
+ "16": "了{liao5}",
18
+ "17": "什{shen2}",
19
+ "18": "什{shi2}",
20
+ "19": "仔{zai3}",
21
+ "20": "仔{zai5}",
22
+ "21": "仔{zi3}",
23
+ "22": "仔{zi5}",
24
+ "23": "令{ling2}",
25
+ "24": "令{ling4}",
26
+ "25": "任{ren2}",
27
+ "26": "任{ren4}",
28
+ "27": "会{hui4}",
29
+ "28": "会{hui5}",
30
+ "29": "会{kuai4}",
31
+ "30": "传{chuan2}",
32
+ "31": "传{zhuan4}",
33
+ "32": "佛{fo2}",
34
+ "33": "佛{fu2}",
35
+ "34": "供{gong1}",
36
+ "35": "供{gong4}",
37
+ "36": "便{bian4}",
38
+ "37": "便{pian2}",
39
+ "38": "倒{dao3}",
40
+ "39": "倒{dao4}",
41
+ "40": "假{jia3}",
42
+ "41": "假{jia4}",
43
+ "42": "兴{xing1}",
44
+ "43": "兴{xing4}",
45
+ "44": "冠{guan1}",
46
+ "45": "冠{guan4}",
47
+ "46": "冲{chong1}",
48
+ "47": "冲{chong4}",
49
+ "48": "几{ji1}",
50
+ "49": "几{ji2}",
51
+ "50": "几{ji3}",
52
+ "51": "分{fen1}",
53
+ "52": "分{fen4}",
54
+ "53": "分{fen5}",
55
+ "54": "切{qie1}",
56
+ "55": "切{qie4}",
57
+ "56": "划{hua2}",
58
+ "57": "划{hua4}",
59
+ "58": "划{hua5}",
60
+ "59": "创{chuang1}",
61
+ "60": "创{chuang4}",
62
+ "61": "剥{bao1}",
63
+ "62": "剥{bo1}",
64
+ "63": "勒{le4}",
65
+ "64": "勒{le5}",
66
+ "65": "勒{lei1}",
67
+ "66": "区{ou1}",
68
+ "67": "区{qu1}",
69
+ "68": "华{hua2}",
70
+ "69": "华{hua4}",
71
+ "70": "单{chan2}",
72
+ "71": "单{dan1}",
73
+ "72": "单{shan4}",
74
+ "73": "卜{bo5}",
75
+ "74": "卜{bu3}",
76
+ "75": "占{zhan1}",
77
+ "76": "占{zhan4}",
78
+ "77": "卡{ka2}",
79
+ "78": "卡{ka3}",
80
+ "79": "卡{qia3}",
81
+ "80": "卷{jvan3}",
82
+ "81": "卷{jvan4}",
83
+ "82": "厦{sha4}",
84
+ "83": "厦{xia4}",
85
+ "84": "参{can1}",
86
+ "85": "参{cen1}",
87
+ "86": "参{shen1}",
88
+ "87": "发{fa1}",
89
+ "88": "发{fa4}",
90
+ "89": "发{fa5}",
91
+ "90": "只{zhi1}",
92
+ "91": "只{zhi3}",
93
+ "92": "号{hao2}",
94
+ "93": "号{hao4}",
95
+ "94": "号{hao5}",
96
+ "95": "同{tong2}",
97
+ "96": "同{tong4}",
98
+ "97": "同{tong5}",
99
+ "98": "吐{tu2}",
100
+ "99": "吐{tu3}",
101
+ "100": "吐{tu4}",
102
+ "101": "和{he2}",
103
+ "102": "和{he4}",
104
+ "103": "和{he5}",
105
+ "104": "和{huo2}",
106
+ "105": "和{huo4}",
107
+ "106": "和{huo5}",
108
+ "107": "喝{he1}",
109
+ "108": "喝{he4}",
110
+ "109": "圈{jvan4}",
111
+ "110": "圈{qvan1}",
112
+ "111": "圈{qvan5}",
113
+ "112": "地{de5}",
114
+ "113": "地{di4}",
115
+ "114": "地{di5}",
116
+ "115": "塞{sai1}",
117
+ "116": "塞{sai2}",
118
+ "117": "塞{sai4}",
119
+ "118": "塞{se4}",
120
+ "119": "壳{ke2}",
121
+ "120": "壳{qiao4}",
122
+ "121": "处{chu3}",
123
+ "122": "处{chu4}",
124
+ "123": "奇{ji1}",
125
+ "124": "奇{qi2}",
126
+ "125": "奔{ben1}",
127
+ "126": "奔{ben4}",
128
+ "127": "好{hao3}",
129
+ "128": "好{hao4}",
130
+ "129": "好{hao5}",
131
+ "130": "宁{ning2}",
132
+ "131": "宁{ning4}",
133
+ "132": "宁{ning5}",
134
+ "133": "宿{su4}",
135
+ "134": "宿{xiu3}",
136
+ "135": "宿{xiu4}",
137
+ "136": "将{jiang1}",
138
+ "137": "将{jiang4}",
139
+ "138": "少{shao3}",
140
+ "139": "少{shao4}",
141
+ "140": "尽{jin3}",
142
+ "141": "尽{jin4}",
143
+ "142": "岗{gang1}",
144
+ "143": "岗{gang3}",
145
+ "144": "差{cha1}",
146
+ "145": "差{cha4}",
147
+ "146": "差{chai1}",
148
+ "147": "差{ci1}",
149
+ "148": "巷{hang4}",
150
+ "149": "巷{xiang4}",
151
+ "150": "帖{tie1}",
152
+ "151": "帖{tie3}",
153
+ "152": "帖{tie4}",
154
+ "153": "干{gan1}",
155
+ "154": "干{gan4}",
156
+ "155": "应{ying1}",
157
+ "156": "应{ying4}",
158
+ "157": "应{ying5}",
159
+ "158": "度{du4}",
160
+ "159": "度{du5}",
161
+ "160": "度{duo2}",
162
+ "161": "弹{dan4}",
163
+ "162": "弹{tan2}",
164
+ "163": "弹{tan5}",
165
+ "164": "强{jiang4}",
166
+ "165": "强{qiang2}",
167
+ "166": "强{qiang3}",
168
+ "167": "当{dang1}",
169
+ "168": "当{dang4}",
170
+ "169": "当{dang5}",
171
+ "170": "待{dai1}",
172
+ "171": "待{dai4}",
173
+ "172": "得{de2}",
174
+ "173": "得{de5}",
175
+ "174": "得{dei3}",
176
+ "175": "得{dei5}",
177
+ "176": "恶{e3}",
178
+ "177": "恶{e4}",
179
+ "178": "恶{wu4}",
180
+ "179": "扁{bian3}",
181
+ "180": "扁{pian1}",
182
+ "181": "扇{shan1}",
183
+ "182": "扇{shan4}",
184
+ "183": "扎{za1}",
185
+ "184": "扎{zha1}",
186
+ "185": "扎{zha2}",
187
+ "186": "扫{sao3}",
188
+ "187": "扫{sao4}",
189
+ "188": "担{dan1}",
190
+ "189": "担{dan4}",
191
+ "190": "担{dan5}",
192
+ "191": "挑{tiao1}",
193
+ "192": "挑{tiao3}",
194
+ "193": "据{jv1}",
195
+ "194": "据{jv4}",
196
+ "195": "撒{sa1}",
197
+ "196": "撒{sa3}",
198
+ "197": "撒{sa5}",
199
+ "198": "教{jiao1}",
200
+ "199": "教{jiao4}",
201
+ "200": "散{san3}",
202
+ "201": "散{san4}",
203
+ "202": "散{san5}",
204
+ "203": "数{shu3}",
205
+ "204": "数{shu4}",
206
+ "205": "数{shu5}",
207
+ "206": "斗{dou3}",
208
+ "207": "斗{dou4}",
209
+ "208": "晃{huang3}",
210
+ "209": "曝{bao4}",
211
+ "210": "曲{qu1}",
212
+ "211": "曲{qu3}",
213
+ "212": "更{geng1}",
214
+ "213": "更{geng4}",
215
+ "214": "曾{ceng1}",
216
+ "215": "曾{ceng2}",
217
+ "216": "曾{zeng1}",
218
+ "217": "朝{chao2}",
219
+ "218": "朝{zhao1}",
220
+ "219": "朴{piao2}",
221
+ "220": "朴{pu2}",
222
+ "221": "朴{pu3}",
223
+ "222": "杆{gan1}",
224
+ "223": "杆{gan3}",
225
+ "224": "查{cha2}",
226
+ "225": "查{zha1}",
227
+ "226": "校{jiao4}",
228
+ "227": "校{xiao4}",
229
+ "228": "模{mo2}",
230
+ "229": "模{mu2}",
231
+ "230": "横{heng2}",
232
+ "231": "横{heng4}",
233
+ "232": "没{mei2}",
234
+ "233": "没{mo4}",
235
+ "234": "泡{pao1}",
236
+ "235": "泡{pao4}",
237
+ "236": "泡{pao5}",
238
+ "237": "济{ji3}",
239
+ "238": "济{ji4}",
240
+ "239": "混{hun2}",
241
+ "240": "混{hun3}",
242
+ "241": "混{hun4}",
243
+ "242": "混{hun5}",
244
+ "243": "漂{piao1}",
245
+ "244": "漂{piao3}",
246
+ "245": "漂{piao4}",
247
+ "246": "炸{zha2}",
248
+ "247": "炸{zha4}",
249
+ "248": "熟{shou2}",
250
+ "249": "熟{shu2}",
251
+ "250": "燕{yan1}",
252
+ "251": "燕{yan4}",
253
+ "252": "片{pian1}",
254
+ "253": "片{pian4}",
255
+ "254": "率{lv4}",
256
+ "255": "率{shuai4}",
257
+ "256": "畜{chu4}",
258
+ "257": "畜{xu4}",
259
+ "258": "的{de5}",
260
+ "259": "的{di1}",
261
+ "260": "的{di2}",
262
+ "261": "的{di4}",
263
+ "262": "的{di5}",
264
+ "263": "盛{cheng2}",
265
+ "264": "盛{sheng4}",
266
+ "265": "相{xiang1}",
267
+ "266": "相{xiang4}",
268
+ "267": "相{xiang5}",
269
+ "268": "省{sheng3}",
270
+ "269": "省{xing3}",
271
+ "270": "看{kan1}",
272
+ "271": "看{kan4}",
273
+ "272": "看{kan5}",
274
+ "273": "着{zhao1}",
275
+ "274": "着{zhao2}",
276
+ "275": "着{zhao5}",
277
+ "276": "着{zhe5}",
278
+ "277": "着{zhuo2}",
279
+ "278": "着{zhuo5}",
280
+ "279": "矫{jiao3}",
281
+ "280": "禁{jin1}",
282
+ "281": "禁{jin4}",
283
+ "282": "种{zhong3}",
284
+ "283": "种{zhong4}",
285
+ "284": "称{chen4}",
286
+ "285": "称{cheng1}",
287
+ "286": "空{kong1}",
288
+ "287": "空{kong4}",
289
+ "288": "答{da1}",
290
+ "289": "答{da2}",
291
+ "290": "粘{nian2}",
292
+ "291": "粘{zhan1}",
293
+ "292": "糊{hu2}",
294
+ "293": "糊{hu5}",
295
+ "294": "系{ji4}",
296
+ "295": "系{xi4}",
297
+ "296": "系{xi5}",
298
+ "297": "累{lei2}",
299
+ "298": "累{lei3}",
300
+ "299": "累{lei4}",
301
+ "300": "累{lei5}",
302
+ "301": "纤{qian4}",
303
+ "302": "纤{xian1}",
304
+ "303": "结{jie1}",
305
+ "304": "结{jie2}",
306
+ "305": "结{jie5}",
307
+ "306": "给{gei3}",
308
+ "307": "给{gei5}",
309
+ "308": "给{ji3}",
310
+ "309": "缝{feng2}",
311
+ "310": "缝{feng4}",
312
+ "311": "缝{feng5}",
313
+ "312": "肖{xiao1}",
314
+ "313": "肖{xiao4}",
315
+ "314": "背{bei1}",
316
+ "315": "背{bei4}",
317
+ "316": "脏{zang1}",
318
+ "317": "脏{zang4}",
319
+ "318": "舍{she3}",
320
+ "319": "舍{she4}",
321
+ "320": "色{se4}",
322
+ "321": "色{shai3}",
323
+ "322": "落{lao4}",
324
+ "323": "落{luo4}",
325
+ "324": "蒙{meng1}",
326
+ "325": "蒙{meng2}",
327
+ "326": "蒙{meng3}",
328
+ "327": "薄{bao2}",
329
+ "328": "薄{bo2}",
330
+ "329": "薄{bo4}",
331
+ "330": "藏{cang2}",
332
+ "331": "藏{zang4}",
333
+ "332": "血{xie3}",
334
+ "333": "血{xue4}",
335
+ "334": "行{hang2}",
336
+ "335": "行{hang5}",
337
+ "336": "行{heng5}",
338
+ "337": "行{xing2}",
339
+ "338": "行{xing4}",
340
+ "339": "要{yao1}",
341
+ "340": "要{yao4}",
342
+ "341": "观{guan1}",
343
+ "342": "观{guan4}",
344
+ "343": "觉{jiao4}",
345
+ "344": "觉{jiao5}",
346
+ "345": "觉{jve2}",
347
+ "346": "角{jiao3}",
348
+ "347": "角{jve2}",
349
+ "348": "解{jie3}",
350
+ "349": "解{jie4}",
351
+ "350": "解{xie4}",
352
+ "351": "说{shui4}",
353
+ "352": "说{shuo1}",
354
+ "353": "调{diao4}",
355
+ "354": "调{tiao2}",
356
+ "355": "踏{ta1}",
357
+ "356": "踏{ta4}",
358
+ "357": "车{che1}",
359
+ "358": "车{jv1}",
360
+ "359": "转{zhuan3}",
361
+ "360": "转{zhuan4}",
362
+ "361": "载{zai3}",
363
+ "362": "载{zai4}",
364
+ "363": "还{hai2}",
365
+ "364": "还{huan2}",
366
+ "365": "遂{sui2}",
367
+ "366": "遂{sui4}",
368
+ "367": "都{dou1}",
369
+ "368": "都{du1}",
370
+ "369": "重{chong2}",
371
+ "370": "重{zhong4}",
372
+ "371": "量{liang2}",
373
+ "372": "量{liang4}",
374
+ "373": "量{liang5}",
375
+ "374": "钻{zuan1}",
376
+ "375": "钻{zuan4}",
377
+ "376": "铺{pu1}",
378
+ "377": "铺{pu4}",
379
+ "378": "长{chang2}",
380
+ "379": "长{chang3}",
381
+ "380": "长{zhang3}",
382
+ "381": "间{jian1}",
383
+ "382": "间{jian4}",
384
+ "383": "降{jiang4}",
385
+ "384": "降{xiang2}",
386
+ "385": "难{nan2}",
387
+ "386": "难{nan4}",
388
+ "387": "难{nan5}",
389
+ "388": "露{lou4}",
390
+ "389": "露{lu4}",
391
+ "390": "鲜{xian1}",
392
+ "391": "鲜{xian3}"
393
+ }
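polydict.json above maps the classifier label indices (as strings) to "character{pinyin}" entries; polydict_r.json below is the reverse lookup. A minimal sketch, assuming the file is read directly from this sources directory, that regroups the entries into a {character: [pinyin, ...]} index:

import json
import re
from collections import defaultdict

with open("polydict.json", "r", encoding="utf-8") as f:
    id_to_entry = json.load(f)  # e.g. "37": "便{pian2}"

char_to_pinyins = defaultdict(list)
for entry in id_to_entry.values():
    match = re.match(r"(.+)\{(.+)\}", entry)
    if match:
        char_to_pinyins[match.group(1)].append(match.group(2))

# char_to_pinyins["便"] -> ["bian4", "pian2"]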
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polydict_r.json ADDED
@@ -0,0 +1,393 @@
1
+ {
2
+ "丧{sang1}": 1,
3
+ "丧{sang4}": 2,
4
+ "中{zhong1}": 3,
5
+ "中{zhong4}": 4,
6
+ "为{wei2}": 5,
7
+ "为{wei4}": 6,
8
+ "乌{wu1}": 7,
9
+ "乌{wu4}": 8,
10
+ "乐{lao4}": 9,
11
+ "乐{le4}": 10,
12
+ "乐{le5}": 11,
13
+ "乐{yao4}": 12,
14
+ "乐{yve4}": 13,
15
+ "了{le5}": 14,
16
+ "了{liao3}": 15,
17
+ "了{liao5}": 16,
18
+ "什{shen2}": 17,
19
+ "什{shi2}": 18,
20
+ "仔{zai3}": 19,
21
+ "仔{zai5}": 20,
22
+ "仔{zi3}": 21,
23
+ "仔{zi5}": 22,
24
+ "令{ling2}": 23,
25
+ "令{ling4}": 24,
26
+ "任{ren2}": 25,
27
+ "任{ren4}": 26,
28
+ "会{hui4}": 27,
29
+ "会{hui5}": 28,
30
+ "会{kuai4}": 29,
31
+ "传{chuan2}": 30,
32
+ "传{zhuan4}": 31,
33
+ "佛{fo2}": 32,
34
+ "佛{fu2}": 33,
35
+ "供{gong1}": 34,
36
+ "供{gong4}": 35,
37
+ "便{bian4}": 36,
38
+ "便{pian2}": 37,
39
+ "倒{dao3}": 38,
40
+ "倒{dao4}": 39,
41
+ "假{jia3}": 40,
42
+ "假{jia4}": 41,
43
+ "兴{xing1}": 42,
44
+ "兴{xing4}": 43,
45
+ "冠{guan1}": 44,
46
+ "冠{guan4}": 45,
47
+ "冲{chong1}": 46,
48
+ "冲{chong4}": 47,
49
+ "几{ji1}": 48,
50
+ "几{ji2}": 49,
51
+ "几{ji3}": 50,
52
+ "分{fen1}": 51,
53
+ "分{fen4}": 52,
54
+ "分{fen5}": 53,
55
+ "切{qie1}": 54,
56
+ "切{qie4}": 55,
57
+ "划{hua2}": 56,
58
+ "划{hua4}": 57,
59
+ "划{hua5}": 58,
60
+ "创{chuang1}": 59,
61
+ "创{chuang4}": 60,
62
+ "剥{bao1}": 61,
63
+ "剥{bo1}": 62,
64
+ "勒{le4}": 63,
65
+ "勒{le5}": 64,
66
+ "勒{lei1}": 65,
67
+ "区{ou1}": 66,
68
+ "区{qu1}": 67,
69
+ "华{hua2}": 68,
70
+ "华{hua4}": 69,
71
+ "单{chan2}": 70,
72
+ "单{dan1}": 71,
73
+ "单{shan4}": 72,
74
+ "卜{bo5}": 73,
75
+ "卜{bu3}": 74,
76
+ "占{zhan1}": 75,
77
+ "占{zhan4}": 76,
78
+ "卡{ka2}": 77,
79
+ "卡{ka3}": 78,
80
+ "卡{qia3}": 79,
81
+ "卷{jvan3}": 80,
82
+ "卷{jvan4}": 81,
83
+ "厦{sha4}": 82,
84
+ "厦{xia4}": 83,
85
+ "参{can1}": 84,
86
+ "参{cen1}": 85,
87
+ "参{shen1}": 86,
88
+ "发{fa1}": 87,
89
+ "发{fa4}": 88,
90
+ "发{fa5}": 89,
91
+ "只{zhi1}": 90,
92
+ "只{zhi3}": 91,
93
+ "号{hao2}": 92,
94
+ "号{hao4}": 93,
95
+ "号{hao5}": 94,
96
+ "同{tong2}": 95,
97
+ "同{tong4}": 96,
98
+ "同{tong5}": 97,
99
+ "吐{tu2}": 98,
100
+ "吐{tu3}": 99,
101
+ "吐{tu4}": 100,
102
+ "和{he2}": 101,
103
+ "和{he4}": 102,
104
+ "和{he5}": 103,
105
+ "和{huo2}": 104,
106
+ "和{huo4}": 105,
107
+ "和{huo5}": 106,
108
+ "喝{he1}": 107,
109
+ "喝{he4}": 108,
110
+ "圈{jvan4}": 109,
111
+ "圈{qvan1}": 110,
112
+ "圈{qvan5}": 111,
113
+ "地{de5}": 112,
114
+ "地{di4}": 113,
115
+ "地{di5}": 114,
116
+ "塞{sai1}": 115,
117
+ "塞{sai2}": 116,
118
+ "塞{sai4}": 117,
119
+ "塞{se4}": 118,
120
+ "壳{ke2}": 119,
121
+ "壳{qiao4}": 120,
122
+ "处{chu3}": 121,
123
+ "处{chu4}": 122,
124
+ "奇{ji1}": 123,
125
+ "奇{qi2}": 124,
126
+ "奔{ben1}": 125,
127
+ "奔{ben4}": 126,
128
+ "好{hao3}": 127,
129
+ "好{hao4}": 128,
130
+ "好{hao5}": 129,
131
+ "宁{ning2}": 130,
132
+ "宁{ning4}": 131,
133
+ "宁{ning5}": 132,
134
+ "宿{su4}": 133,
135
+ "宿{xiu3}": 134,
136
+ "宿{xiu4}": 135,
137
+ "将{jiang1}": 136,
138
+ "将{jiang4}": 137,
139
+ "少{shao3}": 138,
140
+ "少{shao4}": 139,
141
+ "尽{jin3}": 140,
142
+ "尽{jin4}": 141,
143
+ "岗{gang1}": 142,
144
+ "岗{gang3}": 143,
145
+ "差{cha1}": 144,
146
+ "差{cha4}": 145,
147
+ "差{chai1}": 146,
148
+ "差{ci1}": 147,
149
+ "巷{hang4}": 148,
150
+ "巷{xiang4}": 149,
151
+ "帖{tie1}": 150,
152
+ "帖{tie3}": 151,
153
+ "帖{tie4}": 152,
154
+ "干{gan1}": 153,
155
+ "干{gan4}": 154,
156
+ "应{ying1}": 155,
157
+ "应{ying4}": 156,
158
+ "应{ying5}": 157,
159
+ "度{du4}": 158,
160
+ "度{du5}": 159,
161
+ "度{duo2}": 160,
162
+ "弹{dan4}": 161,
163
+ "弹{tan2}": 162,
164
+ "弹{tan5}": 163,
165
+ "强{jiang4}": 164,
166
+ "强{qiang2}": 165,
167
+ "强{qiang3}": 166,
168
+ "当{dang1}": 167,
169
+ "当{dang4}": 168,
170
+ "当{dang5}": 169,
171
+ "待{dai1}": 170,
172
+ "待{dai4}": 171,
173
+ "得{de2}": 172,
174
+ "得{de5}": 173,
175
+ "得{dei3}": 174,
176
+ "得{dei5}": 175,
177
+ "恶{e3}": 176,
178
+ "恶{e4}": 177,
179
+ "恶{wu4}": 178,
180
+ "扁{bian3}": 179,
181
+ "扁{pian1}": 180,
182
+ "扇{shan1}": 181,
183
+ "扇{shan4}": 182,
184
+ "扎{za1}": 183,
185
+ "扎{zha1}": 184,
186
+ "扎{zha2}": 185,
187
+ "扫{sao3}": 186,
188
+ "扫{sao4}": 187,
189
+ "担{dan1}": 188,
190
+ "担{dan4}": 189,
191
+ "担{dan5}": 190,
192
+ "挑{tiao1}": 191,
193
+ "挑{tiao3}": 192,
194
+ "据{jv1}": 193,
195
+ "据{jv4}": 194,
196
+ "撒{sa1}": 195,
197
+ "撒{sa3}": 196,
198
+ "撒{sa5}": 197,
199
+ "教{jiao1}": 198,
200
+ "教{jiao4}": 199,
201
+ "散{san3}": 200,
202
+ "散{san4}": 201,
203
+ "散{san5}": 202,
204
+ "数{shu3}": 203,
205
+ "数{shu4}": 204,
206
+ "数{shu5}": 205,
207
+ "斗{dou3}": 206,
208
+ "斗{dou4}": 207,
209
+ "晃{huang3}": 208,
210
+ "曝{bao4}": 209,
211
+ "曲{qu1}": 210,
212
+ "曲{qu3}": 211,
213
+ "更{geng1}": 212,
214
+ "更{geng4}": 213,
215
+ "曾{ceng1}": 214,
216
+ "曾{ceng2}": 215,
217
+ "曾{zeng1}": 216,
218
+ "朝{chao2}": 217,
219
+ "朝{zhao1}": 218,
220
+ "朴{piao2}": 219,
221
+ "朴{pu2}": 220,
222
+ "朴{pu3}": 221,
223
+ "杆{gan1}": 222,
224
+ "杆{gan3}": 223,
225
+ "查{cha2}": 224,
226
+ "查{zha1}": 225,
227
+ "校{jiao4}": 226,
228
+ "校{xiao4}": 227,
229
+ "模{mo2}": 228,
230
+ "模{mu2}": 229,
231
+ "横{heng2}": 230,
232
+ "横{heng4}": 231,
233
+ "没{mei2}": 232,
234
+ "没{mo4}": 233,
235
+ "泡{pao1}": 234,
236
+ "泡{pao4}": 235,
237
+ "泡{pao5}": 236,
238
+ "济{ji3}": 237,
239
+ "济{ji4}": 238,
240
+ "混{hun2}": 239,
241
+ "混{hun3}": 240,
242
+ "混{hun4}": 241,
243
+ "混{hun5}": 242,
244
+ "漂{piao1}": 243,
245
+ "漂{piao3}": 244,
246
+ "漂{piao4}": 245,
247
+ "炸{zha2}": 246,
248
+ "炸{zha4}": 247,
249
+ "熟{shou2}": 248,
250
+ "熟{shu2}": 249,
251
+ "燕{yan1}": 250,
252
+ "燕{yan4}": 251,
253
+ "片{pian1}": 252,
254
+ "片{pian4}": 253,
255
+ "率{lv4}": 254,
256
+ "率{shuai4}": 255,
257
+ "畜{chu4}": 256,
258
+ "畜{xu4}": 257,
259
+ "的{de5}": 258,
260
+ "的{di1}": 259,
261
+ "的{di2}": 260,
262
+ "的{di4}": 261,
263
+ "的{di5}": 262,
264
+ "盛{cheng2}": 263,
265
+ "盛{sheng4}": 264,
266
+ "相{xiang1}": 265,
267
+ "相{xiang4}": 266,
268
+ "相{xiang5}": 267,
269
+ "省{sheng3}": 268,
270
+ "省{xing3}": 269,
271
+ "看{kan1}": 270,
272
+ "看{kan4}": 271,
273
+ "看{kan5}": 272,
274
+ "着{zhao1}": 273,
275
+ "着{zhao2}": 274,
276
+ "着{zhao5}": 275,
277
+ "着{zhe5}": 276,
278
+ "着{zhuo2}": 277,
279
+ "着{zhuo5}": 278,
280
+ "矫{jiao3}": 279,
281
+ "禁{jin1}": 280,
282
+ "禁{jin4}": 281,
283
+ "种{zhong3}": 282,
284
+ "种{zhong4}": 283,
285
+ "称{chen4}": 284,
286
+ "称{cheng1}": 285,
287
+ "空{kong1}": 286,
288
+ "空{kong4}": 287,
289
+ "答{da1}": 288,
290
+ "答{da2}": 289,
291
+ "粘{nian2}": 290,
292
+ "粘{zhan1}": 291,
293
+ "糊{hu2}": 292,
294
+ "糊{hu5}": 293,
295
+ "系{ji4}": 294,
296
+ "系{xi4}": 295,
297
+ "系{xi5}": 296,
298
+ "累{lei2}": 297,
299
+ "累{lei3}": 298,
300
+ "累{lei4}": 299,
301
+ "累{lei5}": 300,
302
+ "纤{qian4}": 301,
303
+ "纤{xian1}": 302,
304
+ "结{jie1}": 303,
305
+ "结{jie2}": 304,
306
+ "结{jie5}": 305,
307
+ "给{gei3}": 306,
308
+ "给{gei5}": 307,
309
+ "给{ji3}": 308,
310
+ "缝{feng2}": 309,
311
+ "缝{feng4}": 310,
312
+ "缝{feng5}": 311,
313
+ "肖{xiao1}": 312,
314
+ "肖{xiao4}": 313,
315
+ "背{bei1}": 314,
316
+ "背{bei4}": 315,
317
+ "脏{zang1}": 316,
318
+ "脏{zang4}": 317,
319
+ "舍{she3}": 318,
320
+ "舍{she4}": 319,
321
+ "色{se4}": 320,
322
+ "色{shai3}": 321,
323
+ "落{lao4}": 322,
324
+ "落{luo4}": 323,
325
+ "蒙{meng1}": 324,
326
+ "蒙{meng2}": 325,
327
+ "蒙{meng3}": 326,
328
+ "薄{bao2}": 327,
329
+ "薄{bo2}": 328,
330
+ "薄{bo4}": 329,
331
+ "藏{cang2}": 330,
332
+ "藏{zang4}": 331,
333
+ "血{xie3}": 332,
334
+ "血{xue4}": 333,
335
+ "行{hang2}": 334,
336
+ "行{hang5}": 335,
337
+ "行{heng5}": 336,
338
+ "行{xing2}": 337,
339
+ "行{xing4}": 338,
340
+ "要{yao1}": 339,
341
+ "要{yao4}": 340,
342
+ "观{guan1}": 341,
343
+ "观{guan4}": 342,
344
+ "觉{jiao4}": 343,
345
+ "觉{jiao5}": 344,
346
+ "觉{jve2}": 345,
347
+ "角{jiao3}": 346,
348
+ "角{jve2}": 347,
349
+ "解{jie3}": 348,
350
+ "解{jie4}": 349,
351
+ "解{xie4}": 350,
352
+ "说{shui4}": 351,
353
+ "说{shuo1}": 352,
354
+ "调{diao4}": 353,
355
+ "调{tiao2}": 354,
356
+ "踏{ta1}": 355,
357
+ "踏{ta4}": 356,
358
+ "车{che1}": 357,
359
+ "车{jv1}": 358,
360
+ "转{zhuan3}": 359,
361
+ "转{zhuan4}": 360,
362
+ "载{zai3}": 361,
363
+ "载{zai4}": 362,
364
+ "还{hai2}": 363,
365
+ "还{huan2}": 364,
366
+ "遂{sui2}": 365,
367
+ "遂{sui4}": 366,
368
+ "都{dou1}": 367,
369
+ "都{du1}": 368,
370
+ "重{chong2}": 369,
371
+ "重{zhong4}": 370,
372
+ "量{liang2}": 371,
373
+ "量{liang4}": 372,
374
+ "量{liang5}": 373,
375
+ "钻{zuan1}": 374,
376
+ "钻{zuan4}": 375,
377
+ "铺{pu1}": 376,
378
+ "铺{pu4}": 377,
379
+ "长{chang2}": 378,
380
+ "长{chang3}": 379,
381
+ "长{zhang3}": 380,
382
+ "间{jian1}": 381,
383
+ "间{jian4}": 382,
384
+ "降{jiang4}": 383,
385
+ "降{xiang2}": 384,
386
+ "难{nan2}": 385,
387
+ "难{nan4}": 386,
388
+ "难{nan5}": 387,
389
+ "露{lou4}": 388,
390
+ "露{lu4}": 389,
391
+ "鲜{xian1}": 390,
392
+ "鲜{xian3}": 391
393
+ }
src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
src/YingMusicSinger/utils/f5_tts/g2p/sources/pinyin_2_bpmf.txt ADDED
@@ -0,0 +1,429 @@
1
+ a ㄚ
2
+ ai ㄞ
3
+ an ㄢ
4
+ ang ㄤ
5
+ ao ㄠ
6
+ ba ㄅㄚ
7
+ bai ㄅㄞ
8
+ ban ㄅㄢ
9
+ bang ㄅㄤ
10
+ bao ㄅㄠ
11
+ bei ㄅㄟ
12
+ ben ㄅㄣ
13
+ beng ㄅㄥ
14
+ bi ㄅㄧ
15
+ bian ㄅㄧㄢ
16
+ biang ㄅㄧㄤ
17
+ biao ㄅㄧㄠ
18
+ bie ㄅㄧㄝ
19
+ bin ㄅㄧㄣ
20
+ bing ㄅㄧㄥ
21
+ bo ㄅㄛ
22
+ bu ㄅㄨ
23
+ ca ㄘㄚ
24
+ cai ㄘㄞ
25
+ can ㄘㄢ
26
+ cang ㄘㄤ
27
+ cao ㄘㄠ
28
+ ce ㄘㄜ
29
+ cen ㄘㄣ
30
+ ceng ㄘㄥ
31
+ cha ㄔㄚ
32
+ chai ㄔㄞ
33
+ chan ㄔㄢ
34
+ chang ㄔㄤ
35
+ chao ㄔㄠ
36
+ che ㄔㄜ
37
+ chen ㄔㄣ
38
+ cheng ㄔㄥ
39
+ chi ㄔ
40
+ chong ㄔㄨㄥ
41
+ chou ㄔㄡ
42
+ chu ㄔㄨ
43
+ chua ㄔㄨㄚ
44
+ chuai ㄔㄨㄞ
45
+ chuan ㄔㄨㄢ
46
+ chuang ㄔㄨㄤ
47
+ chui ㄔㄨㄟ
48
+ chun ㄔㄨㄣ
49
+ chuo ㄔㄨㄛ
50
+ ci ㄘ
51
+ cong ㄘㄨㄥ
52
+ cou ㄘㄡ
53
+ cu ㄘㄨ
54
+ cuan ㄘㄨㄢ
55
+ cui ㄘㄨㄟ
56
+ cun ㄘㄨㄣ
57
+ cuo ㄘㄨㄛ
58
+ da ㄉㄚ
59
+ dai ㄉㄞ
60
+ dan ㄉㄢ
61
+ dang ㄉㄤ
62
+ dao ㄉㄠ
63
+ de ㄉㄜ
64
+ dei ㄉㄟ
65
+ den ㄉㄣ
66
+ deng ㄉㄥ
67
+ di ㄉㄧ
68
+ dia ㄉㄧㄚ
69
+ dian ㄉㄧㄢ
70
+ diao ㄉㄧㄠ
71
+ die ㄉㄧㄝ
72
+ din ㄉㄧㄣ
73
+ ding ㄉㄧㄥ
74
+ diu ㄉㄧㄡ
75
+ dong ㄉㄨㄥ
76
+ dou ㄉㄡ
77
+ du ㄉㄨ
78
+ duan ㄉㄨㄢ
79
+ dui ㄉㄨㄟ
80
+ dun ㄉㄨㄣ
81
+ duo ㄉㄨㄛ
82
+ e ㄜ
83
+ ei ㄟ
84
+ en ㄣ
85
+ eng ㄥ
86
+ er ㄦ
87
+ fa ㄈㄚ
88
+ fan ㄈㄢ
89
+ fang ㄈㄤ
90
+ fei ㄈㄟ
91
+ fen ㄈㄣ
92
+ feng ㄈㄥ
93
+ fo ㄈㄛ
94
+ fou ㄈㄡ
95
+ fu ㄈㄨ
96
+ ga ㄍㄚ
97
+ gai ㄍㄞ
98
+ gan ㄍㄢ
99
+ gang ㄍㄤ
100
+ gao ㄍㄠ
101
+ ge ㄍㄜ
102
+ gei ㄍㄟ
103
+ gen ㄍㄣ
104
+ geng ㄍㄥ
105
+ gong ㄍㄨㄥ
106
+ gou ㄍㄡ
107
+ gu ㄍㄨ
108
+ gua ㄍㄨㄚ
109
+ guai ㄍㄨㄞ
110
+ guan ㄍㄨㄢ
111
+ guang ㄍㄨㄤ
112
+ gui ㄍㄨㄟ
113
+ gun ㄍㄨㄣ
114
+ guo ㄍㄨㄛ
115
+ ha ㄏㄚ
116
+ hai ㄏㄞ
117
+ han ㄏㄢ
118
+ hang ㄏㄤ
119
+ hao ㄏㄠ
120
+ he ㄏㄜ
121
+ hei ㄏㄟ
122
+ hen ㄏㄣ
123
+ heng ㄏㄥ
124
+ hm ㄏㄇ
125
+ hong ㄏㄨㄥ
126
+ hou ㄏㄡ
127
+ hu ㄏㄨ
128
+ hua ㄏㄨㄚ
129
+ huai ㄏㄨㄞ
130
+ huan ㄏㄨㄢ
131
+ huang ㄏㄨㄤ
132
+ hui ㄏㄨㄟ
133
+ hun ㄏㄨㄣ
134
+ huo ㄏㄨㄛ
135
+ ji ㄐㄧ
136
+ jia ㄐㄧㄚ
137
+ jian ㄐㄧㄢ
138
+ jiang ㄐㄧㄤ
139
+ jiao ㄐㄧㄠ
140
+ jie ㄐㄧㄝ
141
+ jin ㄐㄧㄣ
142
+ jing ㄐㄧㄥ
143
+ jiong ㄐㄩㄥ
144
+ jiu ㄐㄧㄡ
145
+ ju ㄐㄩ
146
+ jv ㄐㄩ
147
+ juan ㄐㄩㄢ
148
+ jvan ㄐㄩㄢ
149
+ jue ㄐㄩㄝ
150
+ jve ㄐㄩㄝ
151
+ jun ㄐㄩㄣ
152
+ ka ㄎㄚ
153
+ kai ㄎㄞ
154
+ kan ㄎㄢ
155
+ kang ㄎㄤ
156
+ kao ㄎㄠ
157
+ ke ㄎㄜ
158
+ kei ㄎㄟ
159
+ ken ㄎㄣ
160
+ keng ㄎㄥ
161
+ kong ㄎㄨㄥ
162
+ kou ㄎㄡ
163
+ ku ㄎㄨ
164
+ kua ㄎㄨㄚ
165
+ kuai ㄎㄨㄞ
166
+ kuan ㄎㄨㄢ
167
+ kuang ㄎㄨㄤ
168
+ kui ㄎㄨㄟ
169
+ kun ㄎㄨㄣ
170
+ kuo ㄎㄨㄛ
171
+ la ㄌㄚ
172
+ lai ㄌㄞ
173
+ lan ㄌㄢ
174
+ lang ㄌㄤ
175
+ lao ㄌㄠ
176
+ le ㄌㄜ
177
+ lei ㄌㄟ
178
+ leng ㄌㄥ
179
+ li ㄌㄧ
180
+ lia ㄌㄧㄚ
181
+ lian ㄌㄧㄢ
182
+ liang ㄌㄧㄤ
183
+ liao ㄌㄧㄠ
184
+ lie ㄌㄧㄝ
185
+ lin ㄌㄧㄣ
186
+ ling ㄌㄧㄥ
187
+ liu ㄌㄧㄡ
188
+ lo ㄌㄛ
189
+ long ㄌㄨㄥ
190
+ lou ㄌㄡ
191
+ lu ㄌㄨ
192
+ luan ㄌㄨㄢ
193
+ lue ㄌㄩㄝ
194
+ lun ㄌㄨㄣ
195
+ luo ㄌㄨㄛ
196
+ lv ㄌㄩ
197
+ lve ㄌㄩㄝ
198
+ m ㄇㄨ
199
+ ma ㄇㄚ
200
+ mai ㄇㄞ
201
+ man ㄇㄢ
202
+ mang ㄇㄤ
203
+ mao ㄇㄠ
204
+ me ㄇㄜ
205
+ mei ㄇㄟ
206
+ men ㄇㄣ
207
+ meng ㄇㄥ
208
+ mi ㄇㄧ
209
+ mian ㄇㄧㄢ
210
+ miao ㄇㄧㄠ
211
+ mie ㄇㄧㄝ
212
+ min ㄇㄧㄣ
213
+ ming ㄇㄧㄥ
214
+ miu ㄇㄧㄡ
215
+ mo ㄇㄛ
216
+ mou ㄇㄡ
217
+ mu ㄇㄨ
218
+ n ㄣ
219
+ na ㄋㄚ
220
+ nai ㄋㄞ
221
+ nan ㄋㄢ
222
+ nang ㄋㄤ
223
+ nao ㄋㄠ
224
+ ne ㄋㄜ
225
+ nei ㄋㄟ
226
+ nen ㄋㄣ
227
+ neng ㄋㄥ
228
+ ng ㄣ
229
+ ni ㄋㄧ
230
+ nian ㄋㄧㄢ
231
+ niang ㄋㄧㄤ
232
+ niao ㄋㄧㄠ
233
+ nie ㄋㄧㄝ
234
+ nin ㄋㄧㄣ
235
+ ning ㄋㄧㄥ
236
+ niu ㄋㄧㄡ
237
+ nong ㄋㄨㄥ
238
+ nou ㄋㄡ
239
+ nu ㄋㄨ
240
+ nuan ㄋㄨㄢ
241
+ nue ㄋㄩㄝ
242
+ nun ㄋㄨㄣ
243
+ nuo ㄋㄨㄛ
244
+ nv ㄋㄩ
245
+ nve ㄋㄩㄝ
246
+ o ㄛ
247
+ ou ㄡ
248
+ pa ㄆㄚ
249
+ pai ㄆㄞ
250
+ pan ㄆㄢ
251
+ pang ㄆㄤ
252
+ pao ㄆㄠ
253
+ pei ㄆㄟ
254
+ pen ㄆㄣ
255
+ peng ㄆㄥ
256
+ pi ㄆㄧ
257
+ pian ㄆㄧㄢ
258
+ piao ㄆㄧㄠ
259
+ pie ㄆㄧㄝ
260
+ pin ㄆㄧㄣ
261
+ ping ㄆㄧㄥ
262
+ po ㄆㄛ
263
+ pou ㄆㄡ
264
+ pu ㄆㄨ
265
+ qi ㄑㄧ
266
+ qia ㄑㄧㄚ
267
+ qian ㄑㄧㄢ
268
+ qiang ㄑㄧㄤ
269
+ qiao ㄑㄧㄠ
270
+ qie ㄑㄧㄝ
271
+ qin ㄑㄧㄣ
272
+ qing ㄑㄧㄥ
273
+ qiong ㄑㄩㄥ
274
+ qiu ㄑㄧㄡ
275
+ qu ㄑㄩ
276
+ quan ㄑㄩㄢ
277
+ qvan ㄑㄩㄢ
278
+ que ㄑㄩㄝ
279
+ qun ㄑㄩㄣ
280
+ ran ㄖㄢ
281
+ rang ㄖㄤ
282
+ rao ㄖㄠ
283
+ re ㄖㄜ
284
+ ren ㄖㄣ
285
+ reng ㄖㄥ
286
+ ri ㄖ
287
+ rong ㄖㄨㄥ
288
+ rou ㄖㄡ
289
+ ru ㄖㄨ
290
+ rua ㄖㄨㄚ
291
+ ruan ㄖㄨㄢ
292
+ rui ㄖㄨㄟ
293
+ run ㄖㄨㄣ
294
+ ruo ㄖㄨㄛ
295
+ sa ㄙㄚ
296
+ sai ㄙㄞ
297
+ san ㄙㄢ
298
+ sang ㄙㄤ
299
+ sao ㄙㄠ
300
+ se ㄙㄜ
301
+ sen ㄙㄣ
302
+ seng ㄙㄥ
303
+ sha ㄕㄚ
304
+ shai ㄕㄞ
305
+ shan ㄕㄢ
306
+ shang ㄕㄤ
307
+ shao ㄕㄠ
308
+ she ㄕㄜ
309
+ shei ㄕㄟ
310
+ shen ㄕㄣ
311
+ sheng ㄕㄥ
312
+ shi ㄕ
313
+ shou ㄕㄡ
314
+ shu ㄕㄨ
315
+ shua ㄕㄨㄚ
316
+ shuai ㄕㄨㄞ
317
+ shuan ㄕㄨㄢ
318
+ shuang ㄕㄨㄤ
319
+ shui ㄕㄨㄟ
320
+ shun ㄕㄨㄣ
321
+ shuo ㄕㄨㄛ
322
+ si ㄙ
323
+ song ㄙㄨㄥ
324
+ sou ㄙㄡ
325
+ su ㄙㄨ
326
+ suan ㄙㄨㄢ
327
+ sui ㄙㄨㄟ
328
+ sun ㄙㄨㄣ
329
+ suo ㄙㄨㄛ
330
+ ta ㄊㄚ
331
+ tai ㄊㄞ
332
+ tan ㄊㄢ
333
+ tang ㄊㄤ
334
+ tao ㄊㄠ
335
+ te ㄊㄜ
336
+ tei ㄊㄟ
337
+ teng ㄊㄥ
338
+ ti ㄊㄧ
339
+ tian ㄊㄧㄢ
340
+ tiao ㄊㄧㄠ
341
+ tie ㄊㄧㄝ
342
+ ting ㄊㄧㄥ
343
+ tong ㄊㄨㄥ
344
+ tou ㄊㄡ
345
+ tsuo ㄘㄨㄛ
346
+ tu ㄊㄨ
347
+ tuan ㄊㄨㄢ
348
+ tui ㄊㄨㄟ
349
+ tun ㄊㄨㄣ
350
+ tuo ㄊㄨㄛ
351
+ tzan ㄗㄢ
352
+ wa ㄨㄚ
353
+ wai ㄨㄞ
354
+ wan ㄨㄢ
355
+ wang ㄨㄤ
356
+ wei ㄨㄟ
357
+ wen ㄨㄣ
358
+ weng ㄨㄥ
359
+ wo ㄨㄛ
360
+ wong ㄨㄥ
361
+ wu ㄨ
362
+ xi ㄒㄧ
363
+ xia ㄒㄧㄚ
364
+ xian ㄒㄧㄢ
365
+ xiang ㄒㄧㄤ
366
+ xiao ㄒㄧㄠ
367
+ xie ㄒㄧㄝ
368
+ xin ㄒㄧㄣ
369
+ xing ㄒㄧㄥ
370
+ xiong ㄒㄩㄥ
371
+ xiu ㄒㄧㄡ
372
+ xu ㄒㄩ
373
+ xuan ㄒㄩㄢ
374
+ xue ㄒㄩㄝ
375
+ xun ㄒㄩㄣ
376
+ ya ㄧㄚ
377
+ yai ㄧㄞ
378
+ yan ㄧㄢ
379
+ yang ㄧㄤ
380
+ yao ㄧㄠ
381
+ ye ㄧㄝ
382
+ yi ㄧ
383
+ yin ㄧㄣ
384
+ ying ㄧㄥ
385
+ yo ㄧㄛ
386
+ yong ㄩㄥ
387
+ you ㄧㄡ
388
+ yu ㄩ
389
+ yuan ㄩㄢ
390
+ yue ㄩㄝ
391
+ yve ㄩㄝ
392
+ yun ㄩㄣ
393
+ za ㄗㄚ
394
+ zai ㄗㄞ
395
+ zan ㄗㄢ
396
+ zang ㄗㄤ
397
+ zao ㄗㄠ
398
+ ze ㄗㄜ
399
+ zei ㄗㄟ
400
+ zen ㄗㄣ
401
+ zeng ㄗㄥ
402
+ zha ㄓㄚ
403
+ zhai ㄓㄞ
404
+ zhan ㄓㄢ
405
+ zhang ㄓㄤ
406
+ zhao ㄓㄠ
407
+ zhe ㄓㄜ
408
+ zhei ㄓㄟ
409
+ zhen ㄓㄣ
410
+ zheng ㄓㄥ
411
+ zhi ㄓ
412
+ zhong ㄓㄨㄥ
413
+ zhou ㄓㄡ
414
+ zhu ㄓㄨ
415
+ zhua ㄓㄨㄚ
416
+ zhuai ㄓㄨㄞ
417
+ zhuan ㄓㄨㄢ
418
+ zhuang ㄓㄨㄤ
419
+ zhui ㄓㄨㄟ
420
+ zhun ㄓㄨㄣ
421
+ zhuo ㄓㄨㄛ
422
+ zi ㄗ
423
+ zong ㄗㄨㄥ
424
+ zou ㄗㄡ
425
+ zu ㄗㄨ
426
+ zuan ㄗㄨㄢ
427
+ zui ㄗㄨㄟ
428
+ zun ㄗㄨㄣ
429
+ zuo ㄗㄨㄛ
src/YingMusicSinger/utils/f5_tts/g2p/utils/front_utils.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ def generate_poly_lexicon(file_path: str):
8
+ """Generate poly char lexicon for Mandarin Chinese."""
9
+ poly_dict = {}
10
+
11
+ with open(file_path, "r", encoding="utf-8") as readf:
12
+ txt_list = readf.readlines()
13
+ for txt in txt_list:
14
+ word = txt.strip("\n")
15
+ if word not in poly_dict:
16
+ poly_dict[word] = 1
17
+ readf.close()
18
+ return poly_dict
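generate_poly_lexicon above collects one polyphonic character per line into a dict keyed by character, used as a membership set. A usage sketch, pointing at the polychar.txt file added in this commit:

poly_dict = generate_poly_lexicon(
    "src/YingMusicSinger/utils/f5_tts/g2p/sources/g2p_chinese_model/polychar.txt"
)
print(len(poly_dict))     # 159 polyphonic characters
print("便" in poly_dict)  # True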
src/YingMusicSinger/utils/f5_tts/g2p/utils/g2p.py ADDED
@@ -0,0 +1,139 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ from typing import List, Union
9
+
10
+ from phonemizer.backend import EspeakBackend
11
+ from phonemizer.separator import Separator
12
+ from phonemizer.utils import list2str, str2list
13
+
14
+ # separator=Separator(phone=' ', word=' _ ', syllable='|'),
15
+ separator = Separator(word=" _ ", syllable="|", phone=" ")
16
+
17
+ phonemizer_zh = EspeakBackend(
18
+ "cmn", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
19
+ )
20
+ # phonemizer_zh.separator = separator
21
+
22
+ phonemizer_en = EspeakBackend(
23
+ "en-us",
24
+ preserve_punctuation=False,
25
+ with_stress=False,
26
+ language_switch="remove-flags",
27
+ )
28
+ # phonemizer_en.separator = separator
29
+
30
+ phonemizer_ja = EspeakBackend(
31
+ "ja", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
32
+ )
33
+ # phonemizer_ja.separator = separator
34
+
35
+ phonemizer_ko = EspeakBackend(
36
+ "ko", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
37
+ )
38
+ # phonemizer_ko.separator = separator
39
+
40
+ phonemizer_fr = EspeakBackend(
41
+ "fr-fr",
42
+ preserve_punctuation=False,
43
+ with_stress=False,
44
+ language_switch="remove-flags",
45
+ )
46
+ # phonemizer_fr.separator = separator
47
+
48
+ phonemizer_de = EspeakBackend(
49
+ "de", preserve_punctuation=False, with_stress=False, language_switch="remove-flags"
50
+ )
51
+ # phonemizer_de.separator = separator
52
+
53
+
54
+ lang2backend = {
55
+ "zh": phonemizer_zh,
56
+ "ja": phonemizer_ja,
57
+ "en": phonemizer_en,
58
+ "fr": phonemizer_fr,
59
+ "ko": phonemizer_ko,
60
+ "de": phonemizer_de,
61
+ }
62
+
63
+ with open("./src/YingMusicSinger/utils/f5_tts/g2p/utils/mls_en.json", "r") as f:
64
+ json_data = f.read()
65
+ token = json.loads(json_data)
66
+
67
+
68
+ def phonemizer_g2p(text, language):
69
+ langbackend = lang2backend[language]
70
+ phonemes = _phonemize(
71
+ langbackend,
72
+ text,
73
+ separator,
74
+ strip=True,
75
+ njobs=1,
76
+ prepend_text=False,
77
+ preserve_empty_lines=False,
78
+ )
79
+ token_id = []
80
+ if isinstance(phonemes, list):
81
+ for phone in phonemes:
82
+ phonemes_split = phone.split(" ")
83
+ token_id.append([token[p] for p in phonemes_split if p in token])
84
+ else:
85
+ phonemes_split = phonemes.split(" ")
86
+ token_id = [token[p] for p in phonemes_split if p in token]
87
+ return phonemes, token_id
88
+
89
+
90
+ def _phonemize( # pylint: disable=too-many-arguments
91
+ backend,
92
+ text: Union[str, List[str]],
93
+ separator: Separator,
94
+ strip: bool,
95
+ njobs: int,
96
+ prepend_text: bool,
97
+ preserve_empty_lines: bool,
98
+ ):
99
+ """Auxiliary function to phonemize()
100
+
101
+ Does the phonemization and returns the phonemized text. Raises a
102
+ RuntimeError on error.
103
+
104
+ """
105
+ # remember the text type for output (either list or string)
106
+ text_type = type(text)
107
+
108
+ # force the text as a list
109
+ text = [line.strip(os.linesep) for line in str2list(text)]
110
+
111
+ # if preserving empty lines, note the index of each empty line
112
+ if preserve_empty_lines:
113
+ empty_lines = [n for n, line in enumerate(text) if not line.strip()]
114
+
115
+ # ignore empty lines
116
+ text = [line for line in text if line.strip()]
117
+
118
+ if text:
119
+ # phonemize the text
120
+ phonemized = backend.phonemize(
121
+ text, separator=separator, strip=strip, njobs=njobs
122
+ )
123
+ else:
124
+ phonemized = []
125
+
126
+ # if preserving empty lines, reinsert them into text and phonemized lists
127
+ if preserve_empty_lines:
128
+ for i in empty_lines: # noqa
129
+ if prepend_text:
130
+ text.insert(i, "")
131
+ phonemized.insert(i, "")
132
+
133
+ # at that point, the phonemized text is a list of str. Format it as
134
+ # expected by the parameters
135
+ if prepend_text:
136
+ return list(zip(text, phonemized))
137
+ if text_type == str:
138
+ return list2str(phonemized)
139
+ return phonemized
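phonemizer_g2p above runs the espeak-ng backend selected by the language code and returns both the phoneme string (phones separated by spaces, words by " _ ") and the ids looked up in mls_en.json. A usage sketch, assuming the phonemizer package and espeak-ng are installed and the script runs from the repository root so the relative mls_en.json path resolves:

phones, ids = phonemizer_g2p("hello world", "en")
print(phones)  # e.g. "h ə l oʊ _ w ɜː l d" (exact phones depend on the espeak-ng version)
print(ids)     # token ids from mls_en.json for phones found in the vocab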
src/YingMusicSinger/utils/f5_tts/g2p/utils/log.py ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright (c) 2024 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import functools
8
+ import logging
9
+
10
+ __all__ = [
11
+ "logger",
12
+ ]
13
+
14
+
15
+ class Logger(object):
16
+ def __init__(self, name: str = None):
17
+ name = "PaddleSpeech" if not name else name
18
+ self.logger = logging.getLogger(name)
19
+
20
+ log_config = {
21
+ "DEBUG": 10,
22
+ "INFO": 20,
23
+ "TRAIN": 21,
24
+ "EVAL": 22,
25
+ "WARNING": 30,
26
+ "ERROR": 40,
27
+ "CRITICAL": 50,
28
+ "EXCEPTION": 100,
29
+ }
30
+ for key, level in log_config.items():
31
+ logging.addLevelName(level, key)
32
+ if key == "EXCEPTION":
33
+ self.__dict__[key.lower()] = self.logger.exception
34
+ else:
35
+ self.__dict__[key.lower()] = functools.partial(self.__call__, level)
36
+
37
+ self.format = logging.Formatter(
38
+ fmt="[%(asctime)-15s] [%(levelname)8s] - %(message)s"
39
+ )
40
+
41
+ self.handler = logging.StreamHandler()
42
+ self.handler.setFormatter(self.format)
43
+
44
+ self.logger.addHandler(self.handler)
45
+ self.logger.setLevel(logging.INFO)
46
+ self.logger.propagate = False
47
+
48
+ def __call__(self, log_level: str, msg: str):
49
+ self.logger.log(log_level, msg)
50
+
51
+
52
+ logger = Logger()
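The Logger wrapper above registers extra levels (TRAIN=21, EVAL=22, and so on) and exposes each as a lowercase method through functools.partial, so the module-level logger can be used as below (the messages are illustrative only):

logger.info("loading g2p resources")
logger.train("step 100: loss 0.4219")   # custom TRAIN level registered at value 21
logger.warning("lexicon entry missing for character")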
src/YingMusicSinger/utils/f5_tts/g2p/utils/mls_en.json ADDED
@@ -0,0 +1,335 @@
1
+ {
2
+ "[UNK]": 0,
3
+ "_": 1,
4
+ "b": 2,
5
+ "d": 3,
6
+ "f": 4,
7
+ "h": 5,
8
+ "i": 6,
9
+ "j": 7,
10
+ "k": 8,
11
+ "l": 9,
12
+ "m": 10,
13
+ "n": 11,
14
+ "p": 12,
15
+ "r": 13,
16
+ "s": 14,
17
+ "t": 15,
18
+ "v": 16,
19
+ "w": 17,
20
+ "x": 18,
21
+ "z": 19,
22
+ "æ": 20,
23
+ "ç": 21,
24
+ "ð": 22,
25
+ "ŋ": 23,
26
+ "ɐ": 24,
27
+ "ɔ": 25,
28
+ "ə": 26,
29
+ "ɚ": 27,
30
+ "ɛ": 28,
31
+ "ɡ": 29,
32
+ "ɪ": 30,
33
+ "ɬ": 31,
34
+ "ɹ": 32,
35
+ "ɾ": 33,
36
+ "ʃ": 34,
37
+ "ʊ": 35,
38
+ "ʌ": 36,
39
+ "ʒ": 37,
40
+ "ʔ": 38,
41
+ "θ": 39,
42
+ "ᵻ": 40,
43
+ "aɪ": 41,
44
+ "aʊ": 42,
45
+ "dʒ": 43,
46
+ "eɪ": 44,
47
+ "iə": 45,
48
+ "iː": 46,
49
+ "n̩": 47,
50
+ "oʊ": 48,
51
+ "oː": 49,
52
+ "tʃ": 50,
53
+ "uː": 51,
54
+ "ææ": 52,
55
+ "ɐɐ": 53,
56
+ "ɑː": 54,
57
+ "ɑ̃": 55,
58
+ "ɔɪ": 56,
59
+ "ɔː": 57,
60
+ "ɔ̃": 58,
61
+ "əl": 59,
62
+ "ɛɹ": 60,
63
+ "ɜː": 61,
64
+ "ɡʲ": 62,
65
+ "ɪɹ": 63,
66
+ "ʊɹ": 64,
67
+ "aɪə": 65,
68
+ "aɪɚ": 66,
69
+ "iːː": 67,
70
+ "oːɹ": 68,
71
+ "ɑːɹ": 69,
72
+ "ɔːɹ": 70,
73
+
74
+ "1": 71,
75
+ "a": 72,
76
+ "e": 73,
77
+ "o": 74,
78
+ "q": 75,
79
+ "u": 76,
80
+ "y": 77,
81
+ "ɑ": 78,
82
+ "ɒ": 79,
83
+ "ɕ": 80,
84
+ "ɣ": 81,
85
+ "ɫ": 82,
86
+ "ɯ": 83,
87
+ "ʐ": 84,
88
+ "ʲ": 85,
89
+ "a1": 86,
90
+ "a2": 87,
91
+ "a5": 88,
92
+ "ai": 89,
93
+ "aɜ": 90,
94
+ "aː": 91,
95
+ "ei": 92,
96
+ "eə": 93,
97
+ "i.": 94,
98
+ "i1": 95,
99
+ "i2": 96,
100
+ "i5": 97,
101
+ "io": 98,
102
+ "iɑ": 99,
103
+ "iɛ": 100,
104
+ "iɜ": 101,
105
+ "i̪": 102,
106
+ "kh": 103,
107
+ "nʲ": 104,
108
+ "o1": 105,
109
+ "o2": 106,
110
+ "o5": 107,
111
+ "ou": 108,
112
+ "oɜ": 109,
113
+ "ph": 110,
114
+ "s.": 111,
115
+ "th": 112,
116
+ "ts": 113,
117
+ "tɕ": 114,
118
+ "u1": 115,
119
+ "u2": 116,
120
+ "u5": 117,
121
+ "ua": 118,
122
+ "uo": 119,
123
+ "uə": 120,
124
+ "uɜ": 121,
125
+ "y1": 122,
126
+ "y2": 123,
127
+ "y5": 124,
128
+ "yu": 125,
129
+ "yæ": 126,
130
+ "yə": 127,
131
+ "yɛ": 128,
132
+ "yɜ": 129,
133
+ "ŋɜ": 130,
134
+ "ŋʲ": 131,
135
+ "ɑ1": 132,
136
+ "ɑ2": 133,
137
+ "ɑ5": 134,
138
+ "ɑu": 135,
139
+ "ɑɜ": 136,
140
+ "ɑʲ": 137,
141
+ "ə1": 138,
142
+ "ə2": 139,
143
+ "ə5": 140,
144
+ "ər": 141,
145
+ "əɜ": 142,
146
+ "əʊ": 143,
147
+ "ʊə": 144,
148
+ "ai1": 145,
149
+ "ai2": 146,
150
+ "ai5": 147,
151
+ "aiɜ": 148,
152
+ "ei1": 149,
153
+ "ei2": 150,
154
+ "ei5": 151,
155
+ "eiɜ": 152,
156
+ "i.1": 153,
157
+ "i.2": 154,
158
+ "i.5": 155,
159
+ "i.ɜ": 156,
160
+ "io5": 157,
161
+ "iou": 158,
162
+ "iɑ1": 159,
163
+ "iɑ2": 160,
164
+ "iɑ5": 161,
165
+ "iɑɜ": 162,
166
+ "iɛ1": 163,
167
+ "iɛ2": 164,
168
+ "iɛ5": 165,
169
+ "iɛɜ": 166,
170
+ "i̪1": 167,
171
+ "i̪2": 168,
172
+ "i̪5": 169,
173
+ "i̪ɜ": 170,
174
+ "onɡ": 171,
175
+ "ou1": 172,
176
+ "ou2": 173,
177
+ "ou5": 174,
178
+ "ouɜ": 175,
179
+ "ts.": 176,
180
+ "tsh": 177,
181
+ "tɕh": 178,
182
+ "u5ʲ": 179,
183
+ "ua1": 180,
184
+ "ua2": 181,
185
+ "ua5": 182,
186
+ "uai": 183,
187
+ "uaɜ": 184,
188
+ "uei": 185,
189
+ "uo1": 186,
190
+ "uo2": 187,
191
+ "uo5": 188,
192
+ "uoɜ": 189,
193
+ "uə1": 190,
194
+ "uə2": 191,
195
+ "uə5": 192,
196
+ "uəɜ": 193,
197
+ "yiɜ": 194,
198
+ "yu2": 195,
199
+ "yu5": 196,
200
+ "yæ2": 197,
201
+ "yæ5": 198,
202
+ "yæɜ": 199,
203
+ "yə2": 200,
204
+ "yə5": 201,
205
+ "yəɜ": 202,
206
+ "yɛ1": 203,
207
+ "yɛ2": 204,
208
+ "yɛ5": 205,
209
+ "yɛɜ": 206,
210
+ "ɑu1": 207,
211
+ "ɑu2": 208,
212
+ "ɑu5": 209,
213
+ "ɑuɜ": 210,
214
+ "ər1": 211,
215
+ "ər2": 212,
216
+ "ər5": 213,
217
+ "ərɜ": 214,
218
+ "əː1": 215,
219
+ "iou1": 216,
220
+ "iou2": 217,
221
+ "iou5": 218,
222
+ "iouɜ": 219,
223
+ "onɡ1": 220,
224
+ "onɡ2": 221,
225
+ "onɡ5": 222,
226
+ "onɡɜ": 223,
227
+ "ts.h": 224,
228
+ "uai2": 225,
229
+ "uai5": 226,
230
+ "uaiɜ": 227,
231
+ "uei1": 228,
232
+ "uei2": 229,
233
+ "uei5": 230,
234
+ "ueiɜ": 231,
235
+ "uoɜʲ": 232,
236
+ "yɛ5ʲ": 233,
237
+ "ɑu2ʲ": 234,
238
+
239
+ "2": 235,
240
+ "5": 236,
241
+ "ɜ": 237,
242
+ "ʂ": 238,
243
+ "dʑ": 239,
244
+ "iɪ": 240,
245
+ "uɪ": 241,
246
+ "xʲ": 242,
247
+ "ɑt": 243,
248
+ "ɛɜ": 244,
249
+ "ɛː": 245,
250
+ "ɪː": 246,
251
+ "phʲ": 247,
252
+ "ɑ5ʲ": 248,
253
+ "ɑuʲ": 249,
254
+ "ərə": 250,
255
+ "uozʰ": 251,
256
+ "ər1ʲ": 252,
257
+ "tɕhtɕh": 253,
258
+
259
+ "c": 254,
260
+ "ʋ": 255,
261
+ "ʍ": 256,
262
+ "ʑ": 257,
263
+ "ː": 258,
264
+ "aə": 259,
265
+ "eː": 260,
266
+ "hʲ": 261,
267
+ "iʊ": 262,
268
+ "kʲ": 263,
269
+ "lʲ": 264,
270
+ "oə": 265,
271
+ "oɪ": 266,
272
+ "oʲ": 267,
273
+ "pʲ": 268,
274
+ "sʲ": 269,
275
+ "u4": 270,
276
+ "uʲ": 271,
277
+ "yi": 272,
278
+ "yʲ": 273,
279
+ "ŋ2": 274,
280
+ "ŋ5": 275,
281
+ "ŋ̩": 276,
282
+ "ɑɪ": 277,
283
+ "ɑʊ": 278,
284
+ "ɕʲ": 279,
285
+ "ət": 280,
286
+ "əə": 281,
287
+ "əɪ": 282,
288
+ "əʲ": 283,
289
+ "ɛ1": 284,
290
+ "ɛ5": 285,
291
+ "aiə": 286,
292
+ "aiɪ": 287,
293
+ "azʰ": 288,
294
+ "eiə": 289,
295
+ "eiɪ": 290,
296
+ "eiʊ": 291,
297
+ "i.ə": 292,
298
+ "i.ɪ": 293,
299
+ "i.ʊ": 294,
300
+ "ioɜ": 295,
301
+ "izʰ": 296,
302
+ "iɑə": 297,
303
+ "iɑʊ": 298,
304
+ "iɑʲ": 299,
305
+ "iɛə": 300,
306
+ "iɛɪ": 301,
307
+ "iɛʊ": 302,
308
+ "i̪ə": 303,
309
+ "i̪ʊ": 304,
310
+ "khʲ": 305,
311
+ "ouʲ": 306,
312
+ "tsʲ": 307,
313
+ "u2ʲ": 308,
314
+ "uoɪ": 309,
315
+ "uzʰ": 310,
316
+ "uɜʲ": 311,
317
+ "yæɪ": 312,
318
+ "yəʊ": 313,
319
+ "ərt": 314,
320
+ "ərɪ": 315,
321
+ "ərʲ": 316,
322
+ "əːt": 317,
323
+ "iouə": 318,
324
+ "iouʊ": 319,
325
+ "iouʲ": 320,
326
+ "iɛzʰ": 321,
327
+ "onɡə": 322,
328
+ "onɡɪ": 323,
329
+ "onɡʊ": 324,
330
+ "ouzʰ": 325,
331
+ "uai1": 326,
332
+ "ueiɪ": 327,
333
+ "ɑuzʰ": 328,
334
+ "iouzʰ": 329
335
+ }
src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/LangSegment.py ADDED
@@ -0,0 +1,1251 @@
1
+ """
2
+ This file bundles language identification functions.
3
+
4
+ Modifications (fork): Copyright (c) 2021, Adrien Barbaresi.
5
+
6
+ Original code: Copyright (c) 2011 Marco Lui <saffsd@gmail.com>.
7
+ Based on research by Marco Lui and Tim Baldwin.
8
+
9
+ See LICENSE file for more info.
10
+ https://github.com/adbar/py3langid
11
+
12
+ Projects:
13
+ https://github.com/juntaosun/LangSegment
14
+ """
15
+
16
+ import re
17
+ from collections import Counter, defaultdict
18
+
19
+ import numpy as np
20
+
21
+ # import langid
22
+ # import py3langid as langid
23
+ # pip install py3langid==0.2.2
24
+ # 启用语言预测概率归一化,概率预测的分数。因此,实现重新规范化 产生 0-1 范围内的输出。
25
+ # langid disables probability normalization by default. For command-line usages of , it can be enabled by passing the flag.
26
+ # For probability normalization in library use, the user must instantiate their own . An example of such usage is as follows:
27
+ from py3langid.langid import MODEL_FILE, LanguageIdentifier
28
+
29
+ langid = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True)
30
+
31
+ # Digital processing
32
+ try:
33
+ from src.YingMusicSinger.utils.f5_tts.thirdparty.LangSegment.utils.num import (
34
+ num2str,
35
+ )
36
+ except ImportError:
37
+ try:
38
+ from thirdparty.LangSegment.utils.num import num2str
39
+ except ImportError as e:
40
+ raise e
41
+
42
+ # -----------------------------------
43
+ # 更新日志:新版本分词更加精准。
44
+ # Changelog: The new version of the word segmentation is more accurate.
45
+ # チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。
46
+ # Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다.
47
+ # -----------------------------------
48
+
49
+
50
+ # Word segmentation function:
51
+ # automatically identify and split the words (Chinese/English/Japanese/Korean) in the article or sentence according to different languages,
52
+ # making it more suitable for TTS processing.
53
+ # This code is designed for front-end text multi-lingual mixed annotation distinction, multi-language mixed training and inference of various TTS projects.
54
+ # This processing result is mainly for (Chinese = zh, Japanese = ja, English = en, Korean = ko), and can actually support up to 97 different language mixing processing.
55
+
56
+ # ===========================================================================================================
57
+ # 分かち書き機能:文章や文章の中の例えば(中国語/英語/日本語/韓国語)を、異なる言語で自動的に認識して分割し、TTS処理により適したものにします。
58
+ # このコードは、さまざまなTTSプロジェクトのフロントエンドテキストの多言語混合注釈区別、多言語混合トレーニング、および推論のために特別に作成されています。
59
+ # ===========================================================================================================
60
+ # (1)自動分詞:「韓国語では何を読むのですかあなたの体育の先生は誰ですか?今回の発表会では、iPhone 15シリーズの4機種が登場しました」
61
+ # (2)手动分词:“あなたの名前は<ja>佐々木ですか?<ja>ですか?”
62
+ # この処理結果は主に(中国語=ja、日本語=ja、英語=en、韓国語=ko)を対象としており、実際には最大97の異なる言語の混合処理をサポートできます。
63
+ # ===========================================================================================================
64
+
65
+ # ===========================================================================================================
66
+ # 단어 분할 기능: 기사 또는 문장에서 단어(중국어/영어/일본어/한국어)를 다른 언어에 따라 자동으로 식별하고 분할하여 TTS 처리에 더 적합합니다.
67
+ # 이 코드는 프런트 엔드 텍스트 다국어 혼합 주석 분화, 다국어 혼합 교육 및 다양한 TTS 프로젝트의 추론을 위해 설계되었습니다.
68
+ # ===========================================================================================================
69
+ # (1) 자동 단어 분할: "한국어로 무엇을 읽습니까? 스포츠 씨? 이 컨퍼런스는 4개의 iPhone 15 시리즈 모델을 제공합니다."
70
+ # (2) 수동 참여: "이름이 <ja>Saki입니까? <ja>?"
71
+ # 이 처리 결과는 주로 (중국어 = zh, 일본어 = ja, 영어 = en, 한국어 = ko)를 위한 것이며 실제로 혼합 처리를 위해 최대 97개의 언어를 지원합니다.
72
+ # ===========================================================================================================
73
+
74
+ # ===========================================================================================================
75
+ # 分词功能:将文章或句子里的例如(中/英/日/韩),按不同语言自动识别并拆分,让它更适合TTS处理。
76
+ # 本代码专为各种 TTS 项目的前端文本多语种混合标注区分,多语言混合训练和推理而编写。
77
+ # ===========================================================================================================
78
+ # (1)自动分词:“韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型”
79
+ # (2)手动分词:“你的名字叫<ja>佐々木?<ja>吗?”
80
+ # 本处理结果主要针对(中文=zh , 日文=ja , 英文=en , 韩语=ko), 实际上可支持多达 97 种不同的语言混合处理。
81
+ # ===========================================================================================================
82
+
83
+
84
+ # 手动分词标签规范:<语言标签>文本内容</语言标签>
85
+ # 수동 단어 분할 태그 사양: <언어 태그> 텍스트 내용</언어 태그>
86
+ # Manual word segmentation tag specification: <language tags> text content </language tags>
87
+ # 手動分詞タグ仕様:<言語タグ>テキスト内容</言語タグ>
88
+ # ===========================================================================================================
89
+ # For manual word segmentation, labels need to appear in pairs, such as:
90
+ # 如需手动分词,标签需要成对出现,例如:“<ja>佐々木<ja>” 或者 “<ja>佐々木</ja>”
91
+ # 错误示范:“你的名字叫<ja>佐々木。” 此句子中出现的单个<ja>标签将被忽略,不会处理。
92
+ # Error demonstration: "Your name is <ja>佐々木。" Single <ja> tags that appear in this sentence will be ignored and will not be processed.
93
+ # ===========================================================================================================
94
+
95
+
96
+ # ===========================================================================================================
97
+ # 语音合成标记语言 SSML , 这里只支持它的标签(非 XML)Speech Synthesis Markup Language SSML, only its tags are supported here (not XML)
98
+ # 想支持更多的 SSML 标签?欢迎 PR! Want to support more SSML tags? PRs are welcome!
99
+ # 说明:除了中文以外,它也可改造成支持多语种 SSML ,不仅仅是中文。
100
+ # Note: In addition to Chinese, it can also be modified to support multi-language SSML, not just Chinese.
101
+ # ===========================================================================================================
102
+ # 中文实现:Chinese implementation:
103
+ # 【SSML】<number>=中文大写数字读法(单字)
104
+ # 【SSML】<telephone>=数字转成中文电话号码大写汉字(单字)
105
+ # 【SSML】<currency>=按金额发音。
106
+ # 【SSML】<date>=按日期发音。支持 2024年08月24, 2024/8/24, 2024-08, 08-24, 24 等输入。
107
+ # ===========================================================================================================
108
+ class LangSSML:
109
+ # 纯数字
110
+ _zh_numerals_number = {
111
+ "0": "零",
112
+ "1": "一",
113
+ "2": "二",
114
+ "3": "三",
115
+ "4": "四",
116
+ "5": "五",
117
+ "6": "六",
118
+ "7": "七",
119
+ "8": "八",
120
+ "9": "九",
121
+ }
122
+
123
+ # 将2024/8/24, 2024-08, 08-24, 24 标准化“年月日”
124
+ # Standardize 2024/8/24, 2024-08, 08-24, 24 to "year-month-day"
125
+ def _format_chinese_data(date_str: str):
126
+ # 处理日期格式
127
+ input_date = date_str
128
+ if date_str is None or date_str.strip() == "":
129
+ return ""
130
+ date_str = re.sub(r"[\/\._|年|月]", "-", date_str)
131
+ date_str = re.sub(r"日", r"", date_str)
132
+ date_arrs = date_str.split(" ")
133
+ if len(date_arrs) == 1 and ":" in date_arrs[0]:
134
+ time_str = date_arrs[0]
135
+ date_arrs = []
136
+ else:
137
+ time_str = date_arrs[1] if len(date_arrs) >= 2 else ""
138
+
139
+ def nonZero(num, cn, func=None):
140
+ if func is not None:
141
+ num = func(num)
142
+ return f"{num}{cn}" if num is not None and num != "" and num != "0" else ""
143
+
144
+ f_number = LangSSML.to_chinese_number
145
+ f_currency = LangSSML.to_chinese_currency
146
+ # year, month, day
147
+ year_month_day = ""
148
+ if len(date_arrs) > 0:
149
+ year, month, day = "", "", ""
150
+ parts = date_arrs[0].split("-")
151
+ if len(parts) == 3: # 格式为 YYYY-MM-DD
152
+ year, month, day = parts
153
+ elif len(parts) == 2: # 格式为 MM-DD 或 YYYY-MM
154
+ if len(parts[0]) == 4: # 年-月
155
+ year, month = parts
156
+ else:
157
+ month, day = parts # 月-日
158
+ elif len(parts[0]) > 0: # 仅有月-日或年
159
+ if len(parts[0]) == 4:
160
+ year = parts[0]
161
+ else:
162
+ day = parts[0]
163
+ year, month, day = (
164
+ nonZero(year, "年", f_number),
165
+ nonZero(month, "月", f_currency),
166
+ nonZero(day, "日", f_currency),
167
+ )
168
+ year_month_day = re.sub(r"([年|月|日])+", r"\1", f"{year}{month}{day}")
169
+ # hours, minutes, seconds
170
+ time_str = re.sub(r"[\/\.\-:_]", ":", time_str)
171
+ time_arrs = time_str.split(":")
172
+ hours, minutes, seconds = "", "", ""
173
+ if len(time_arrs) == 3: # H/M/S
174
+ hours, minutes, seconds = time_arrs
175
+ elif len(time_arrs) == 2: # H/M
176
+ hours, minutes = time_arrs
177
+ elif len(time_arrs[0]) > 0:
178
+ hours = f"{time_arrs[0]}点" # H
179
+ if len(time_arrs) > 1:
180
+ hours, minutes, seconds = (
181
+ nonZero(hours, "点", f_currency),
182
+ nonZero(minutes, "分", f_currency),
183
+ nonZero(seconds, "秒", f_currency),
184
+ )
185
+ hours_minutes_seconds = re.sub(
186
+ r"([点|分|秒])+", r"\1", f"{hours}{minutes}{seconds}"
187
+ )
188
+ output_date = f"{year_month_day}{hours_minutes_seconds}"
189
+ return output_date
190
+
191
+ # 【SSML】number=中文大写数字读法(单字)
192
+ # Chinese Numbers(single word)
193
+ def to_chinese_number(num: str):
194
+ pattern = r"(\d+)"
195
+ zh_numerals = LangSSML._zh_numerals_number
196
+ arrs = re.split(pattern, num)
197
+ output = ""
198
+ for item in arrs:
199
+ if re.match(pattern, item):
200
+ output += "".join(
201
+ zh_numerals[digit] if digit in zh_numerals else ""
202
+ for digit in str(item)
203
+ )
204
+ else:
205
+ output += item
206
+ output = output.replace(".", "点")
207
+ return output
208
+
209
+ # 【SSML】telephone=数字转成中文电话号码大写汉字(单字)
210
+ # Convert numbers to Chinese phone numbers in uppercase Chinese characters(single word)
211
+ def to_chinese_telephone(num: str):
212
+ output = LangSSML.to_chinese_number(num.replace("+86", "")) # zh +86
213
+ output = output.replace("一", "幺")
214
+ return output
215
+
216
+ # 【SSML】currency=按金额发音。
217
+ # Digital processing from GPT_SoVITS num.py (thanks)
218
+ def to_chinese_currency(num: str):
219
+ pattern = r"(\d+)"
220
+ arrs = re.split(pattern, num)
221
+ output = ""
222
+ for item in arrs:
223
+ if re.match(pattern, item):
224
+ output += num2str(item)
225
+ else:
226
+ output += item
227
+ output = output.replace(".", "点")
228
+ return output
229
+
230
+ # 【SSML】date=按日期发音。支持 2024年08月24, 2024/8/24, 2024-08, 08-24, 24 等输入。
231
+ def to_chinese_date(num: str):
232
+ chinese_date = LangSSML._format_chinese_data(num)
233
+ return chinese_date
234
+
235
+
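+ # Illustrative expectations for the converters above (a sketch, not part of
+ # the original file; exact readings depend on num2str from utils/num.py):
+ #   LangSSML.to_chinese_number("315")            -> "三一五"  (digit by digit)
+ #   LangSSML.to_chinese_telephone("13800138000") -> "幺三八零零幺三八零零零"
+ #   LangSSML.to_chinese_currency("315")          -> "三百一十五"
+ #   LangSSML.to_chinese_date("2024-08-24")       -> roughly "二零二四年八月二十四日"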
236
+ class LangSegment:
237
+ _text_cache = None
238
+ _text_lasts = None
239
+ _text_langs = None
240
+ _lang_count = None
241
+ _lang_eos = None
242
+
243
+ # 可自定义语言匹配标签:カスタマイズ可能な言語対応タグ:사용자 지정 가능한 언어 일치 태그:
244
+ # Customizable language matching tags: These are supported,이 표현들은 모두 지지합니다
245
+ # <zh>你好<zh> , <ja>佐々木</ja> , <en>OK<en> , <ko>오빠</ko> 这些写法均支持
246
+ SYMBOLS_PATTERN = r"(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)"
247
+
248
+ # 语言过滤组功能, 可以指定保留语言。不在过滤组中的语言将被清除。您可随心搭配TTS语音合成所支持的语言。
249
+ # 언어 필터 그룹 기능을 사용하면 예약된 언어를 지정할 수 있습니다. 필터 그룹에 없는 언어는 지워집니다. TTS 텍스트에서 지원하는 언어를 원하는 대로 일치시킬 수 있습니다.
250
+ # 言語フィルターグループ機能では、予約言語を指定できます。フィルターグループに含まれていない言語はクリアされます。TTS音声合成がサポートする言語を自由に組み合わせることができます。
251
+ # The language filter group function allows you to specify reserved languages.
252
+ # Languages not in the filter group will be cleared. You can match the languages supported by TTS Text To Speech as you like.
253
+ # 排名越前,优先级越高,The higher the ranking, the higher the priority,ランキングが上位になるほど、優先度が高くなります。
254
+
255
+ # 系统默认过滤器。System default filter。(ISO 639-1 codes given)
256
+ # ----------------------------------------------------------------------------------------------------------------------------------
257
+ # "zh"中文=Chinese ,"en"英语=English ,"ja"日语=Japanese ,"ko"韩语=Korean ,"fr"法语=French ,"vi"越南语=Vietnamese , "ru"俄语=Russian
258
+ # "th"泰语=Thai
259
+ # ----------------------------------------------------------------------------------------------------------------------------------
260
+ DEFAULT_FILTERS = ["zh", "ja", "ko", "en"]
261
+
262
+ # 用户可自定义过滤器。User-defined filters
263
+ Langfilters = DEFAULT_FILTERS[:] # 创建副本
264
+
265
+ # 合并文本
266
+ isLangMerge = True
267
+
268
+ # 试验性支持:您可自定义添加:"fr"法语 , "vi"越南语。Experimental: You can customize to add: "fr" French, "vi" Vietnamese.
269
+ # 请使用API启用:LangSegment.setfilters(["zh", "en", "ja", "ko", "fr", "vi" , "ru" , "th"]) # 您可自定义添加,如:"fr"法语 , "vi"越南语。
270
+
271
+ # 预览版功能,自动启用或禁用,无需设置
272
+ # Preview feature, automatically enabled or disabled, no settings required
273
+ EnablePreview = False
274
+
275
+ # 除此以外,它支持简写过滤器,只需按不同语种任意组合即可。
276
+ # In addition to that, it supports abbreviation filters, allowing for any combination of different languages.
277
+ # 示例:您可以任意指定多种组合,进行过滤
278
+ # Example: You can specify any combination to filter
279
+
280
+ # 中/日语言优先级阀值(评分范围为 0 ~ 1):评分低于设定阀值 <0.89 时,启用 filters 中的优先级。\n
281
+ # 중/일본어 우선 순위 임계값(점수 범위 0-1): 점수가 설정된 임계값 <0.89보다 낮을 때 필터에서 우선 순위를 활성화합니다.
282
+ # 中国語/日本語の優先度しきい値(スコア範囲0〜1):スコアが設定されたしきい値<0.89未満の場合、フィルターの優先度が有効になります。\n
283
+ # Chinese and Japanese language priority threshold (score range is 0 ~ 1): The default threshold is 0.89. \n
284
+ # Only the common characters between Chinese and Japanese are processed with confidence and priority. \n
285
+ LangPriorityThreshold = 0.89
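+ # Illustrative effect (a sketch, not part of the original file): with
+ # setfilters(["ja", "zh", "en", "ko"]), a short kanji-only fragment whose
+ # langid score falls below this threshold is resolved to "ja", because "ja"
+ # precedes "zh" in the filter order (see _parse_language below).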
286
+
287
+ # Langfilters = ["zh"] # 按中文识别
288
+ # Langfilters = ["en"] # 按英文识别
289
+ # Langfilters = ["ja"] # 按日文识别
290
+ # Langfilters = ["ko"] # 按韩文识别
291
+ # Langfilters = ["zh_ja"] # 中日混合识别
292
+ # Langfilters = ["zh_en"] # 中英混合识别
293
+ # Langfilters = ["ja_en"] # 日英混合识别
294
+ # Langfilters = ["zh_ko"] # 中韩混合识别
295
+ # Langfilters = ["ja_ko"] # 日韩混合识别
296
+ # Langfilters = ["en_ko"] # 英韩混合识别
297
+ # Langfilters = ["zh_ja_en"] # 中日英混合识别
298
+ # Langfilters = ["zh_ja_en_ko"] # 中日英韩混合识别
299
+
300
+ # 更多过滤组合,请您随意。。。For more filter combinations, please feel free to......
301
+ # より多くのフィルターの組み合わせ、お気軽に。。。더 많은 필터 조합을 원하시면 자유롭게 해주세요. .....
302
+
303
+ # 可选保留:支持中文数字拼音格式,更方便前端实现拼音音素修改和推理,默认关闭 False 。
304
+ # 开启后 True ,括号内的数字拼音格式均保留,并识别输出为:"zh"中文。
305
+ keepPinyin = False
306
+
307
+ # DEFINITION
308
+ PARSE_TAG = re.compile(r"(⑥\$*\d+[\d]{6,}⑥)")
309
+
310
+ @staticmethod
311
+ def _clears():
312
+ LangSegment._text_cache = None
313
+ LangSegment._text_lasts = None
314
+ LangSegment._text_langs = None
315
+ LangSegment._text_waits = None
316
+ LangSegment._lang_count = None
317
+ LangSegment._lang_eos = None
318
+ pass
319
+
320
+ @staticmethod
321
+ def _is_english_word(word):
322
+ return bool(re.match(r"^[a-zA-Z]+$", word))
323
+
324
+ @staticmethod
325
+ def _is_chinese(word):
326
+ for char in word:
327
+ if "\u4e00" <= char <= "\u9fff":
328
+ return True
329
+ return False
330
+
331
+ @staticmethod
332
+ def _is_japanese_kana(word):
333
+ pattern = re.compile(r"[\u3040-\u309F\u30A0-\u30FF]+")
334
+ matches = pattern.findall(word)
335
+ return len(matches) > 0
336
+
337
+ @staticmethod
338
+ def _insert_english_uppercase(word):
339
+ modified_text = re.sub(r"(?<!\b)([A-Z])", r" \1", word)
340
+ modified_text = modified_text.strip("-")
341
+ return modified_text + " "
342
+
343
+ @staticmethod
344
+ def _split_camel_case(word):
345
+ return re.sub(r"(?<!^)(?=[A-Z])", " ", word)
346
+
347
+ @staticmethod
348
+ def _statistics(language, text):
349
+ # Language word statistics:
350
+ # Chinese characters usually occupy double bytes
351
+ if LangSegment._lang_count is None or not isinstance(
352
+ LangSegment._lang_count, defaultdict
353
+ ):
354
+ LangSegment._lang_count = defaultdict(int)
355
+ lang_count = LangSegment._lang_count
356
+ if "|" not in language:
357
+ lang_count[language] += (
358
+ int(len(text) * 2) if language == "zh" else len(text)
359
+ )
360
+ LangSegment._lang_count = lang_count
361
+ pass
362
+
363
+ @staticmethod
364
+ def _clear_text_number(text):
365
+ if text == "\n":
366
+ return text, False # Keep Line Breaks
367
+ clear_text = re.sub(r"([^\w\s]+)", "", re.sub(r"\n+", "", text)).strip()
368
+ is_number = len(re.sub(re.compile(r"(\d+)"), "", clear_text)) == 0
369
+ return clear_text, is_number
370
+
371
+ @staticmethod
372
+ def _saveData(words, language: str, text: str, score: float, symbol=None):
373
+ # Pre-detection
374
+ clear_text, is_number = LangSegment._clear_text_number(text)
375
+ # Merge the same language and save the results
376
+ preData = words[-1] if len(words) > 0 else None
377
+ if symbol is not None:
378
+ pass
379
+ elif preData is not None and preData["symbol"] is None:
380
+ if len(clear_text) == 0:
381
+ language = preData["lang"]
382
+ elif is_number == True:
383
+ language = preData["lang"]
384
+ _, pre_is_number = LangSegment._clear_text_number(preData["text"])
385
+ if preData["lang"] == language:
386
+ LangSegment._statistics(preData["lang"], text)
387
+ text = preData["text"] + text
388
+ preData["text"] = text
389
+ return preData
390
+ elif pre_is_number == True:
391
+ text = f"{preData['text']}{text}"
392
+ words.pop()
393
+ elif is_number == True:
394
+ priority_language = LangSegment._get_filters_string()[:2]
395
+ if priority_language in "ja-zh-en-ko-fr-vi":
396
+ language = priority_language
397
+ data = {"lang": language, "text": text, "score": score, "symbol": symbol}
398
+ filters = LangSegment.Langfilters
399
+ if (
400
+ filters is None
401
+ or len(filters) == 0
402
+ or "?" in language
403
+ or language in filters
404
+ or language in filters[0]
405
+ or filters[0] == "*"
406
+ or filters[0] in "alls-mixs-autos"
407
+ ):
408
+ words.append(data)
409
+ LangSegment._statistics(data["lang"], data["text"])
410
+ return data
411
+
412
+ @staticmethod
413
+ def _addwords(words, language, text, score, symbol=None):
414
+ if text == "\n":
415
+ pass # Keep Line Breaks
416
+ elif text is None or len(text.strip()) == 0:
417
+ return True
418
+ if language is None:
419
+ language = ""
420
+ language = language.lower()
421
+ if language == "en":
422
+ text = LangSegment._insert_english_uppercase(text)
423
+ # text = re.sub(r'[(())]', ',' , text) # Keep it.
424
+ text_waits = LangSegment._text_waits
425
+ ispre_waits = len(text_waits) > 0
426
+ preResult = text_waits.pop() if ispre_waits else None
427
+ if preResult is None:
428
+ preResult = words[-1] if len(words) > 0 else None
429
+ if preResult and ("|" in preResult["lang"]):
430
+ pre_lang = preResult["lang"]
431
+ if language in pre_lang:
432
+ preResult["lang"] = language = language.split("|")[0]
433
+ else:
434
+ preResult["lang"] = pre_lang.split("|")[0]
435
+ if ispre_waits:
436
+ preResult = LangSegment._saveData(
437
+ words,
438
+ preResult["lang"],
439
+ preResult["text"],
440
+ preResult["score"],
441
+ preResult["symbol"],
442
+ )
443
+ pre_lang = preResult["lang"] if preResult else None
444
+ if ("|" in language) and (
445
+ pre_lang and pre_lang not in language and "…" not in language
446
+ ):
447
+ language = language.split("|")[0]
448
+ if "|" in language:
449
+ LangSegment._text_waits.append(
450
+ {"lang": language, "text": text, "score": score, "symbol": symbol}
451
+ )
452
+ else:
453
+ LangSegment._saveData(words, language, text, score, symbol)
454
+ return False
455
+
456
+ @staticmethod
457
+ def _get_prev_data(words):
458
+ data = words[-1] if words and len(words) > 0 else None
459
+ if data:
460
+ return (data["lang"], data["text"])
461
+ return (None, "")
462
+
463
+ @staticmethod
464
+ def _match_ending(input, index):
465
+ if input is None or len(input) == 0:
466
+ return False, None
467
+ input = re.sub(r"\s+", "", input)
468
+ if len(input) == 0 or abs(index) > len(input):
469
+ return False, None
470
+ ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
471
+ return ending_pattern.match(input[index]), input[index]
472
+
473
+ @staticmethod
474
+ def _cleans_text(cleans_text):
475
+ cleans_text = re.sub(r"(.*?)([^\w]+)", r"\1 ", cleans_text)
476
+ cleans_text = re.sub(r"(.)\1+", r"\1", cleans_text)
477
+ return cleans_text.strip()
478
+
479
+ @staticmethod
480
+ def _mean_processing(text: str):
481
+ if text is None or (text.strip()) == "":
482
+ return None, 0.0
483
+ arrs = LangSegment._split_camel_case(text).split(" ")
484
+ langs = []
485
+ for t in arrs:
486
+ if len(t.strip()) <= 3:
487
+ continue
488
+ language, score = langid.classify(t)
489
+ langs.append({"lang": language})
490
+ if len(langs) == 0:
491
+ return None, 0.0
492
+ return Counter([item["lang"] for item in langs]).most_common(1)[0][0], 1.0
493
+
494
+ @staticmethod
495
+ def _lang_classify(cleans_text):
496
+ language, score = langid.classify(cleans_text)
497
+ # fix: Huggingface is np.float32
498
+ if (
499
+ score is not None
500
+ and isinstance(score, np.generic)
501
+ and hasattr(score, "item")
502
+ ):
503
+ score = score.item()
504
+ score = round(score, 3)
505
+ return language, score
506
+
507
+ @staticmethod
508
+ def _get_filters_string():
509
+ filters = LangSegment.Langfilters
510
+ return "-".join(filters).lower().strip() if filters is not None else ""
511
+
512
+ @staticmethod
513
+ def _parse_language(words, segment):
514
+ LANG_JA = "ja"
515
+ LANG_ZH = "zh"
516
+ LANG_ZH_JA = f"{LANG_ZH}|{LANG_JA}"
517
+ LANG_JA_ZH = f"{LANG_JA}|{LANG_ZH}"
518
+ language = LANG_ZH
519
+ regex_pattern = re.compile(r"([^\w\s]+)")
520
+ lines = regex_pattern.split(segment)
521
+ lines_max = len(lines)
522
+ LANG_EOS = LangSegment._lang_eos
523
+ for index, text in enumerate(lines):
524
+ if len(text) == 0:
525
+ continue
526
+ EOS = index >= (lines_max - 1)
527
+ nextId = index + 1
528
+ nextText = lines[nextId] if not EOS else ""
529
+ nextPunc = (
530
+ len(re.sub(regex_pattern, "", re.sub(r"\n+", "", nextText)).strip())
531
+ == 0
532
+ )
533
+ textPunc = (
534
+ len(re.sub(regex_pattern, "", re.sub(r"\n+", "", text)).strip()) == 0
535
+ )
536
+ if not EOS and (
537
+ textPunc == True or (len(nextText.strip()) >= 0 and nextPunc == True)
538
+ ):
539
+ lines[nextId] = f"{text}{nextText}"
540
+ continue
541
+ number_tags = re.compile(r"(⑥\d{6,}⑥)")
542
+ cleans_text = re.sub(number_tags, "", text)
543
+ cleans_text = re.sub(r"\d+", "", cleans_text)
544
+ cleans_text = LangSegment._cleans_text(cleans_text)
545
+ # fix:Langid's recognition of short sentences is inaccurate, and it is spliced longer.
546
+ if not EOS and len(cleans_text) <= 2:
547
+ lines[nextId] = f"{text}{nextText}"
548
+ continue
549
+ language, score = LangSegment._lang_classify(cleans_text)
550
+ prev_language, prev_text = LangSegment._get_prev_data(words)
551
+ if language != LANG_ZH and all(
552
+ "\u4e00" <= c <= "\u9fff" for c in re.sub(r"\s", "", cleans_text)
553
+ ):
554
+ language, score = LANG_ZH, 1
555
+ if len(cleans_text) <= 5 and LangSegment._is_chinese(cleans_text):
556
+ filters_string = LangSegment._get_filters_string()
557
+ if (
558
+ score < LangSegment.LangPriorityThreshold
559
+ and len(filters_string) > 0
560
+ ):
561
+ index_ja, index_zh = (
562
+ filters_string.find(LANG_JA),
563
+ filters_string.find(LANG_ZH),
564
+ )
565
+ if index_ja != -1 and index_ja < index_zh:
566
+ language = LANG_JA
567
+ elif index_zh != -1 and index_zh < index_ja:
568
+ language = LANG_ZH
569
+ if LangSegment._is_japanese_kana(cleans_text):
570
+ language = LANG_JA
571
+ elif len(cleans_text) > 2 and score > 0.90:
572
+ pass
573
+ elif EOS and LANG_EOS:
574
+ language = LANG_ZH if len(cleans_text) <= 1 else language
575
+ else:
576
+ LANG_UNKNOWN = (
577
+ LANG_ZH_JA
578
+ if language == LANG_ZH
579
+ or (len(cleans_text) <= 2 and prev_language == LANG_ZH)
580
+ else LANG_JA_ZH
581
+ )
582
+ match_end, match_char = LangSegment._match_ending(text, -1)
583
+ referen = (
584
+ prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language
585
+ if prev_language
586
+ else False
587
+ )
588
+ if match_char in "。.":
589
+ language = (
590
+ prev_language if referen and len(words) > 0 else language
591
+ )
592
+ else:
593
+ language = f"{LANG_UNKNOWN}|…"
594
+ text, *_ = re.subn(number_tags, LangSegment._restore_number, text)
595
+ LangSegment._addwords(words, language, text, score)
596
+ pass
597
+ pass
598
+
599
+ # ----------------------------------------------------------
600
+ # 【SSML】中文数字处理:Chinese Number Processing (SSML support)
601
+ # 这里默认都是中文,用于处理 SSML 中文标签。当然可以支持任意语言,例如:
602
+ # The default here is Chinese, which is used to process SSML Chinese tags. Of course, any language can be supported, for example:
603
+ # 中文电话号码:<telephone>1234567</telephone>
604
+ # 中文数字号码:<number>1234567</number>
605
+ @staticmethod
606
+ def _process_symbol_SSML(words, data):
607
+ tag, match = data
608
+ language = SSML = match[1]
609
+ text = match[2]
610
+ score = 1.0
611
+ if SSML == "telephone":
612
+ # 中文-电话号码
613
+ language = "zh"
614
+ text = LangSSML.to_chinese_telephone(text)
615
+ pass
616
+ elif SSML == "number":
617
+ # 中文-数字读法
618
+ language = "zh"
619
+ text = LangSSML.to_chinese_number(text)
620
+ pass
621
+ elif SSML == "currency":
622
+ # 中文-按金额发音
623
+ language = "zh"
624
+ text = LangSSML.to_chinese_currency(text)
625
+ pass
626
+ elif SSML == "date":
627
+ # 中文-按日期发音
628
+ language = "zh"
629
+ text = LangSSML.to_chinese_date(text)
630
+ pass
631
+ LangSegment._addwords(words, language, text, score, SSML)
632
+ pass
633
+
634
+ # ----------------------------------------------------------
635
+
636
+ @staticmethod
637
+ def _restore_number(matche):
638
+ value = matche.group(0)
639
+ text_cache = LangSegment._text_cache
640
+ if value in text_cache:
641
+ process, data = text_cache[value]
642
+ tag, match = data
643
+ value = match
644
+ return value
645
+
646
+ @staticmethod
647
+ def _pattern_symbols(item, text):
648
+ if text is None:
649
+ return text
650
+ tag, pattern, process = item
651
+ matches = pattern.findall(text)
652
+ if len(matches) == 1 and "".join(matches[0]) == text:
653
+ return text
654
+ for i, match in enumerate(matches):
655
+ key = f"⑥{tag}{i:06d}⑥"
656
+ text = re.sub(pattern, key, text, count=1)
657
+ LangSegment._text_cache[key] = (process, (tag, match))
658
+ return text
659
+
660
+ @staticmethod
661
+ def _process_symbol(words, data):
662
+ tag, match = data
663
+ language = match[1]
664
+ text = match[2]
665
+ score = 1.0
666
+ filters = LangSegment._get_filters_string()
667
+ if language not in filters:
668
+ LangSegment._process_symbol_SSML(words, data)
669
+ else:
670
+ LangSegment._addwords(words, language, text, score, True)
671
+ pass
672
+
673
+ @staticmethod
674
+ def _process_english(words, data):
675
+ tag, match = data
676
+ text = match[0]
677
+ filters = LangSegment._get_filters_string()
678
+ priority_language = filters[:2]
679
+ # Preview feature, other language segmentation processing
680
+ enablePreview = LangSegment.EnablePreview
681
+ if enablePreview == True:
682
+ # Experimental: Other language support
683
+ regex_pattern = re.compile(r"(.*?[。.??!!]+[\n]{,1})")
684
+ lines = regex_pattern.split(text)
685
+ for index, text in enumerate(lines):
686
+ if len(text.strip()) == 0:
687
+ continue
688
+ cleans_text = LangSegment._cleans_text(text)
689
+ language, score = LangSegment._lang_classify(cleans_text)
690
+ if language not in filters:
691
+ language, score = LangSegment._mean_processing(cleans_text)
692
+ if language is None or score <= 0.0:
693
+ continue
694
+ elif language in filters:
695
+ pass # pass
696
+ elif score >= 0.95:
697
+ continue # High score, but not in the filter, excluded.
698
+ elif score <= 0.15 and filters[:2] == "fr":
699
+ language = priority_language
700
+ else:
701
+ language = "en"
702
+ LangSegment._addwords(words, language, text, score)
703
+ else:
704
+ # Default is English
705
+ language, score = "en", 1.0
706
+ LangSegment._addwords(words, language, text, score)
707
+ pass
708
+
709
+ @staticmethod
710
+ def _process_Russian(words, data):
711
+ tag, match = data
712
+ text = match[0]
713
+ language = "ru"
714
+ score = 1.0
715
+ LangSegment._addwords(words, language, text, score)
716
+ pass
717
+
718
+ @staticmethod
719
+ def _process_Thai(words, data):
720
+ tag, match = data
721
+ text = match[0]
722
+ language = "th"
723
+ score = 1.0
724
+ LangSegment._addwords(words, language, text, score)
725
+ pass
726
+
727
+ @staticmethod
728
+ def _process_korean(words, data):
729
+ tag, match = data
730
+ text = match[0]
731
+ language = "ko"
732
+ score = 1.0
733
+ LangSegment._addwords(words, language, text, score)
734
+ pass
735
+
736
+ @staticmethod
737
+ def _process_quotes(words, data):
738
+ tag, match = data
739
+ text = "".join(match)
740
+ childs = LangSegment.PARSE_TAG.findall(text)
741
+ if len(childs) > 0:
742
+ LangSegment._process_tags(words, text, False)
743
+ else:
744
+ cleans_text = LangSegment._cleans_text(match[1])
745
+ if len(cleans_text) <= 5:
746
+ LangSegment._parse_language(words, text)
747
+ else:
748
+ language, score = LangSegment._lang_classify(cleans_text)
749
+ LangSegment._addwords(words, language, text, score)
750
+ pass
751
+
752
+ @staticmethod
753
+ def _process_pinyin(words, data):
754
+ tag, match = data
755
+ text = match
756
+ language = "zh"
757
+ score = 1.0
758
+ LangSegment._addwords(words, language, text, score)
759
+ pass
760
+
761
+ @staticmethod
762
+ def _process_number(words, data): # "$0" process only
763
+ """
764
+ Numbers alone cannot accurately identify language.
765
+ Because numbers are universal in all languages.
766
+ So it won't be executed here, just for testing.
767
+ """
768
+ tag, match = data
769
+ language = words[0]["lang"] if len(words) > 0 else "zh"
770
+ text = match
771
+ score = 0.0
772
+ LangSegment._addwords(words, language, text, score)
773
+ pass
774
+
775
+ @staticmethod
776
+ def _process_tags(words, text, root_tag):
777
+ text_cache = LangSegment._text_cache
778
+ segments = re.split(LangSegment.PARSE_TAG, text)
779
+ segments_len = len(segments) - 1
780
+ for index, text in enumerate(segments):
781
+ if root_tag:
782
+ LangSegment._lang_eos = index >= segments_len
783
+ if LangSegment.PARSE_TAG.match(text):
784
+ process, data = text_cache[text]
785
+ if process:
786
+ process(words, data)
787
+ else:
788
+ LangSegment._parse_language(words, text)
789
+ pass
790
+ return words
791
+
792
+ @staticmethod
793
+ def _merge_results(words):
794
+ new_word = []
795
+ for index, cur_data in enumerate(words):
796
+ if "symbol" in cur_data:
797
+ del cur_data["symbol"]
798
+ if index == 0:
799
+ new_word.append(cur_data)
800
+ else:
801
+ pre_data = new_word[-1]
802
+ if cur_data["lang"] == pre_data["lang"]:
803
+ pre_data["text"] = f"{pre_data['text']}{cur_data['text']}"
804
+ else:
805
+ new_word.append(cur_data)
806
+ return new_word
807
+
808
+ @staticmethod
809
+ def _parse_symbols(text):
810
+ TAG_NUM = "00" # "00" => default channels , "$0" => testing channel
811
+ TAG_S1, TAG_S2, TAG_P1, TAG_P2, TAG_EN, TAG_KO, TAG_RU, TAG_TH = (
812
+ "$1",
813
+ "$2",
814
+ "$3",
815
+ "$4",
816
+ "$5",
817
+ "$6",
818
+ "$7",
819
+ "$8",
820
+ )
821
+ TAG_BASE = re.compile(r'(([【《((“‘"\']*[LANGUAGE]+[\W\s]*)+)')
822
+ # Get custom language filter
823
+ filters = LangSegment.Langfilters
824
+ filters = filters if filters is not None else ""
825
+ # =======================================================================================================
826
+ # Experimental: Other language support.Thử nghiệm: Hỗ trợ ngôn ngữ khác.Expérimental : prise en charge d’autres langues.
827
+ # 相关语言字符如有缺失,熟悉相关语言的朋友,可以提交把缺失的发音符号补全。
828
+ # If relevant language characters are missing, friends who are familiar with the relevant languages can submit a submission to complete the missing pronunciation symbols.
829
+ # S'il manque des caractères linguistiques pertinents, les amis qui connaissent les langues concernées peuvent soumettre une soumission pour compléter les symboles de prononciation manquants.
830
+ # Nếu thiếu ký tự ngôn ngữ liên quan, những người bạn quen thuộc với ngôn ngữ liên quan có thể gửi bài để hoàn thành các ký hiệu phát âm còn thiếu.
831
+ # -------------------------------------------------------------------------------------------------------
832
+ # Preview feature, other language support
833
+ enablePreview = LangSegment.EnablePreview
834
+ if "fr" in filters or "vi" in filters:
835
+ enablePreview = True
836
+ LangSegment.EnablePreview = enablePreview
837
+ # 实验性:法语字符支持。Prise en charge des caractères français
838
+ RE_FR = "" if not enablePreview else "àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ"
839
+ # 实验性:越南语字符支持。Hỗ trợ ký tự tiếng Việt
840
+ RE_VI = (
841
+ ""
842
+ if not enablePreview
843
+ else "đơưăáàảãạắằẳẵặấầẩẫậéèẻẽẹếềểễệíìỉĩịóòỏõọốồổỗộớờởỡợúùủũụứừửữựôâêơưỷỹ"
844
+ )
845
+ # -------------------------------------------------------------------------------------------------------
846
+ # Basic options:
847
+ process_list = [
848
+ (
849
+ TAG_S1,
850
+ re.compile(LangSegment.SYMBOLS_PATTERN),
851
+ LangSegment._process_symbol,
852
+ ), # Symbol Tag
853
+ (
854
+ TAG_KO,
855
+ re.compile(re.sub(r"LANGUAGE", "\uac00-\ud7a3", TAG_BASE.pattern)),
856
+ LangSegment._process_korean,
857
+ ), # Korean words
858
+ (
859
+ TAG_TH,
860
+ re.compile(re.sub(r"LANGUAGE", "\u0e00-\u0e7f", TAG_BASE.pattern)),
861
+ LangSegment._process_Thai,
862
+ ), # Thai words support.
863
+ (
864
+ TAG_RU,
865
+ re.compile(re.sub(r"LANGUAGE", "А-Яа-яЁё", TAG_BASE.pattern)),
866
+ LangSegment._process_Russian,
867
+ ), # Russian words support.
868
+ (
869
+ TAG_NUM,
870
+ re.compile(r"(\W*\d+\W+\d*\W*\d*)"),
871
+ LangSegment._process_number,
872
+ ), # Number words, Universal in all languages, Ignore it.
873
+ (
874
+ TAG_EN,
875
+ re.compile(
876
+ re.sub(r"LANGUAGE", f"a-zA-Z{RE_FR}{RE_VI}", TAG_BASE.pattern)
877
+ ),
878
+ LangSegment._process_english,
879
+ ), # English words + Other language support.
880
+ (
881
+ TAG_P1,
882
+ re.compile(r'(["\'])(.*?)(\1)'),
883
+ LangSegment._process_quotes,
884
+ ), # Regular quotes
885
+ (
886
+ TAG_P2,
887
+ re.compile(
888
+ r"([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})"
889
+ ),
890
+ LangSegment._process_quotes,
891
+ ), # Special quotes, There are left and right.
892
+ ]
893
+ # Extended options: Default False
894
+ if LangSegment.keepPinyin == True:
895
+ process_list.insert(
896
+ 1,
897
+ (
898
+ TAG_S2,
899
+ re.compile(r"([\(({](?:\s*\w*\d\w*\s*)+[})\)])"),
900
+ LangSegment._process_pinyin,
901
+ ), # Chinese Pinyin Tag.
902
+ )
903
+ # -------------------------------------------------------------------------------------------------------
904
+ words = []
905
+ lines = re.findall(r".*\n*", re.sub(LangSegment.PARSE_TAG, "", text))
906
+ for index, text in enumerate(lines):
907
+ if len(text.strip()) == 0:
908
+ continue
909
+ LangSegment._lang_eos = False
910
+ LangSegment._text_cache = {}
911
+ for item in process_list:
912
+ text = LangSegment._pattern_symbols(item, text)
913
+ cur_word = LangSegment._process_tags([], text, True)
914
+ if len(cur_word) == 0:
915
+ continue
916
+ cur_data = cur_word[0] if len(cur_word) > 0 else None
917
+ pre_data = words[-1] if len(words) > 0 else None
918
+ if (
919
+ cur_data
920
+ and pre_data
921
+ and cur_data["lang"] == pre_data["lang"]
922
+ and cur_data["symbol"] == False
923
+ and pre_data["symbol"]
924
+ ):
925
+ cur_data["text"] = f"{pre_data['text']}{cur_data['text']}"
926
+ words.pop()
927
+ words += cur_word
928
+ if LangSegment.isLangMerge == True:
929
+ words = LangSegment._merge_results(words)
930
+ lang_count = LangSegment._lang_count
931
+ if lang_count and len(lang_count) > 0:
932
+ lang_count = dict(
933
+ sorted(lang_count.items(), key=lambda x: x[1], reverse=True)
934
+ )
935
+ lang_count = list(lang_count.items())
936
+ LangSegment._lang_count = lang_count
937
+ return words
938
+
939
+ @staticmethod
940
+ def setfilters(filters):
941
+ # 当过滤器更改时,清除缓存
942
+ # 필터가 변경되면 캐시를 지웁니다.
943
+ # フィルタが変更されると、キャッシュがクリアされます
944
+ # When the filter changes, clear the cache
945
+ if LangSegment.Langfilters != filters:
946
+ LangSegment._clears()
947
+ LangSegment.Langfilters = filters
948
+ pass
949
+
950
+ @staticmethod
951
+ def getfilters():
952
+ return LangSegment.Langfilters
953
+
954
+ @staticmethod
955
+ def setPriorityThreshold(threshold: float):
956
+ LangSegment.LangPriorityThreshold = threshold
957
+ pass
958
+
959
+ @staticmethod
960
+ def getPriorityThreshold():
961
+ return LangSegment.LangPriorityThreshold
962
+
963
+ @staticmethod
964
+ def getCounts():
965
+ lang_count = LangSegment._lang_count
966
+ if lang_count is not None:
967
+ return lang_count
968
+ text_langs = LangSegment._text_langs
969
+ if text_langs is None or len(text_langs) == 0:
970
+ return [("zh", 0)]
971
+ lang_counts = defaultdict(int)
972
+ for d in text_langs:
973
+ lang_counts[d["lang"]] += (
974
+ int(len(d["text"]) * 2) if d["lang"] == "zh" else len(d["text"])
975
+ )
976
+ lang_counts = dict(
977
+ sorted(lang_counts.items(), key=lambda x: x[1], reverse=True)
978
+ )
979
+ lang_counts = list(lang_counts.items())
980
+ LangSegment._lang_count = lang_counts
981
+ return lang_counts
982
+
983
+ @staticmethod
984
+ def getTexts(text: str):
985
+ if text is None or len(text.strip()) == 0:
986
+ LangSegment._clears()
987
+ return []
988
+ # lasts
989
+ text_langs = LangSegment._text_langs
990
+ if LangSegment._text_lasts == text and text_langs is not None:
991
+ return text_langs
992
+ # parse
993
+ LangSegment._text_waits = []
994
+ LangSegment._lang_count = None
995
+ LangSegment._text_lasts = text
996
+ text = LangSegment._parse_symbols(text)
997
+ LangSegment._text_langs = text
998
+ return text
999
+
1000
+ @staticmethod
1001
+ def classify(text: str):
1002
+ return LangSegment.getTexts(text)
1003
+
1004
+
1005
+ def setLangMerge(value: bool):
1006
+ """是否优化合并结果"""
1007
+ LangSegment.isLangMerge = value
1008
+ pass
1009
+
1010
+
1011
+ def getLangMerge():
1012
+ """是否优化合并结果"""
1013
+ return LangSegment.isLangMerge
1014
+
1015
+
1016
+ def setfilters(filters):
1017
+ """
1018
+ 功能:语言过滤组功能, 可以指定保留语言。不在过滤组中的语言将被清除。您可随心搭配TTS语音合成所支持的语言。
1019
+ 기능: 언어 필터 그룹 기능, 예약된 언어를 지정할 수 있습니다. 필터 그룹에 없는 언어는 지워집니다. TTS 텍스트에서 지원하는 언어를 원하는 대로 일치시킬 수 있습니다.
1020
+ 機能:言語フィルターグループ機能で、予約言語を指定できます。フィルターグループに含まれていない言語はクリアされます。TTS音声合成がサポートする言語を自由に組み合わせることができます。
1021
+ Function: Language filter group function, you can specify reserved languages. \n
1022
+ Languages not in the filter group will be cleared. You can match the languages supported by TTS Text To Speech as you like.\n
1023
+ Args:
1024
+ filters (list): ["zh", "en", "ja", "ko"] 排名越前,优先级越高
1025
+ """
1026
+ LangSegment.setfilters(filters)
1027
+ pass
1028
+
1029
+
1030
+ def getfilters():
1031
+ """
1032
+ 功能:语言过滤组功能, 可以指定保留语言。不在过滤组中的语言将被清除。您可随心搭配TTS语音合成所支持的语言。
1033
+ 기능: 언어 필터 그룹 기능, 예약된 언어를 지정할 수 있습니다. 필터 그룹에 없는 언어는 지워집니다. TTS 텍스트에서 지원하는 언어를 원하는 대로 일치시킬 수 있습니다.
1034
+ 機能:言語フィルターグループ機能で、予約言語を指定できます。フィルターグループに含まれていない言語はクリアされます。TTS音声合成がサポートする言語を自由に組み合わせることができます。
1035
+ Function: Language filter group function, you can specify reserved languages. \n
1036
+ Languages not in the filter group will be cleared. You can match the languages supported by TTS Text To Speech as you like.\n
1037
+ Args:
1038
+ filters (list): ["zh", "en", "ja", "ko"] 排名越前,优先级越高
1039
+ """
1040
+ return LangSegment.getfilters()
1041
+
1042
+
1043
+ # # @Deprecated:Use shorter setfilters
1044
+ # def setLangfilters(filters):
1045
+ # """
1046
+ # >0.1.9废除:使用更简短的setfilters
1047
+ # """
1048
+ # setfilters(filters)
1049
+ # # @Deprecated:Use shorter getfilters
1050
+ # def getLangfilters():
1051
+ # """
1052
+ # >0.1.9废除:使用更简短的getfilters
1053
+ # """
1054
+ # return getfilters()
1055
+
1056
+
1057
+ def setKeepPinyin(value: bool):
1058
+ """
1059
+ 可选保留:支持中文数字拼音格式,更方便前端实现拼音音素修改和推理,默认关闭 False 。\n
1060
+ 开启后 True ,括号内的数字拼音格式均保留,并识别输出为:"zh"中文。
1061
+ """
1062
+ LangSegment.keepPinyin = value
1063
+ pass
1064
+
1065
+
1066
+ def getKeepPinyin():
1067
+ """
1068
+ 可选保留:支持中文数字拼音格式,更方便前端实现拼音音素修改和推理,默认关闭 False 。\n
1069
+ 开启后 True ,括号内的数字拼音格式均保留,并识别输出为:"zh"中文。
1070
+ """
1071
+ return LangSegment.keepPinyin
1072
+
1073
+
1074
+ def setEnablePreview(value: bool):
1075
+ """
1076
+ 启用预览版功能(默认关闭)
1077
+ Enable preview functionality (off by default)
1078
+ Args:
1079
+ value (bool): True=开启, False=关闭
1080
+ """
1081
+ LangSegment.EnablePreview = value == True
1082
+ pass
1083
+
1084
+
1085
+ def getEnablePreview():
1086
+ """
1087
+ 启用预览版功能(默认关闭)
1088
+ Enable preview functionality (off by default)
1089
+ Args:
1090
+ value (bool): True=开启, False=关闭
1091
+ """
1092
+ return LangSegment.EnablePreview == True
1093
+
1094
+
1095
+ def setPriorityThreshold(threshold: float):
1096
+ """
1097
+ 中/日语言优先级阀值(评分范围为 0 ~ 1):评分低于设定阀值 <0.89 时,启用 filters 中的优先级。\n
1098
+ 中国語/日本語の優先度しきい値(スコア範囲0〜1):スコアが設定されたしきい値<0.89未満の場合、フィルターの優先度が有効になります。\n
1099
+ 중/일본어 우선 순위 임계값(점수 범위 0-1): 점수가 설정된 임계값 <0.89보다 낮을 때 필터에서 우선 순위를 활성화합니다.
1100
+ Chinese and Japanese language priority threshold (score range is 0 ~ 1): The default threshold is 0.89. \n
1101
+ Only the common characters between Chinese and Japanese are processed with confidence and priority. \n
1102
+ Args:
1103
+ threshold:float (score range is 0 ~ 1)
1104
+ """
1105
+ LangSegment.setPriorityThreshold(threshold)
1106
+ pass
1107
+
1108
+
1109
+ def getPriorityThreshold():
1110
+ """
1111
+ 中/日语言优先级阀值(评分范围为 0 ~ 1):评分低于设定阀值 <0.89 时,启用 filters 中的优先级。\n
1112
+ 中国語/日本語の優先度しきい値(スコア範囲0〜1):スコアが設定されたしきい値<0.89未満の場合、フィルターの優先度が有効になります。\n
1113
+ 중/일본어 우선 순위 임계값(점수 범위 0-1): 점수가 설정된 임계값 <0.89보다 낮을 때 필터에서 우선 순위를 활성화합니다.
1114
+ Chinese and Japanese language priority threshold (score range is 0 ~ 1): The default threshold is 0.89. \n
1115
+ Only the common characters between Chinese and Japanese are processed with confidence and priority. \n
1116
+ Args:
1117
+ threshold:float (score range is 0 ~ 1)
1118
+ """
1119
+ return LangSegment.getPriorityThreshold()
1120
+
1121
+
1122
+ def getTexts(text: str):
1123
+ """
1124
+ 功能:对输入的文本进行多语种分词\n
1125
+ 기능: 입력 텍스트의 다국어 분할 \n
1126
+ 機能:入力されたテキストの多言語セグメンテーション\n
1127
+ Feature: Tokenizing multilingual text input.\n
1128
+ 参数-Args:
1129
+ text (str): Text content,文本内容\n
1130
+ 返回-Returns:
1131
+ list: 示例结果:[{'lang':'zh','text':'?'},...]\n
1132
+ lang=语种 , text=内容\n
1133
+ """
1134
+ return LangSegment.getTexts(text)
1135
+
1136
+
1137
+ def getCounts():
1138
+ """
1139
+ 功能:分词结果统计,按语种字数降序,用于确定其主要语言\n
1140
+ 기능: 주요 언어를 결정하는 데 사용되는 언어별 단어 수 내림차순으로 단어 분할 결과의 통계 \n
1141
+ 機能:主な言語を決定するために使用される、言語の単語数の降順による単語分割結果の統計\n
1142
+ Function: Tokenizing multilingual text input.\n
1143
+ 返回-Returns:
1144
+ list: 示例结果:[('zh', 5), ('ja', 2), ('en', 1)] = [(语种,字数含标点)]\n
1145
+ """
1146
+ return LangSegment.getCounts()
1147
+
1148
+
1149
+ def classify(text: str):
1150
+ """
1151
+ 功能:兼容接口实现
1152
+ Function: Compatible interface implementation
1153
+ """
1154
+ return LangSegment.classify(text)
1155
+
1156
+
1157
+ def printList(langlist):
1158
+ """
1159
+ 功能:打印数组结果
1160
+ 기능: 어레이 결과 인쇄
1161
+ 機能:配列結果を印刷
1162
+ Function: Print array results
1163
+ """
1164
+ print("\n===================【打印结果】===================")
1165
+ if langlist is None or len(langlist) == 0:
1166
+ print("无内容结果,No content result")
1167
+ return
1168
+ for line in langlist:
1169
+ print(line)
1170
+ pass
1171
+
1172
+
1173
+ def main():
1174
+ # -----------------------------------
1175
+ # 更新日志:新版本分词更加精准。
1176
+ # Changelog: The new version of the word segmentation is more accurate.
1177
+ # チェンジログ:新しいバージョンの単語セグメンテーションはより正確です。
1178
+ # Changelog: 분할이라는 단어의 새로운 버전이 더 정확합니다.
1179
+ # -----------------------------------
1180
+
1181
+ # 输入示例1:(包含日文,中文)Input Example 1: (including Japanese, Chinese)
1182
+ # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!"
1183
+
1184
+ # 输入示例2:(包含日文,中文)Input Example 1: (including Japanese, Chinese)
1185
+ # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩. 太好了!"
1186
+
1187
+ # 输入示例3:(包含日文,中文)Input Example 1: (including Japanese, Chinese)
1188
+ # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!"
1189
+
1190
+ # 输入示例4:(包含日文,中文,韩语,英文)Input Example 4: (including Japanese, Chinese, Korean, English)
1191
+ # text = "你的名字叫<ja>佐々木?<ja>吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕"
1192
+
1193
+ # 试验性支持:"fr"法语 , "vi"越南语 , "ru"俄语 , "th"泰语。Experimental: Other language support.
1194
+ LangSegment.setfilters(["fr", "vi", "ja", "zh", "ko", "en", "ru", "th"])
1195
+ text = """
1196
+ 我喜欢在雨天里听音乐。
1197
+ I enjoy listening to music on rainy days.
1198
+ 雨の日に音楽を聴くのが好きです。
1199
+ 비 오는 날에 음악을 듣는 것을 즐깁니다。
1200
+ J'aime écouter de la musique les jours de pluie.
1201
+ Tôi thích nghe nhạc vào những ngày mưa.
1202
+ Мне нравится слушать музыку в дождливую погоду.
1203
+ ฉันชอบฟังเพ���งในวันที่ฝนตก
1204
+ """
1205
+
1206
+ # 进行分词:(接入TTS项目仅需一行代码调用)Segmentation: (Only one line of code is required to access the TTS project)
1207
+ langlist = LangSegment.getTexts(text)
1208
+ printList(langlist)
1209
+
1210
+ # 语种统计:Language statistics:
1211
+ print("\n===================【语种统计】===================")
1212
+ # 获取所有语种数组结果,根据内容字数降序排列
1213
+ # Get the array results in all languages, sorted in descending order according to the number of content words
1214
+ langCounts = LangSegment.getCounts()
1215
+ print(langCounts, "\n")
1216
+
1217
+ # 根据结果获取内容的主要语种 (语言,字数含标点)
1218
+ # Get the main language of content based on the results (language, word count including punctuation)
1219
+ lang, count = langCounts[0]
1220
+ print(f"输入内容的主要语言为 = {lang} ,字数 = {count}")
1221
+ print("==================================================\n")
1222
+
1223
+ # 分词输出:lang=语言,text=内容。Word output: lang = language, text = content
1224
+ # ===================【打印结果】===================
1225
+ # {'lang': 'zh', 'text': '你的名字叫'}
1226
+ # {'lang': 'ja', 'text': '佐々木?'}
1227
+ # {'lang': 'zh', 'text': '吗?韩语中的'}
1228
+ # {'lang': 'ko', 'text': '안녕 오빠'}
1229
+ # {'lang': 'zh', 'text': '读什么呢?'}
1230
+ # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'}
1231
+ # {'lang': 'zh', 'text': ' 此次发布会带来了四款'}
1232
+ # {'lang': 'en', 'text': 'i Phone '}
1233
+ # {'lang': 'zh', 'text': '15系列机型和三款'}
1234
+ # {'lang': 'en', 'text': 'Apple Watch '}
1235
+ # {'lang': 'zh', 'text': '等一系列新品,这次的'}
1236
+ # {'lang': 'en', 'text': 'i Pad Air '}
1237
+ # {'lang': 'zh', 'text': '采用了'}
1238
+ # {'lang': 'en', 'text': 'L C D '}
1239
+ # {'lang': 'zh', 'text': '屏幕'}
1240
+ # ===================【语种统计】===================
1241
+
1242
+ # ===================【语种统计】===================
1243
+ # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)]
1244
+
1245
+ # 输入内容的主要语言为 = zh ,字数 = 51
1246
+ # ==================================================
1247
+ # The main language of the input content is = zh, word count = 51
1248
+
1249
+
1250
+ if __name__ == "__main__":
1251
+ main()
src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/__init__.py ADDED
@@ -0,0 +1,24 @@
+ from .LangSegment import (
+     LangSegment,
+     classify,
+     getCounts,
+     getEnablePreview,
+     getfilters,
+     getKeepPinyin,
+     getLangMerge,
+     getPriorityThreshold,
+     getTexts,
+     printList,
+     setEnablePreview,
+     setfilters,
+     setKeepPinyin,
+     setLangMerge,
+     setPriorityThreshold,
+ )
+ 
+ # release
+ __version__ = "0.3.5"
+ 
+ 
+ # develop
+ __develop__ = "dev-0.0.1"
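+ 
+ # Illustrative usage of the re-exported API (a sketch, not part of the
+ # original file; the import path is abbreviated here):
+ #   from ...thirdparty.LangSegment import setfilters, getTexts
+ #   setfilters(["zh", "en"])
+ #   for seg in getTexts("hello 世界"):
+ #       print(seg["lang"], seg["text"])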
src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/utils/__init__.py ADDED
File without changes
src/YingMusicSinger/utils/f5_tts/thirdparty/LangSegment/utils/num.py ADDED
@@ -0,0 +1,332 @@
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # Digital processing from GPT_SoVITS num.py (thanks)
15
+ """
16
+ Rules to verbalize numbers into Chinese characters.
17
+ https://zh.wikipedia.org/wiki/中文数字#現代中文
18
+ """
19
+
20
+ import re
21
+ from collections import OrderedDict
22
+ from typing import List
23
+
24
+ DIGITS = {str(i): tran for i, tran in enumerate("零一二三四五六七八九")}
25
+ UNITS = OrderedDict(
26
+ {
27
+ 1: "十",
28
+ 2: "百",
29
+ 3: "千",
30
+ 4: "万",
31
+ 8: "亿",
32
+ }
33
+ )
34
+
35
+ COM_QUANTIFIERS = "(处|台|架|枚|趟|幅|平|方|堵|间|床|株|批|项|例|列|篇|栋|注|亩|封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)"
36
+
37
+ # 分数表达式
38
+ RE_FRAC = re.compile(r"(-?)(\d+)/(\d+)")
39
+
40
+
41
+ def replace_frac(match) -> str:
42
+ """
43
+ Args:
44
+ match (re.Match)
45
+ Returns:
46
+ str
47
+ """
48
+ sign = match.group(1)
49
+ nominator = match.group(2)
50
+ denominator = match.group(3)
51
+ sign: str = "负" if sign else ""
52
+ nominator: str = num2str(nominator)
53
+ denominator: str = num2str(denominator)
54
+ result = f"{sign}{denominator}分之{nominator}"
55
+ return result
56
+
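+ # Illustrative expectation (a sketch, not part of the original file):
+ #   RE_FRAC.sub(replace_frac, "大约3/4的样本") -> "大约四分之三的样本"
+ # replace_frac relies on num2str, which is defined further down in this module.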
57
+
58
+ # 百分数表达式
59
+ RE_PERCENTAGE = re.compile(r"(-?)(\d+(\.\d+)?)%")
60
+
61
+
62
+ def replace_percentage(match) -> str:
63
+ """
64
+ Args:
65
+ match (re.Match)
66
+ Returns:
67
+ str
68
+ """
69
+ sign = match.group(1)
70
+ percent = match.group(2)
71
+ sign: str = "负" if sign else ""
72
+ percent: str = num2str(percent)
73
+ result = f"{sign}百分之{percent}"
74
+ return result
75
+
76
+
77
+ # 整数表达式
78
+ # 带负号的整数 -10
79
+ RE_INTEGER = re.compile(r"(-)" r"(\d+)")
80
+
81
+
82
+ def replace_negative_num(match) -> str:
83
+ """
84
+ Args:
85
+ match (re.Match)
86
+ Returns:
87
+ str
88
+ """
89
+ sign = match.group(1)
90
+ number = match.group(2)
91
+ sign: str = "负" if sign else ""
92
+ number: str = num2str(number)
93
+ result = f"{sign}{number}"
94
+ return result
95
+
96
+
97
+ # 编号-无符号整形
98
+ # 00078
99
+ RE_DEFAULT_NUM = re.compile(r"\d{3}\d*")
100
+
101
+
102
+ def replace_default_num(match):
103
+ """
104
+ Args:
105
+ match (re.Match)
106
+ Returns:
107
+ str
108
+ """
109
+ number = match.group(0)
110
+ return verbalize_digit(number, alt_one=True)
111
+
112
+
113
+ # 加减乘除
114
+ # RE_ASMD = re.compile(
115
+ # r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))([\+\-\×÷=])((-?)((\d+)(\.\d+)?)|(\.(\d+)))')
116
+ RE_ASMD = re.compile(
117
+ r"((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))([\+\-\×÷=])((-?)((\d+)(\.\d+)?[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|(\.\d+[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*)|([A-Za-z][⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]*))"
118
+ )
119
+
120
+ asmd_map = {"+": "加", "-": "减", "×": "乘", "÷": "除", "=": "等于"}
121
+
122
+
123
+ def replace_asmd(match) -> str:
124
+ """
125
+ Args:
126
+ match (re.Match)
127
+ Returns:
128
+ str
129
+ """
130
+ result = match.group(1) + asmd_map[match.group(8)] + match.group(9)
131
+ return result
132
+
133
+
134
+ # 次方专项
135
+ RE_POWER = re.compile(r"[⁰¹²³⁴⁵⁶⁷⁸⁹ˣʸⁿ]+")
136
+
137
+ power_map = {
138
+ "⁰": "0",
139
+ "¹": "1",
140
+ "²": "2",
141
+ "³": "3",
142
+ "⁴": "4",
143
+ "⁵": "5",
144
+ "⁶": "6",
145
+ "⁷": "7",
146
+ "⁸": "8",
147
+ "⁹": "9",
148
+ "ˣ": "x",
149
+ "ʸ": "y",
150
+ "ⁿ": "n",
151
+ }
152
+
153
+
154
+ def replace_power(match) -> str:
155
+ """
156
+ Args:
157
+ match (re.Match)
158
+ Returns:
159
+ str
160
+ """
161
+ power_num = ""
162
+ for m in match.group(0):
163
+ power_num += power_map[m]
164
+ result = "的" + power_num + "次方"
165
+ return result
166
+
167
+
168
+ # 数字表达式
169
+ # 纯小数
170
+ RE_DECIMAL_NUM = re.compile(r"(-?)((\d+)(\.\d+))" r"|(\.(\d+))")
171
+ # 正整数 + 量词
172
+ RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS)
173
+ RE_NUMBER = re.compile(r"(-?)((\d+)(\.\d+)?)" r"|(\.(\d+))")
174
+
175
+
176
+ def replace_positive_quantifier(match) -> str:
177
+ """
178
+ Args:
179
+ match (re.Match)
180
+ Returns:
181
+ str
182
+ """
183
+ number = match.group(1)
184
+ match_2 = match.group(2)
185
+ if match_2 == "+":
186
+ match_2 = "多"
187
+ match_2: str = match_2 if match_2 else ""
188
+ quantifiers: str = match.group(3)
189
+ number: str = num2str(number)
190
+ result = f"{number}{match_2}{quantifiers}"
191
+ return result
192
+
193
+
194
+ def replace_number(match) -> str:
195
+ """
196
+ Args:
197
+ match (re.Match)
198
+ Returns:
199
+ str
200
+ """
201
+ sign = match.group(1)
202
+ number = match.group(2)
203
+ pure_decimal = match.group(5)
204
+ if pure_decimal:
205
+ result = num2str(pure_decimal)
206
+ else:
207
+ sign: str = "负" if sign else ""
208
+ number: str = num2str(number)
209
+ result = f"{sign}{number}"
210
+ return result
211
+
212
+
213
+ # 范围表达式
214
+ # match.group(1) and match.group(8) are copy from RE_NUMBER
215
+
216
+ RE_RANGE = re.compile(
217
+ r"""
218
+ (?<![\d\+\-\×÷=]) # 使用反向前瞻以确保数字范围之前没有其他数字和操作符
219
+ ((-?)((\d+)(\.\d+)?)) # 匹配范围起始的负数或正数(整数或小数)
220
+ [-~] # 匹配范围分隔符
221
+ ((-?)((\d+)(\.\d+)?)) # 匹配范围结束的负数或正数(整数或小数)
222
+ (?![\d\+\-\×÷=]) # 使用正向前瞻以确保数字范围之后没有其他数字和操作符
223
+ """,
224
+ re.VERBOSE,
225
+ )
226
+
227
+
228
+ def replace_range(match) -> str:
229
+ """
230
+ Args:
231
+ match (re.Match)
232
+ Returns:
233
+ str
234
+ """
235
+ first, second = match.group(1), match.group(6)
236
+ first = RE_NUMBER.sub(replace_number, first)
237
+ second = RE_NUMBER.sub(replace_number, second)
238
+ result = f"{first}到{second}"
239
+ return result
240
+
241
+
242
+ # ~至表达式
243
+ RE_TO_RANGE = re.compile(
244
+ r"((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)[~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))(%|°C|℃|度|摄氏度|cm2|cm²|cm3|cm³|cm|db|ds|kg|km|m2|m²|m³|m3|ml|m|mm|s)"
245
+ )
246
+
247
+
248
+ def replace_to_range(match) -> str:
249
+ """
250
+ Args:
251
+ match (re.Match)
252
+ Returns:
253
+ str
254
+ """
255
+ result = match.group(0).replace("~", "至")
256
+ return result
257
+
258
+
259
+ def _get_value(value_string: str, use_zero: bool = True) -> List[str]:
260
+ stripped = value_string.lstrip("0")
261
+ if len(stripped) == 0:
262
+ return []
263
+ elif len(stripped) == 1:
264
+ if use_zero and len(stripped) < len(value_string):
265
+ return [DIGITS["0"], DIGITS[stripped]]
266
+ else:
267
+ return [DIGITS[stripped]]
268
+ else:
269
+ largest_unit = next(
270
+ power for power in reversed(UNITS.keys()) if power < len(stripped)
271
+ )
272
+ first_part = value_string[:-largest_unit]
273
+ second_part = value_string[-largest_unit:]
274
+ return _get_value(first_part) + [UNITS[largest_unit]] + _get_value(second_part)
275
+
276
+
277
+ def verbalize_cardinal(value_string: str) -> str:
278
+ if not value_string:
279
+ return ""
280
+
281
+ # 000 -> '零' , 0 -> '零'
282
+ value_string = value_string.lstrip("0")
283
+ if len(value_string) == 0:
284
+ return DIGITS["0"]
285
+
286
+ result_symbols = _get_value(value_string)
287
+ # verbalized number starting with '一十*' is abbreviated as `十*`
288
+ if (
289
+ len(result_symbols) >= 2
290
+ and result_symbols[0] == DIGITS["1"]
291
+ and result_symbols[1] == UNITS[1]
292
+ ):
293
+ result_symbols = result_symbols[1:]
294
+ return "".join(result_symbols)
295
+
296
+
297
+ def verbalize_digit(value_string: str, alt_one=False) -> str:
298
+ result_symbols = [DIGITS[digit] for digit in value_string]
299
+ result = "".join(result_symbols)
300
+ if alt_one:
301
+ result = result.replace("一", "幺")
302
+ return result
303
+
304
+
305
+ def num2str(value_string: str) -> str:
306
+ integer_decimal = value_string.split(".")
307
+ if len(integer_decimal) == 1:
308
+ integer = integer_decimal[0]
309
+ decimal = ""
310
+ elif len(integer_decimal) == 2:
311
+ integer, decimal = integer_decimal
312
+ else:
313
+ raise ValueError(
314
+ f"The value string: '${value_string}' has more than one point in it."
315
+ )
316
+
317
+ result = verbalize_cardinal(integer)
318
+
319
+ decimal = decimal.rstrip("0")
320
+ if decimal:
321
+ # '.22' is verbalized as '零点二二'
322
+ # '3.20' is verbalized as '三点二'
323
+ result = result if result else "零"
324
+ result += "点" + verbalize_digit(decimal)
325
+ return result
326
+
327
+
328
+ if __name__ == "__main__":
329
+ text = ""
330
+ text = num2str(text)
331
+ print(text)
332
+ pass
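A small usage sketch of the verbalization helpers defined in this file, assuming the module is importable as num; the expected outputs are shown as comments.

from num import RE_FRAC, RE_PERCENTAGE, num2str, replace_frac, replace_percentage

print(num2str("1234.50"))                                   # 一千二百三十四点五
print(RE_PERCENTAGE.sub(replace_percentage, "涨幅-3.5%"))   # 涨幅负百分之三点五
print(RE_FRAC.sub(replace_frac, "1/3"))                     # 三分之一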
src/YingMusicSinger/utils/stable_audio_tools/__init__.py ADDED
File without changes
src/YingMusicSinger/utils/stable_audio_tools/adp.py ADDED
@@ -0,0 +1,1686 @@
1
+ # Copied and modified from https://github.com/archinetai/audio-diffusion-pytorch/blob/v0.0.94/audio_diffusion_pytorch/modules.py under MIT License
2
+ # License can be found in LICENSES/LICENSE_ADP.txt
3
+
4
+ import math
5
+ from inspect import isfunction
6
+ from math import ceil, floor, log, log2, pi
7
+ from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, TypeVar, Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from dac.nn.layers import Snake1d
12
+ from einops import rearrange, reduce, repeat
13
+ from einops.layers.torch import Rearrange
14
+ from einops_exts import rearrange_many
15
+ from packaging import version
16
+ from torch import Tensor, einsum
17
+ from torch.backends.cuda import sdp_kernel
18
+ from torch.nn import functional as F
19
+
20
+ """
21
+ Utils
22
+ """
23
+
24
+
25
+ class ConditionedSequential(nn.Module):
26
+ def __init__(self, *modules):
27
+ super().__init__()
28
+ self.module_list = nn.ModuleList(*modules)
29
+
30
+ def forward(self, x: Tensor, mapping: Optional[Tensor] = None):
31
+ for module in self.module_list:
32
+ x = module(x, mapping)
33
+ return x
34
+
35
+
36
+ T = TypeVar("T")
37
+
38
+
39
+ def default(val: Optional[T], d: Union[Callable[..., T], T]) -> T:
40
+ if exists(val):
41
+ return val
42
+ return d() if isfunction(d) else d
43
+
44
+
45
+ def exists(val: Optional[T]) -> T:
46
+ return val is not None
47
+
48
+
49
+ def closest_power_2(x: float) -> int:
50
+ exponent = log2(x)
51
+ distance_fn = lambda z: abs(x - 2**z) # noqa
52
+ exponent_closest = min((floor(exponent), ceil(exponent)), key=distance_fn)
53
+ return 2 ** int(exponent_closest)
54
+
55
+
56
+ def group_dict_by_prefix(prefix: str, d: Dict) -> Tuple[Dict, Dict]:
57
+ return_dicts: Tuple[Dict, Dict] = ({}, {})
58
+ for key in d.keys():
59
+ no_prefix = int(not key.startswith(prefix))
60
+ return_dicts[no_prefix][key] = d[key]
61
+ return return_dicts
62
+
63
+
64
+ def groupby(prefix: str, d: Dict, keep_prefix: bool = False) -> Tuple[Dict, Dict]:
65
+ kwargs_with_prefix, kwargs = group_dict_by_prefix(prefix, d)
66
+ if keep_prefix:
67
+ return kwargs_with_prefix, kwargs
68
+ kwargs_no_prefix = {k[len(prefix) :]: v for k, v in kwargs_with_prefix.items()}
69
+ return kwargs_no_prefix, kwargs
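A tiny illustration of the prefix-grouping helper above, which UNet1d later uses to split attention_* keyword arguments out of its **kwargs:

attention_kwargs, rest = groupby("attention_", {"attention_heads": 8, "attention_features": 64, "patch_size": 1})
# attention_kwargs == {"heads": 8, "features": 64}  (prefix stripped because keep_prefix=False)
# rest == {"patch_size": 1}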
70
+
71
+
72
+ """
73
+ Convolutional Blocks
74
+ """
75
+ import typing as tp
76
+
77
+ # Copied from https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conv.py under MIT License
78
+ # License available in LICENSES/LICENSE_META.txt
79
+
80
+
81
+ def get_extra_padding_for_conv1d(
82
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
83
+ ) -> int:
84
+ """See `pad_for_conv1d`."""
85
+ length = x.shape[-1]
86
+ n_frames = (length - kernel_size + padding_total) / stride + 1
87
+ ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
88
+ return ideal_length - length
89
+
90
+
91
+ def pad_for_conv1d(
92
+ x: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
93
+ ):
94
+ """Pad for a convolution to make sure that the last window is full.
95
+ Extra padding is added at the end. This is required to ensure that we can rebuild
96
+ an output of the same length, as otherwise, even with padding, some time steps
97
+ might get removed.
98
+ For instance, with total padding = 4, kernel size = 4, stride = 2:
99
+ 0 0 1 2 3 4 5 0 0 # (0s are padding)
100
+ 1 2 3 # (output frames of a convolution, last 0 is never used)
101
+ 0 0 1 2 3 4 5 0 # (output of tr. conv., but pos. 5 is going to get removed as padding)
102
+ 1 2 3 4 # once you removed padding, we are missing one time step !
103
+ """
104
+ extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)
105
+ return F.pad(x, (0, extra_padding))
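A worked instance of the docstring above, using the same kernel_size=4, stride=2, padding_total=4 setting (a sketch assuming these helpers are in scope):

import torch

x = torch.randn(1, 1, 7)
# n_frames = (7 - 4 + 4) / 2 + 1 = 4.5, so one extra sample is needed to fill the last window.
print(get_extra_padding_for_conv1d(x, kernel_size=4, stride=2, padding_total=4))  # 1
print(pad_for_conv1d(x, kernel_size=4, stride=2, padding_total=4).shape)          # torch.Size([1, 1, 8])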
106
+
107
+
108
+ def pad1d(
109
+ x: torch.Tensor,
110
+ paddings: tp.Tuple[int, int],
111
+ mode: str = "constant",
112
+ value: float = 0.0,
113
+ ):
114
+ """Tiny wrapper around F.pad, just to allow for reflect padding on small input.
115
+ If this is the case, we insert extra 0 padding to the right before the reflection happen.
116
+ """
117
+ length = x.shape[-1]
118
+ padding_left, padding_right = paddings
119
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
120
+ if mode == "reflect":
121
+ max_pad = max(padding_left, padding_right)
122
+ extra_pad = 0
123
+ if length <= max_pad:
124
+ extra_pad = max_pad - length + 1
125
+ x = F.pad(x, (0, extra_pad))
126
+ padded = F.pad(x, paddings, mode, value)
127
+ end = padded.shape[-1] - extra_pad
128
+ return padded[..., :end]
129
+ else:
130
+ return F.pad(x, paddings, mode, value)
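Why the wrapper exists: plain reflect padding raises when the requested padding is not smaller than the input length, whereas pad1d zero-extends the signal first and trims the surplus afterwards. A quick sketch, assuming pad1d is in scope:

import torch

x = torch.arange(3.0).view(1, 1, 3)
# torch.nn.functional.pad(x, (4, 4), mode="reflect") would raise because 4 >= the input length of 3.
print(pad1d(x, (4, 4), mode="reflect").shape)  # torch.Size([1, 1, 11])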
131
+
132
+
133
+ def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
134
+ """Remove padding from x, handling properly zero padding. Only for 1d!"""
135
+ padding_left, padding_right = paddings
136
+ assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
137
+ assert (padding_left + padding_right) <= x.shape[-1]
138
+ end = x.shape[-1] - padding_right
139
+ return x[..., padding_left:end]
140
+
141
+
142
+ class Conv1d(nn.Conv1d):
143
+ def __init__(self, *args, **kwargs):
144
+ super().__init__(*args, **kwargs)
145
+
146
+ def forward(self, x: Tensor, causal=False) -> Tensor:
147
+ kernel_size = self.kernel_size[0]
148
+ stride = self.stride[0]
149
+ dilation = self.dilation[0]
150
+ kernel_size = (
151
+ kernel_size - 1
152
+ ) * dilation + 1 # effective kernel size with dilations
153
+ padding_total = kernel_size - stride
154
+ extra_padding = get_extra_padding_for_conv1d(
155
+ x, kernel_size, stride, padding_total
156
+ )
157
+ if causal:
158
+ # Left padding for causal
159
+ x = pad1d(x, (padding_total, extra_padding))
160
+ else:
161
+ # Asymmetric padding required for odd strides
162
+ padding_right = padding_total // 2
163
+ padding_left = padding_total - padding_right
164
+ x = pad1d(x, (padding_left, padding_right + extra_padding))
165
+ return super().forward(x)
166
+
167
+
168
+ class ConvTranspose1d(nn.ConvTranspose1d):
169
+ def __init__(self, *args, **kwargs):
170
+ super().__init__(*args, **kwargs)
171
+
172
+ def forward(self, x: Tensor, causal=False) -> Tensor:
173
+ kernel_size = self.kernel_size[0]
174
+ stride = self.stride[0]
175
+ padding_total = kernel_size - stride
176
+
177
+ y = super().forward(x)
178
+
179
+ # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
180
+ # removed at the very end, when keeping only the right length for the output,
181
+ # as removing it here would require also passing the length at the matching layer
182
+ # in the encoder.
183
+ if causal:
184
+ padding_right = ceil(padding_total)
185
+ padding_left = padding_total - padding_right
186
+ y = unpad1d(y, (padding_left, padding_right))
187
+ else:
188
+ # Asymmetric padding required for odd strides
189
+ padding_right = padding_total // 2
190
+ padding_left = padding_total - padding_right
191
+ y = unpad1d(y, (padding_left, padding_right))
192
+ return y
193
+
194
+
195
+ def Downsample1d(
196
+ in_channels: int, out_channels: int, factor: int, kernel_multiplier: int = 2
197
+ ) -> nn.Module:
198
+ assert kernel_multiplier % 2 == 0, "Kernel multiplier must be even"
199
+
200
+ return Conv1d(
201
+ in_channels=in_channels,
202
+ out_channels=out_channels,
203
+ kernel_size=factor * kernel_multiplier + 1,
204
+ stride=factor,
205
+ )
206
+
207
+
208
+ def Upsample1d(
209
+ in_channels: int, out_channels: int, factor: int, use_nearest: bool = False
210
+ ) -> nn.Module:
211
+ if factor == 1:
212
+ return Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3)
213
+
214
+ if use_nearest:
215
+ return nn.Sequential(
216
+ nn.Upsample(scale_factor=factor, mode="nearest"),
217
+ Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=3),
218
+ )
219
+ else:
220
+ return ConvTranspose1d(
221
+ in_channels=in_channels,
222
+ out_channels=out_channels,
223
+ kernel_size=factor * 2,
224
+ stride=factor,
225
+ )
226
+
227
+
228
+ class ConvBlock1d(nn.Module):
229
+ def __init__(
230
+ self,
231
+ in_channels: int,
232
+ out_channels: int,
233
+ *,
234
+ kernel_size: int = 3,
235
+ stride: int = 1,
236
+ dilation: int = 1,
237
+ num_groups: int = 8,
238
+ use_norm: bool = True,
239
+ use_snake: bool = False,
240
+ ) -> None:
241
+ super().__init__()
242
+
243
+ self.groupnorm = (
244
+ nn.GroupNorm(num_groups=num_groups, num_channels=in_channels)
245
+ if use_norm
246
+ else nn.Identity()
247
+ )
248
+
249
+ if use_snake:
250
+ self.activation = Snake1d(in_channels)
251
+ else:
252
+ self.activation = nn.SiLU()
253
+
254
+ self.project = Conv1d(
255
+ in_channels=in_channels,
256
+ out_channels=out_channels,
257
+ kernel_size=kernel_size,
258
+ stride=stride,
259
+ dilation=dilation,
260
+ )
261
+
262
+ def forward(
263
+ self,
264
+ x: Tensor,
265
+ scale_shift: Optional[Tuple[Tensor, Tensor]] = None,
266
+ causal=False,
267
+ ) -> Tensor:
268
+ x = self.groupnorm(x)
269
+ if exists(scale_shift):
270
+ scale, shift = scale_shift
271
+ x = x * (scale + 1) + shift
272
+ x = self.activation(x)
273
+ return self.project(x, causal=causal)
274
+
275
+
276
+ class MappingToScaleShift(nn.Module):
277
+ def __init__(
278
+ self,
279
+ features: int,
280
+ channels: int,
281
+ ):
282
+ super().__init__()
283
+
284
+ self.to_scale_shift = nn.Sequential(
285
+ nn.SiLU(),
286
+ nn.Linear(in_features=features, out_features=channels * 2),
287
+ )
288
+
289
+ def forward(self, mapping: Tensor) -> Tuple[Tensor, Tensor]:
290
+ scale_shift = self.to_scale_shift(mapping)
291
+ scale_shift = rearrange(scale_shift, "b c -> b c 1")
292
+ scale, shift = scale_shift.chunk(2, dim=1)
293
+ return scale, shift
294
+
295
+
296
+ class ResnetBlock1d(nn.Module):
297
+ def __init__(
298
+ self,
299
+ in_channels: int,
300
+ out_channels: int,
301
+ *,
302
+ kernel_size: int = 3,
303
+ stride: int = 1,
304
+ dilation: int = 1,
305
+ use_norm: bool = True,
306
+ use_snake: bool = False,
307
+ num_groups: int = 8,
308
+ context_mapping_features: Optional[int] = None,
309
+ ) -> None:
310
+ super().__init__()
311
+
312
+ self.use_mapping = exists(context_mapping_features)
313
+
314
+ self.block1 = ConvBlock1d(
315
+ in_channels=in_channels,
316
+ out_channels=out_channels,
317
+ kernel_size=kernel_size,
318
+ stride=stride,
319
+ dilation=dilation,
320
+ use_norm=use_norm,
321
+ num_groups=num_groups,
322
+ use_snake=use_snake,
323
+ )
324
+
325
+ if self.use_mapping:
326
+ assert exists(context_mapping_features)
327
+ self.to_scale_shift = MappingToScaleShift(
328
+ features=context_mapping_features, channels=out_channels
329
+ )
330
+
331
+ self.block2 = ConvBlock1d(
332
+ in_channels=out_channels,
333
+ out_channels=out_channels,
334
+ use_norm=use_norm,
335
+ num_groups=num_groups,
336
+ use_snake=use_snake,
337
+ )
338
+
339
+ self.to_out = (
340
+ Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1)
341
+ if in_channels != out_channels
342
+ else nn.Identity()
343
+ )
344
+
345
+ def forward(
346
+ self, x: Tensor, mapping: Optional[Tensor] = None, causal=False
347
+ ) -> Tensor:
348
+ assert_message = "context mapping required if context_mapping_features > 0"
349
+ assert not (self.use_mapping ^ exists(mapping)), assert_message
350
+
351
+ h = self.block1(x, causal=causal)
352
+
353
+ scale_shift = None
354
+ if self.use_mapping:
355
+ scale_shift = self.to_scale_shift(mapping)
356
+
357
+ h = self.block2(h, scale_shift=scale_shift, causal=causal)
358
+
359
+ return h + self.to_out(x)
360
+
361
+
362
+ class Patcher(nn.Module):
363
+ def __init__(
364
+ self,
365
+ in_channels: int,
366
+ out_channels: int,
367
+ patch_size: int,
368
+ context_mapping_features: Optional[int] = None,
369
+ use_snake: bool = False,
370
+ ):
371
+ super().__init__()
372
+ assert_message = f"out_channels must be divisible by patch_size ({patch_size})"
373
+ assert out_channels % patch_size == 0, assert_message
374
+ self.patch_size = patch_size
375
+
376
+ self.block = ResnetBlock1d(
377
+ in_channels=in_channels,
378
+ out_channels=out_channels // patch_size,
379
+ num_groups=1,
380
+ context_mapping_features=context_mapping_features,
381
+ use_snake=use_snake,
382
+ )
383
+
384
+ def forward(
385
+ self, x: Tensor, mapping: Optional[Tensor] = None, causal=False
386
+ ) -> Tensor:
387
+ x = self.block(x, mapping, causal=causal)
388
+ x = rearrange(x, "b c (l p) -> b (c p) l", p=self.patch_size)
389
+ return x
390
+
391
+
392
+ class Unpatcher(nn.Module):
393
+ def __init__(
394
+ self,
395
+ in_channels: int,
396
+ out_channels: int,
397
+ patch_size: int,
398
+ context_mapping_features: Optional[int] = None,
399
+ use_snake: bool = False,
400
+ ):
401
+ super().__init__()
402
+ assert_message = f"in_channels must be divisible by patch_size ({patch_size})"
403
+ assert in_channels % patch_size == 0, assert_message
404
+ self.patch_size = patch_size
405
+
406
+ self.block = ResnetBlock1d(
407
+ in_channels=in_channels // patch_size,
408
+ out_channels=out_channels,
409
+ num_groups=1,
410
+ context_mapping_features=context_mapping_features,
411
+ use_snake=use_snake,
412
+ )
413
+
414
+ def forward(
415
+ self, x: Tensor, mapping: Optional[Tensor] = None, causal=False
416
+ ) -> Tensor:
417
+ x = rearrange(x, " b (c p) l -> b c (l p) ", p=self.patch_size)
418
+ x = self.block(x, mapping, causal=causal)
419
+ return x
420
+
421
+
422
+ """
423
+ Attention Components
424
+ """
425
+
426
+
427
+ def FeedForward(features: int, multiplier: int) -> nn.Module:
428
+ mid_features = features * multiplier
429
+ return nn.Sequential(
430
+ nn.Linear(in_features=features, out_features=mid_features),
431
+ nn.GELU(),
432
+ nn.Linear(in_features=mid_features, out_features=features),
433
+ )
434
+
435
+
436
+ def add_mask(sim: Tensor, mask: Tensor) -> Tensor:
437
+ b, ndim = sim.shape[0], mask.ndim
438
+ if ndim == 3:
439
+ mask = rearrange(mask, "b n m -> b 1 n m")
440
+ if ndim == 2:
441
+ mask = repeat(mask, "n m -> b 1 n m", b=b)
442
+ max_neg_value = -torch.finfo(sim.dtype).max
443
+ sim = sim.masked_fill(~mask, max_neg_value)
444
+ return sim
445
+
446
+
447
+ def causal_mask(q: Tensor, k: Tensor) -> Tensor:
448
+ b, i, j, device = q.shape[0], q.shape[-2], k.shape[-2], q.device
449
+ mask = ~torch.ones((i, j), dtype=torch.bool, device=device).triu(j - i + 1)
450
+ mask = repeat(mask, "n m -> b n m", b=b)
451
+ return mask
452
+
453
+
454
+ class AttentionBase(nn.Module):
455
+ def __init__(
456
+ self,
457
+ features: int,
458
+ *,
459
+ head_features: int,
460
+ num_heads: int,
461
+ out_features: Optional[int] = None,
462
+ ):
463
+ super().__init__()
464
+ self.scale = head_features**-0.5
465
+ self.num_heads = num_heads
466
+ mid_features = head_features * num_heads
467
+ out_features = default(out_features, features)
468
+
469
+ self.to_out = nn.Linear(in_features=mid_features, out_features=out_features)
470
+
471
+ self.use_flash = torch.cuda.is_available() and version.parse(
472
+ torch.__version__
473
+ ) >= version.parse("2.0.0")
474
+
475
+ if not self.use_flash:
476
+ return
477
+
478
+ device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
479
+
480
+ if device_properties.major == 8 and device_properties.minor == 0:
481
+ # Use flash attention for A100 GPUs
482
+ self.sdp_kernel_config = (True, False, False)
483
+ else:
484
+ # Don't use flash attention for other GPUs
485
+ self.sdp_kernel_config = (False, True, True)
486
+
487
+ def forward(
488
+ self,
489
+ q: Tensor,
490
+ k: Tensor,
491
+ v: Tensor,
492
+ mask: Optional[Tensor] = None,
493
+ is_causal: bool = False,
494
+ ) -> Tensor:
495
+ # Split heads
496
+ q, k, v = rearrange_many((q, k, v), "b n (h d) -> b h n d", h=self.num_heads)
497
+
498
+ if not self.use_flash:
499
+ if is_causal and not mask:
500
+ # Mask out future tokens for causal attention
501
+ mask = causal_mask(q, k)
502
+
503
+ # Compute similarity matrix and add eventual mask
504
+ sim = einsum("... n d, ... m d -> ... n m", q, k) * self.scale
505
+ sim = add_mask(sim, mask) if exists(mask) else sim
506
+
507
+ # Get attention matrix with softmax
508
+ attn = sim.softmax(dim=-1, dtype=torch.float32)
509
+
510
+ # Compute values
511
+ out = einsum("... n m, ... m d -> ... n d", attn, v)
512
+ else:
513
+ with sdp_kernel(*self.sdp_kernel_config):
514
+ out = F.scaled_dot_product_attention(
515
+ q, k, v, attn_mask=mask, is_causal=is_causal
516
+ )
517
+
518
+ out = rearrange(out, "b h n d -> b n (h d)")
519
+ return self.to_out(out)
520
+
521
+
522
+ class Attention(nn.Module):
523
+ def __init__(
524
+ self,
525
+ features: int,
526
+ *,
527
+ head_features: int,
528
+ num_heads: int,
529
+ out_features: Optional[int] = None,
530
+ context_features: Optional[int] = None,
531
+ causal: bool = False,
532
+ ):
533
+ super().__init__()
534
+ self.context_features = context_features
535
+ self.causal = causal
536
+ mid_features = head_features * num_heads
537
+ context_features = default(context_features, features)
538
+
539
+ self.norm = nn.LayerNorm(features)
540
+ self.norm_context = nn.LayerNorm(context_features)
541
+ self.to_q = nn.Linear(
542
+ in_features=features, out_features=mid_features, bias=False
543
+ )
544
+ self.to_kv = nn.Linear(
545
+ in_features=context_features, out_features=mid_features * 2, bias=False
546
+ )
547
+ self.attention = AttentionBase(
548
+ features,
549
+ num_heads=num_heads,
550
+ head_features=head_features,
551
+ out_features=out_features,
552
+ )
553
+
554
+ def forward(
555
+ self,
556
+ x: Tensor, # [b, n, c]
557
+ context: Optional[Tensor] = None, # [b, m, d]
558
+ context_mask: Optional[Tensor] = None, # [b, m], false is masked,
559
+ causal: Optional[bool] = False,
560
+ ) -> Tensor:
561
+ assert_message = "You must provide a context when using context_features"
562
+ assert not self.context_features or exists(context), assert_message
563
+ # Use context if provided
564
+ context = default(context, x)
565
+ # Normalize then compute q from input and k,v from context
566
+ x, context = self.norm(x), self.norm_context(context)
567
+
568
+ q, k, v = (self.to_q(x), *torch.chunk(self.to_kv(context), chunks=2, dim=-1))
569
+
570
+ if exists(context_mask):
571
+ # Mask out cross-attention for padding tokens
572
+ mask = repeat(context_mask, "b m -> b m d", d=v.shape[-1])
573
+ k, v = k * mask, v * mask
574
+
575
+ # Compute and return attention
576
+ return self.attention(q, k, v, is_causal=self.causal or causal)
577
+
578
+
579
+ def FeedForward(features: int, multiplier: int) -> nn.Module:
580
+ mid_features = features * multiplier
581
+ return nn.Sequential(
582
+ nn.Linear(in_features=features, out_features=mid_features),
583
+ nn.GELU(),
584
+ nn.Linear(in_features=mid_features, out_features=features),
585
+ )
586
+
587
+
588
+ """
589
+ Transformer Blocks
590
+ """
591
+
592
+
593
+ class TransformerBlock(nn.Module):
594
+ def __init__(
595
+ self,
596
+ features: int,
597
+ num_heads: int,
598
+ head_features: int,
599
+ multiplier: int,
600
+ context_features: Optional[int] = None,
601
+ ):
602
+ super().__init__()
603
+
604
+ self.use_cross_attention = exists(context_features) and context_features > 0
605
+
606
+ self.attention = Attention(
607
+ features=features, num_heads=num_heads, head_features=head_features
608
+ )
609
+
610
+ if self.use_cross_attention:
611
+ self.cross_attention = Attention(
612
+ features=features,
613
+ num_heads=num_heads,
614
+ head_features=head_features,
615
+ context_features=context_features,
616
+ )
617
+
618
+ self.feed_forward = FeedForward(features=features, multiplier=multiplier)
619
+
620
+ def forward(
621
+ self,
622
+ x: Tensor,
623
+ *,
624
+ context: Optional[Tensor] = None,
625
+ context_mask: Optional[Tensor] = None,
626
+ causal: Optional[bool] = False,
627
+ ) -> Tensor:
628
+ x = self.attention(x, causal=causal) + x
629
+ if self.use_cross_attention:
630
+ x = self.cross_attention(x, context=context, context_mask=context_mask) + x
631
+ x = self.feed_forward(x) + x
632
+ return x
633
+
634
+
635
+ """
636
+ Transformers
637
+ """
638
+
639
+
640
+ class Transformer1d(nn.Module):
641
+ def __init__(
642
+ self,
643
+ num_layers: int,
644
+ channels: int,
645
+ num_heads: int,
646
+ head_features: int,
647
+ multiplier: int,
648
+ context_features: Optional[int] = None,
649
+ ):
650
+ super().__init__()
651
+
652
+ self.to_in = nn.Sequential(
653
+ nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6, affine=True),
654
+ Conv1d(
655
+ in_channels=channels,
656
+ out_channels=channels,
657
+ kernel_size=1,
658
+ ),
659
+ Rearrange("b c t -> b t c"),
660
+ )
661
+
662
+ self.blocks = nn.ModuleList(
663
+ [
664
+ TransformerBlock(
665
+ features=channels,
666
+ head_features=head_features,
667
+ num_heads=num_heads,
668
+ multiplier=multiplier,
669
+ context_features=context_features,
670
+ )
671
+ for i in range(num_layers)
672
+ ]
673
+ )
674
+
675
+ self.to_out = nn.Sequential(
676
+ Rearrange("b t c -> b c t"),
677
+ Conv1d(
678
+ in_channels=channels,
679
+ out_channels=channels,
680
+ kernel_size=1,
681
+ ),
682
+ )
683
+
684
+ def forward(
685
+ self,
686
+ x: Tensor,
687
+ *,
688
+ context: Optional[Tensor] = None,
689
+ context_mask: Optional[Tensor] = None,
690
+ causal=False,
691
+ ) -> Tensor:
692
+ x = self.to_in(x)
693
+ for block in self.blocks:
694
+ x = block(x, context=context, context_mask=context_mask, causal=causal)
695
+ x = self.to_out(x)
696
+ return x
697
+
698
+
699
+ """
700
+ Time Embeddings
701
+ """
702
+
703
+
704
+ class SinusoidalEmbedding(nn.Module):
705
+ def __init__(self, dim: int):
706
+ super().__init__()
707
+ self.dim = dim
708
+
709
+ def forward(self, x: Tensor) -> Tensor:
710
+ device, half_dim = x.device, self.dim // 2
711
+ emb = torch.tensor(log(10000) / (half_dim - 1), device=device)
712
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
713
+ emb = rearrange(x, "i -> i 1") * rearrange(emb, "j -> 1 j")
714
+ return torch.cat((emb.sin(), emb.cos()), dim=-1)
715
+
716
+
717
+ class LearnedPositionalEmbedding(nn.Module):
718
+ """Used for continuous time"""
719
+
720
+ def __init__(self, dim: int):
721
+ super().__init__()
722
+ assert (dim % 2) == 0
723
+ half_dim = dim // 2
724
+ self.weights = nn.Parameter(torch.randn(half_dim))
725
+
726
+ def forward(self, x: Tensor) -> Tensor:
727
+ x = rearrange(x, "b -> b 1")
728
+ freqs = x * rearrange(self.weights, "d -> 1 d") * 2 * pi
729
+ fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
730
+ fouriered = torch.cat((x, fouriered), dim=-1)
731
+ return fouriered
732
+
733
+
734
+ def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
735
+ return nn.Sequential(
736
+ LearnedPositionalEmbedding(dim),
737
+ nn.Linear(in_features=dim + 1, out_features=out_features),
738
+ )
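A short sketch of how the continuous-time embedding above is used: one scalar timestep per batch element, and the Linear consumes dim + 1 features because the raw timestep is concatenated onto the Fourier features (assumes the classes above are in scope):

import torch

to_time = TimePositionalEmbedding(dim=64, out_features=256)
t = torch.rand(8)    # continuous timesteps, one per batch element
emb = to_time(t)     # LearnedPositionalEmbedding -> (8, 65) -> Linear -> (8, 256)
print(emb.shape)     # torch.Size([8, 256])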
739
+
740
+
741
+ """
742
+ Encoder/Decoder Components
743
+ """
744
+
745
+
746
+ class DownsampleBlock1d(nn.Module):
747
+ def __init__(
748
+ self,
749
+ in_channels: int,
750
+ out_channels: int,
751
+ *,
752
+ factor: int,
753
+ num_groups: int,
754
+ num_layers: int,
755
+ kernel_multiplier: int = 2,
756
+ use_pre_downsample: bool = True,
757
+ use_skip: bool = False,
758
+ use_snake: bool = False,
759
+ extract_channels: int = 0,
760
+ context_channels: int = 0,
761
+ num_transformer_blocks: int = 0,
762
+ attention_heads: Optional[int] = None,
763
+ attention_features: Optional[int] = None,
764
+ attention_multiplier: Optional[int] = None,
765
+ context_mapping_features: Optional[int] = None,
766
+ context_embedding_features: Optional[int] = None,
767
+ ):
768
+ super().__init__()
769
+ self.use_pre_downsample = use_pre_downsample
770
+ self.use_skip = use_skip
771
+ self.use_transformer = num_transformer_blocks > 0
772
+ self.use_extract = extract_channels > 0
773
+ self.use_context = context_channels > 0
774
+
775
+ channels = out_channels if use_pre_downsample else in_channels
776
+
777
+ self.downsample = Downsample1d(
778
+ in_channels=in_channels,
779
+ out_channels=out_channels,
780
+ factor=factor,
781
+ kernel_multiplier=kernel_multiplier,
782
+ )
783
+
784
+ self.blocks = nn.ModuleList(
785
+ [
786
+ ResnetBlock1d(
787
+ in_channels=channels + context_channels if i == 0 else channels,
788
+ out_channels=channels,
789
+ num_groups=num_groups,
790
+ context_mapping_features=context_mapping_features,
791
+ use_snake=use_snake,
792
+ )
793
+ for i in range(num_layers)
794
+ ]
795
+ )
796
+
797
+ if self.use_transformer:
798
+ assert (exists(attention_heads) or exists(attention_features)) and exists(
799
+ attention_multiplier
800
+ )
801
+
802
+ if attention_features is None and attention_heads is not None:
803
+ attention_features = channels // attention_heads
804
+
805
+ if attention_heads is None and attention_features is not None:
806
+ attention_heads = channels // attention_features
807
+
808
+ self.transformer = Transformer1d(
809
+ num_layers=num_transformer_blocks,
810
+ channels=channels,
811
+ num_heads=attention_heads,
812
+ head_features=attention_features,
813
+ multiplier=attention_multiplier,
814
+ context_features=context_embedding_features,
815
+ )
816
+
817
+ if self.use_extract:
818
+ num_extract_groups = min(num_groups, extract_channels)
819
+ self.to_extracted = ResnetBlock1d(
820
+ in_channels=out_channels,
821
+ out_channels=extract_channels,
822
+ num_groups=num_extract_groups,
823
+ use_snake=use_snake,
824
+ )
825
+
826
+ def forward(
827
+ self,
828
+ x: Tensor,
829
+ *,
830
+ mapping: Optional[Tensor] = None,
831
+ channels: Optional[Tensor] = None,
832
+ embedding: Optional[Tensor] = None,
833
+ embedding_mask: Optional[Tensor] = None,
834
+ causal: Optional[bool] = False,
835
+ ) -> Union[Tuple[Tensor, List[Tensor]], Tensor]:
836
+ if self.use_pre_downsample:
837
+ x = self.downsample(x)
838
+
839
+ if self.use_context and exists(channels):
840
+ x = torch.cat([x, channels], dim=1)
841
+
842
+ skips = []
843
+ for block in self.blocks:
844
+ x = block(x, mapping=mapping, causal=causal)
845
+ skips += [x] if self.use_skip else []
846
+
847
+ if self.use_transformer:
848
+ x = self.transformer(
849
+ x, context=embedding, context_mask=embedding_mask, causal=causal
850
+ )
851
+ skips += [x] if self.use_skip else []
852
+
853
+ if not self.use_pre_downsample:
854
+ x = self.downsample(x)
855
+
856
+ if self.use_extract:
857
+ extracted = self.to_extracted(x)
858
+ return x, extracted
859
+
860
+ return (x, skips) if self.use_skip else x
861
+
862
+
863
+ class UpsampleBlock1d(nn.Module):
864
+ def __init__(
865
+ self,
866
+ in_channels: int,
867
+ out_channels: int,
868
+ *,
869
+ factor: int,
870
+ num_layers: int,
871
+ num_groups: int,
872
+ use_nearest: bool = False,
873
+ use_pre_upsample: bool = False,
874
+ use_skip: bool = False,
875
+ use_snake: bool = False,
876
+ skip_channels: int = 0,
877
+ use_skip_scale: bool = False,
878
+ extract_channels: int = 0,
879
+ num_transformer_blocks: int = 0,
880
+ attention_heads: Optional[int] = None,
881
+ attention_features: Optional[int] = None,
882
+ attention_multiplier: Optional[int] = None,
883
+ context_mapping_features: Optional[int] = None,
884
+ context_embedding_features: Optional[int] = None,
885
+ ):
886
+ super().__init__()
887
+
888
+ self.use_extract = extract_channels > 0
889
+ self.use_pre_upsample = use_pre_upsample
890
+ self.use_transformer = num_transformer_blocks > 0
891
+ self.use_skip = use_skip
892
+ self.skip_scale = 2**-0.5 if use_skip_scale else 1.0
893
+
894
+ channels = out_channels if use_pre_upsample else in_channels
895
+
896
+ self.blocks = nn.ModuleList(
897
+ [
898
+ ResnetBlock1d(
899
+ in_channels=channels + skip_channels,
900
+ out_channels=channels,
901
+ num_groups=num_groups,
902
+ context_mapping_features=context_mapping_features,
903
+ use_snake=use_snake,
904
+ )
905
+ for _ in range(num_layers)
906
+ ]
907
+ )
908
+
909
+ if self.use_transformer:
910
+ assert (exists(attention_heads) or exists(attention_features)) and exists(
911
+ attention_multiplier
912
+ )
913
+
914
+ if attention_features is None and attention_heads is not None:
915
+ attention_features = channels // attention_heads
916
+
917
+ if attention_heads is None and attention_features is not None:
918
+ attention_heads = channels // attention_features
919
+
920
+ self.transformer = Transformer1d(
921
+ num_layers=num_transformer_blocks,
922
+ channels=channels,
923
+ num_heads=attention_heads,
924
+ head_features=attention_features,
925
+ multiplier=attention_multiplier,
926
+ context_features=context_embedding_features,
927
+ )
928
+
929
+ self.upsample = Upsample1d(
930
+ in_channels=in_channels,
931
+ out_channels=out_channels,
932
+ factor=factor,
933
+ use_nearest=use_nearest,
934
+ )
935
+
936
+ if self.use_extract:
937
+ num_extract_groups = min(num_groups, extract_channels)
938
+ self.to_extracted = ResnetBlock1d(
939
+ in_channels=out_channels,
940
+ out_channels=extract_channels,
941
+ num_groups=num_extract_groups,
942
+ use_snake=use_snake,
943
+ )
944
+
945
+ def add_skip(self, x: Tensor, skip: Tensor) -> Tensor:
946
+ return torch.cat([x, skip * self.skip_scale], dim=1)
947
+
948
+ def forward(
949
+ self,
950
+ x: Tensor,
951
+ *,
952
+ skips: Optional[List[Tensor]] = None,
953
+ mapping: Optional[Tensor] = None,
954
+ embedding: Optional[Tensor] = None,
955
+ embedding_mask: Optional[Tensor] = None,
956
+ causal: Optional[bool] = False,
957
+ ) -> Union[Tuple[Tensor, Tensor], Tensor]:
958
+ if self.use_pre_upsample:
959
+ x = self.upsample(x)
960
+
961
+ for block in self.blocks:
962
+ x = self.add_skip(x, skip=skips.pop()) if exists(skips) else x
963
+ x = block(x, mapping=mapping, causal=causal)
964
+
965
+ if self.use_transformer:
966
+ x = self.transformer(
967
+ x, context=embedding, context_mask=embedding_mask, causal=causal
968
+ )
969
+
970
+ if not self.use_pre_upsample:
971
+ x = self.upsample(x)
972
+
973
+ if self.use_extract:
974
+ extracted = self.to_extracted(x)
975
+ return x, extracted
976
+
977
+ return x
978
+
979
+
980
+ class BottleneckBlock1d(nn.Module):
981
+ def __init__(
982
+ self,
983
+ channels: int,
984
+ *,
985
+ num_groups: int,
986
+ num_transformer_blocks: int = 0,
987
+ attention_heads: Optional[int] = None,
988
+ attention_features: Optional[int] = None,
989
+ attention_multiplier: Optional[int] = None,
990
+ context_mapping_features: Optional[int] = None,
991
+ context_embedding_features: Optional[int] = None,
992
+ use_snake: bool = False,
993
+ ):
994
+ super().__init__()
995
+ self.use_transformer = num_transformer_blocks > 0
996
+
997
+ self.pre_block = ResnetBlock1d(
998
+ in_channels=channels,
999
+ out_channels=channels,
1000
+ num_groups=num_groups,
1001
+ context_mapping_features=context_mapping_features,
1002
+ use_snake=use_snake,
1003
+ )
1004
+
1005
+ if self.use_transformer:
1006
+ assert (exists(attention_heads) or exists(attention_features)) and exists(
1007
+ attention_multiplier
1008
+ )
1009
+
1010
+ if attention_features is None and attention_heads is not None:
1011
+ attention_features = channels // attention_heads
1012
+
1013
+ if attention_heads is None and attention_features is not None:
1014
+ attention_heads = channels // attention_features
1015
+
1016
+ self.transformer = Transformer1d(
1017
+ num_layers=num_transformer_blocks,
1018
+ channels=channels,
1019
+ num_heads=attention_heads,
1020
+ head_features=attention_features,
1021
+ multiplier=attention_multiplier,
1022
+ context_features=context_embedding_features,
1023
+ )
1024
+
1025
+ self.post_block = ResnetBlock1d(
1026
+ in_channels=channels,
1027
+ out_channels=channels,
1028
+ num_groups=num_groups,
1029
+ context_mapping_features=context_mapping_features,
1030
+ use_snake=use_snake,
1031
+ )
1032
+
1033
+ def forward(
1034
+ self,
1035
+ x: Tensor,
1036
+ *,
1037
+ mapping: Optional[Tensor] = None,
1038
+ embedding: Optional[Tensor] = None,
1039
+ embedding_mask: Optional[Tensor] = None,
1040
+ causal: Optional[bool] = False,
1041
+ ) -> Tensor:
1042
+ x = self.pre_block(x, mapping=mapping, causal=causal)
1043
+ if self.use_transformer:
1044
+ x = self.transformer(
1045
+ x, context=embedding, context_mask=embedding_mask, causal=causal
1046
+ )
1047
+ x = self.post_block(x, mapping=mapping, causal=causal)
1048
+ return x
1049
+
1050
+
1051
+ """
1052
+ UNet
1053
+ """
1054
+
1055
+
1056
+ class UNet1d(nn.Module):
1057
+ def __init__(
1058
+ self,
1059
+ in_channels: int,
1060
+ channels: int,
1061
+ multipliers: Sequence[int],
1062
+ factors: Sequence[int],
1063
+ num_blocks: Sequence[int],
1064
+ attentions: Sequence[int],
1065
+ patch_size: int = 1,
1066
+ resnet_groups: int = 8,
1067
+ use_context_time: bool = True,
1068
+ kernel_multiplier_downsample: int = 2,
1069
+ use_nearest_upsample: bool = False,
1070
+ use_skip_scale: bool = True,
1071
+ use_snake: bool = False,
1072
+ use_stft: bool = False,
1073
+ use_stft_context: bool = False,
1074
+ out_channels: Optional[int] = None,
1075
+ context_features: Optional[int] = None,
1076
+ context_features_multiplier: int = 4,
1077
+ context_channels: Optional[Sequence[int]] = None,
1078
+ context_embedding_features: Optional[int] = None,
1079
+ **kwargs,
1080
+ ):
1081
+ super().__init__()
1082
+ out_channels = default(out_channels, in_channels)
1083
+ context_channels = list(default(context_channels, []))
1084
+ num_layers = len(multipliers) - 1
1085
+ use_context_features = exists(context_features)
1086
+ use_context_channels = len(context_channels) > 0
1087
+ context_mapping_features = None
1088
+
1089
+ attention_kwargs, kwargs = groupby("attention_", kwargs, keep_prefix=True)
1090
+
1091
+ self.num_layers = num_layers
1092
+ self.use_context_time = use_context_time
1093
+ self.use_context_features = use_context_features
1094
+ self.use_context_channels = use_context_channels
1095
+ self.use_stft = use_stft
1096
+ self.use_stft_context = use_stft_context
1097
+
1098
+ self.context_features = context_features
1099
+ context_channels_pad_length = num_layers + 1 - len(context_channels)
1100
+ context_channels = context_channels + [0] * context_channels_pad_length
1101
+ self.context_channels = context_channels
1102
+ self.context_embedding_features = context_embedding_features
1103
+
1104
+ if use_context_channels:
1105
+ has_context = [c > 0 for c in context_channels]
1106
+ self.has_context = has_context
1107
+ self.channels_ids = [sum(has_context[:i]) for i in range(len(has_context))]
1108
+
1109
+ assert (
1110
+ len(factors) == num_layers
1111
+ and len(attentions) >= num_layers
1112
+ and len(num_blocks) == num_layers
1113
+ )
1114
+
1115
+ if use_context_time or use_context_features:
1116
+ context_mapping_features = channels * context_features_multiplier
1117
+
1118
+ self.to_mapping = nn.Sequential(
1119
+ nn.Linear(context_mapping_features, context_mapping_features),
1120
+ nn.GELU(),
1121
+ nn.Linear(context_mapping_features, context_mapping_features),
1122
+ nn.GELU(),
1123
+ )
1124
+
1125
+ if use_context_time:
1126
+ assert exists(context_mapping_features)
1127
+ self.to_time = nn.Sequential(
1128
+ TimePositionalEmbedding(
1129
+ dim=channels, out_features=context_mapping_features
1130
+ ),
1131
+ nn.GELU(),
1132
+ )
1133
+
1134
+ if use_context_features:
1135
+ assert exists(context_features) and exists(context_mapping_features)
1136
+ self.to_features = nn.Sequential(
1137
+ nn.Linear(
1138
+ in_features=context_features, out_features=context_mapping_features
1139
+ ),
1140
+ nn.GELU(),
1141
+ )
1142
+
1143
+ if use_stft:
1144
+ stft_kwargs, kwargs = groupby("stft_", kwargs)
1145
+ assert "num_fft" in stft_kwargs, "stft_num_fft required if use_stft=True"
1146
+ stft_channels = (stft_kwargs["num_fft"] // 2 + 1) * 2
1147
+ in_channels *= stft_channels
1148
+ out_channels *= stft_channels
1149
+ context_channels[0] *= stft_channels if use_stft_context else 1
1150
+ assert exists(in_channels) and exists(out_channels)
1151
+ self.stft = STFT(**stft_kwargs)
1152
+
1153
+ assert not kwargs, f"Unknown arguments: {', '.join(list(kwargs.keys()))}"
1154
+
1155
+ self.to_in = Patcher(
1156
+ in_channels=in_channels + context_channels[0],
1157
+ out_channels=channels * multipliers[0],
1158
+ patch_size=patch_size,
1159
+ context_mapping_features=context_mapping_features,
1160
+ use_snake=use_snake,
1161
+ )
1162
+
1163
+ self.downsamples = nn.ModuleList(
1164
+ [
1165
+ DownsampleBlock1d(
1166
+ in_channels=channels * multipliers[i],
1167
+ out_channels=channels * multipliers[i + 1],
1168
+ context_mapping_features=context_mapping_features,
1169
+ context_channels=context_channels[i + 1],
1170
+ context_embedding_features=context_embedding_features,
1171
+ num_layers=num_blocks[i],
1172
+ factor=factors[i],
1173
+ kernel_multiplier=kernel_multiplier_downsample,
1174
+ num_groups=resnet_groups,
1175
+ use_pre_downsample=True,
1176
+ use_skip=True,
1177
+ use_snake=use_snake,
1178
+ num_transformer_blocks=attentions[i],
1179
+ **attention_kwargs,
1180
+ )
1181
+ for i in range(num_layers)
1182
+ ]
1183
+ )
1184
+
1185
+ self.bottleneck = BottleneckBlock1d(
1186
+ channels=channels * multipliers[-1],
1187
+ context_mapping_features=context_mapping_features,
1188
+ context_embedding_features=context_embedding_features,
1189
+ num_groups=resnet_groups,
1190
+ num_transformer_blocks=attentions[-1],
1191
+ use_snake=use_snake,
1192
+ **attention_kwargs,
1193
+ )
1194
+
1195
+ self.upsamples = nn.ModuleList(
1196
+ [
1197
+ UpsampleBlock1d(
1198
+ in_channels=channels * multipliers[i + 1],
1199
+ out_channels=channels * multipliers[i],
1200
+ context_mapping_features=context_mapping_features,
1201
+ context_embedding_features=context_embedding_features,
1202
+ num_layers=num_blocks[i] + (1 if attentions[i] else 0),
1203
+ factor=factors[i],
1204
+ use_nearest=use_nearest_upsample,
1205
+ num_groups=resnet_groups,
1206
+ use_skip_scale=use_skip_scale,
1207
+ use_pre_upsample=False,
1208
+ use_skip=True,
1209
+ use_snake=use_snake,
1210
+ skip_channels=channels * multipliers[i + 1],
1211
+ num_transformer_blocks=attentions[i],
1212
+ **attention_kwargs,
1213
+ )
1214
+ for i in reversed(range(num_layers))
1215
+ ]
1216
+ )
1217
+
1218
+ self.to_out = Unpatcher(
1219
+ in_channels=channels * multipliers[0],
1220
+ out_channels=out_channels,
1221
+ patch_size=patch_size,
1222
+ context_mapping_features=context_mapping_features,
1223
+ use_snake=use_snake,
1224
+ )
1225
+
1226
+ def get_channels(
1227
+ self, channels_list: Optional[Sequence[Tensor]] = None, layer: int = 0
1228
+ ) -> Optional[Tensor]:
1229
+ """Gets context channels at `layer` and checks that shape is correct"""
1230
+ use_context_channels = self.use_context_channels and self.has_context[layer]
1231
+ if not use_context_channels:
1232
+ return None
1233
+ assert exists(channels_list), "Missing context"
1234
+ # Get channels index (skipping zero channel contexts)
1235
+ channels_id = self.channels_ids[layer]
1236
+ # Get channels
1237
+ channels = channels_list[channels_id]
1238
+ message = f"Missing context for layer {layer} at index {channels_id}"
1239
+ assert exists(channels), message
1240
+ # Check channels
1241
+ num_channels = self.context_channels[layer]
1242
+ message = f"Expected context with {num_channels} channels at idx {channels_id}"
1243
+ assert channels.shape[1] == num_channels, message
1244
+ # STFT channels if requested
1245
+ channels = self.stft.encode1d(channels) if self.use_stft_context else channels # type: ignore # noqa
1246
+ return channels
1247
+
1248
+ def get_mapping(
1249
+ self, time: Optional[Tensor] = None, features: Optional[Tensor] = None
1250
+ ) -> Optional[Tensor]:
1251
+ """Combines context time features and features into mapping"""
1252
+ items, mapping = [], None
1253
+ # Compute time features
1254
+ if self.use_context_time:
1255
+ assert_message = "use_context_time=True but no time features provided"
1256
+ assert exists(time), assert_message
1257
+ items += [self.to_time(time)]
1258
+ # Compute features
1259
+ if self.use_context_features:
1260
+ assert_message = "context_features exists but no features provided"
1261
+ assert exists(features), assert_message
1262
+ items += [self.to_features(features)]
1263
+ # Compute joint mapping
1264
+ if self.use_context_time or self.use_context_features:
1265
+ mapping = reduce(torch.stack(items), "n b m -> b m", "sum")
1266
+ mapping = self.to_mapping(mapping)
1267
+ return mapping
1268
+
1269
+ def forward(
1270
+ self,
1271
+ x: Tensor,
1272
+ time: Optional[Tensor] = None,
1273
+ *,
1274
+ features: Optional[Tensor] = None,
1275
+ channels_list: Optional[Sequence[Tensor]] = None,
1276
+ embedding: Optional[Tensor] = None,
1277
+ embedding_mask: Optional[Tensor] = None,
1278
+ causal: Optional[bool] = False,
1279
+ ) -> Tensor:
1280
+ channels = self.get_channels(channels_list, layer=0)
1281
+ # Apply stft if required
1282
+ x = self.stft.encode1d(x) if self.use_stft else x # type: ignore
1283
+ # Concat context channels at layer 0 if provided
1284
+ x = torch.cat([x, channels], dim=1) if exists(channels) else x
1285
+ # Compute mapping from time and features
1286
+ mapping = self.get_mapping(time, features)
1287
+ x = self.to_in(x, mapping, causal=causal)
1288
+ skips_list = [x]
1289
+
1290
+ for i, downsample in enumerate(self.downsamples):
1291
+ channels = self.get_channels(channels_list, layer=i + 1)
1292
+ x, skips = downsample(
1293
+ x,
1294
+ mapping=mapping,
1295
+ channels=channels,
1296
+ embedding=embedding,
1297
+ embedding_mask=embedding_mask,
1298
+ causal=causal,
1299
+ )
1300
+ skips_list += [skips]
1301
+
1302
+ x = self.bottleneck(
1303
+ x,
1304
+ mapping=mapping,
1305
+ embedding=embedding,
1306
+ embedding_mask=embedding_mask,
1307
+ causal=causal,
1308
+ )
1309
+
1310
+ for i, upsample in enumerate(self.upsamples):
1311
+ skips = skips_list.pop()
1312
+ x = upsample(
1313
+ x,
1314
+ skips=skips,
1315
+ mapping=mapping,
1316
+ embedding=embedding,
1317
+ embedding_mask=embedding_mask,
1318
+ causal=causal,
1319
+ )
1320
+
1321
+ x += skips_list.pop()
1322
+ x = self.to_out(x, mapping, causal=causal)
1323
+ x = self.stft.decode1d(x) if self.use_stft else x
1324
+
1325
+ return x
1326
+
1327
+
1328
+ """ Conditioning Modules """
1329
+
1330
+
1331
+ class FixedEmbedding(nn.Module):
1332
+ def __init__(self, max_length: int, features: int):
1333
+ super().__init__()
1334
+ self.max_length = max_length
1335
+ self.embedding = nn.Embedding(max_length, features)
1336
+
1337
+ def forward(self, x: Tensor) -> Tensor:
1338
+ batch_size, length, device = *x.shape[0:2], x.device
1339
+ assert_message = "Input sequence length must be <= max_length"
1340
+ assert length <= self.max_length, assert_message
1341
+ position = torch.arange(length, device=device)
1342
+ fixed_embedding = self.embedding(position)
1343
+ fixed_embedding = repeat(fixed_embedding, "n d -> b n d", b=batch_size)
1344
+ return fixed_embedding
1345
+
1346
+
1347
+ def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
1348
+ if proba == 1:
1349
+ return torch.ones(shape, device=device, dtype=torch.bool)
1350
+ elif proba == 0:
1351
+ return torch.zeros(shape, device=device, dtype=torch.bool)
1352
+ else:
1353
+ return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
1354
+
1355
+
1356
+ class UNetCFG1d(UNet1d):
1357
+ """UNet1d with Classifier-Free Guidance"""
1358
+
1359
+ def __init__(
1360
+ self,
1361
+ context_embedding_max_length: int,
1362
+ context_embedding_features: int,
1363
+ use_xattn_time: bool = False,
1364
+ **kwargs,
1365
+ ):
1366
+ super().__init__(
1367
+ context_embedding_features=context_embedding_features, **kwargs
1368
+ )
1369
+
1370
+ self.use_xattn_time = use_xattn_time
1371
+
1372
+ if use_xattn_time:
1373
+ assert exists(context_embedding_features)
1374
+ self.to_time_embedding = nn.Sequential(
1375
+ TimePositionalEmbedding(
1376
+ dim=kwargs["channels"], out_features=context_embedding_features
1377
+ ),
1378
+ nn.GELU(),
1379
+ )
1380
+
1381
+ context_embedding_max_length += 1 # Add one for time embedding
1382
+
1383
+ self.fixed_embedding = FixedEmbedding(
1384
+ max_length=context_embedding_max_length, features=context_embedding_features
1385
+ )
1386
+
1387
+ def forward( # type: ignore
1388
+ self,
1389
+ x: Tensor,
1390
+ time: Tensor,
1391
+ *,
1392
+ embedding: Tensor,
1393
+ embedding_mask: Optional[Tensor] = None,
1394
+ embedding_scale: float = 1.0,
1395
+ embedding_mask_proba: float = 0.0,
1396
+ batch_cfg: bool = False,
1397
+ rescale_cfg: bool = False,
1398
+ scale_phi: float = 0.4,
1399
+ negative_embedding: Optional[Tensor] = None,
1400
+ negative_embedding_mask: Optional[Tensor] = None,
1401
+ **kwargs,
1402
+ ) -> Tensor:
1403
+ b, device = embedding.shape[0], embedding.device
1404
+
1405
+ if self.use_xattn_time:
1406
+ embedding = torch.cat(
1407
+ [embedding, self.to_time_embedding(time).unsqueeze(1)], dim=1
1408
+ )
1409
+
1410
+ if embedding_mask is not None:
1411
+ embedding_mask = torch.cat(
1412
+ [embedding_mask, torch.ones((b, 1), device=device)], dim=1
1413
+ )
1414
+
1415
+ fixed_embedding = self.fixed_embedding(embedding)
1416
+
1417
+ if embedding_mask_proba > 0.0:
1418
+ # Randomly mask embedding
1419
+ batch_mask = rand_bool(
1420
+ shape=(b, 1, 1), proba=embedding_mask_proba, device=device
1421
+ )
1422
+ embedding = torch.where(batch_mask, fixed_embedding, embedding)
1423
+
1424
+ if embedding_scale != 1.0:
1425
+ if batch_cfg:
1426
+ batch_x = torch.cat([x, x], dim=0)
1427
+ batch_time = torch.cat([time, time], dim=0)
1428
+
1429
+ if negative_embedding is not None:
1430
+ if negative_embedding_mask is not None:
1431
+ negative_embedding_mask = negative_embedding_mask.to(
1432
+ torch.bool
1433
+ ).unsqueeze(2)
1434
+
1435
+ negative_embedding = torch.where(
1436
+ negative_embedding_mask, negative_embedding, fixed_embedding
1437
+ )
1438
+
1439
+ batch_embed = torch.cat([embedding, negative_embedding], dim=0)
1440
+
1441
+ else:
1442
+ batch_embed = torch.cat([embedding, fixed_embedding], dim=0)
1443
+
1444
+ batch_mask = None
1445
+ if embedding_mask is not None:
1446
+ batch_mask = torch.cat([embedding_mask, embedding_mask], dim=0)
1447
+
1448
+ batch_features = None
1449
+ features = kwargs.pop("features", None)
1450
+ if self.use_context_features:
1451
+ batch_features = torch.cat([features, features], dim=0)
1452
+
1453
+ batch_channels = None
1454
+ channels_list = kwargs.pop("channels_list", None)
1455
+ if self.use_context_channels:
1456
+ batch_channels = []
1457
+ for channels in channels_list:
1458
+ batch_channels += [torch.cat([channels, channels], dim=0)]
1459
+
1460
+ # Compute both normal and fixed embedding outputs
1461
+ batch_out = super().forward(
1462
+ batch_x,
1463
+ batch_time,
1464
+ embedding=batch_embed,
1465
+ embedding_mask=batch_mask,
1466
+ features=batch_features,
1467
+ channels_list=batch_channels,
1468
+ **kwargs,
1469
+ )
1470
+ out, out_masked = batch_out.chunk(2, dim=0)
1471
+
1472
+ else:
1473
+ # Compute both normal and fixed embedding outputs
1474
+ out = super().forward(
1475
+ x,
1476
+ time,
1477
+ embedding=embedding,
1478
+ embedding_mask=embedding_mask,
1479
+ **kwargs,
1480
+ )
1481
+ out_masked = super().forward(
1482
+ x,
1483
+ time,
1484
+ embedding=fixed_embedding,
1485
+ embedding_mask=embedding_mask,
1486
+ **kwargs,
1487
+ )
1488
+
1489
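+ # Classifier-free guidance: start from the unconditional (masked-embedding) output and move toward the conditional output by embedding_scale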
+ out_cfg = out_masked + (out - out_masked) * embedding_scale
1490
+
1491
+ if rescale_cfg:
1492
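+ # CFG rescaling: match the guided output's std over channels to the conditional output's, then blend with the raw guided output using scale_phi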
+ out_std = out.std(dim=1, keepdim=True)
1493
+ out_cfg_std = out_cfg.std(dim=1, keepdim=True)
1494
+
1495
+ return (
1496
+ scale_phi * (out_cfg * (out_std / out_cfg_std))
1497
+ + (1 - scale_phi) * out_cfg
1498
+ )
1499
+
1500
+ else:
1501
+ return out_cfg
1502
+
1503
+ else:
1504
+ return super().forward(
1505
+ x, time, embedding=embedding, embedding_mask=embedding_mask, **kwargs
1506
+ )
1507
+
1508
+
1509
+ class UNetNCCA1d(UNet1d):
1510
+ """UNet1d with Noise Channel Conditioning Augmentation"""
1511
+
1512
+ def __init__(self, context_features: int, **kwargs):
1513
+ super().__init__(context_features=context_features, **kwargs)
1514
+ self.embedder = NumberEmbedder(features=context_features)
1515
+
1516
+ def expand(self, x: Any, shape: Tuple[int, ...]) -> Tensor:
1517
+ x = x if torch.is_tensor(x) else torch.tensor(x)
1518
+ return x.expand(shape)
1519
+
1520
+ def forward( # type: ignore
1521
+ self,
1522
+ x: Tensor,
1523
+ time: Tensor,
1524
+ *,
1525
+ channels_list: Sequence[Tensor],
1526
+ channels_augmentation: Union[
1527
+ bool, Sequence[bool], Sequence[Sequence[bool]], Tensor
1528
+ ] = False,
1529
+ channels_scale: Union[
1530
+ float, Sequence[float], Sequence[Sequence[float]], Tensor
1531
+ ] = 0,
1532
+ **kwargs,
1533
+ ) -> Tensor:
1534
+ b, n = x.shape[0], len(channels_list)
1535
+ channels_augmentation = self.expand(channels_augmentation, shape=(b, n)).to(x)
1536
+ channels_scale = self.expand(channels_scale, shape=(b, n)).to(x)
1537
+
1538
+ # Augmentation (for each channel list item)
1539
+ for i in range(n):
1540
+ scale = channels_scale[:, i] * channels_augmentation[:, i]
1541
+ scale = rearrange(scale, "b -> b 1 1")
1542
+ item = channels_list[i]
1543
+ channels_list[i] = torch.randn_like(item) * scale + item * (1 - scale) # type: ignore # noqa
1544
+
1545
+ # Scale embedding (sum reduction if more than one channel list item)
1546
+ channels_scale_emb = self.embedder(channels_scale)
1547
+ channels_scale_emb = reduce(channels_scale_emb, "b n d -> b d", "sum")
1548
+
1549
+ return super().forward(
1550
+ x=x,
1551
+ time=time,
1552
+ channels_list=channels_list,
1553
+ features=channels_scale_emb,
1554
+ **kwargs,
1555
+ )
1556
+
1557
+
1558
+ class UNetAll1d(UNetCFG1d, UNetNCCA1d):
1559
+ def __init__(self, *args, **kwargs):
1560
+ super().__init__(*args, **kwargs)
1561
+
1562
+ def forward(self, *args, **kwargs): # type: ignore
1563
+ return UNetCFG1d.forward(self, *args, **kwargs)
1564
+
1565
+
1566
+ def XUNet1d(type: str = "base", **kwargs) -> UNet1d:
1567
+ if type == "base":
1568
+ return UNet1d(**kwargs)
1569
+ elif type == "all":
1570
+ return UNetAll1d(**kwargs)
1571
+ elif type == "cfg":
1572
+ return UNetCFG1d(**kwargs)
1573
+ elif type == "ncca":
1574
+ return UNetNCCA1d(**kwargs)
1575
+ else:
1576
+ raise ValueError(f"Unknown XUNet1d type: {type}")
1577
+
1578
+
1579
+ class NumberEmbedder(nn.Module):
1580
+ def __init__(
1581
+ self,
1582
+ features: int,
1583
+ dim: int = 256,
1584
+ ):
1585
+ super().__init__()
1586
+ self.features = features
1587
+ self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
1588
+
1589
+ def forward(self, x: Union[List[float], Tensor]) -> Tensor:
1590
+ if not torch.is_tensor(x):
1591
+ device = next(self.embedding.parameters()).device
1592
+ x = torch.tensor(x, device=device)
1593
+ assert isinstance(x, Tensor)
1594
+ shape = x.shape
1595
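+ # Flatten to a 1-D batch of scalars, embed each scalar, then restore the original shape with a trailing feature dim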
+ x = rearrange(x, "... -> (...)")
1596
+ embedding = self.embedding(x)
1597
+ x = embedding.view(*shape, self.features)
1598
+ return x # type: ignore
1599
+
1600
+
1601
+ """
1602
+ Audio Transforms
1603
+ """
1604
+
1605
+
1606
+ class STFT(nn.Module):
1607
+ """Helper for torch stft and istft"""
1608
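+ # Illustrative usage (not in the original source): given wave of shape (batch, channels, time),
+ # spec_a, spec_b = stft.encode(wave); wave_rec = stft.decode(spec_a, spec_b)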
+
1609
+ def __init__(
1610
+ self,
1611
+ num_fft: int = 1023,
1612
+ hop_length: int = 256,
1613
+ window_length: Optional[int] = None,
1614
+ length: Optional[int] = None,
1615
+ use_complex: bool = False,
1616
+ ):
1617
+ super().__init__()
1618
+ self.num_fft = num_fft
1619
+ self.hop_length = default(hop_length, floor(num_fft // 4))
1620
+ self.window_length = default(window_length, num_fft)
1621
+ self.length = length
1622
+ self.register_buffer("window", torch.hann_window(self.window_length))
1623
+ self.use_complex = use_complex
1624
+
1625
+ def encode(self, wave: Tensor) -> Tuple[Tensor, Tensor]:
1626
+ b = wave.shape[0]
1627
+ wave = rearrange(wave, "b c t -> (b c) t")
1628
+
1629
+ stft = torch.stft(
1630
+ wave,
1631
+ n_fft=self.num_fft,
1632
+ hop_length=self.hop_length,
1633
+ win_length=self.window_length,
1634
+ window=self.window, # type: ignore
1635
+ return_complex=True,
1636
+ normalized=True,
1637
+ )
1638
+
1639
+ if self.use_complex:
1640
+ # Returns real and imaginary
1641
+ stft_a, stft_b = stft.real, stft.imag
1642
+ else:
1643
+ # Returns magnitude and phase matrices
1644
+ magnitude, phase = torch.abs(stft), torch.angle(stft)
1645
+ stft_a, stft_b = magnitude, phase
1646
+
1647
+ return rearrange_many((stft_a, stft_b), "(b c) f l -> b c f l", b=b)
1648
+
1649
+ def decode(self, stft_a: Tensor, stft_b: Tensor) -> Tensor:
1650
+ b, l = stft_a.shape[0], stft_a.shape[-1] # noqa
1651
+ length = closest_power_2(l * self.hop_length)
1652
+
1653
+ stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> (b c) f l")
1654
+
1655
+ if self.use_complex:
1656
+ real, imag = stft_a, stft_b
1657
+ else:
1658
+ magnitude, phase = stft_a, stft_b
1659
+ real, imag = magnitude * torch.cos(phase), magnitude * torch.sin(phase)
1660
+
1661
+ stft = torch.stack([real, imag], dim=-1)
1662
+
1663
+ wave = torch.istft(
1664
+ stft,
1665
+ n_fft=self.num_fft,
1666
+ hop_length=self.hop_length,
1667
+ win_length=self.window_length,
1668
+ window=self.window, # type: ignore
1669
+ length=default(self.length, length),
1670
+ normalized=True,
1671
+ )
1672
+
1673
+ return rearrange(wave, "(b c) t -> b c t", b=b)
1674
+
1675
+ def encode1d(
1676
+ self, wave: Tensor, stacked: bool = True
1677
+ ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
1678
+ stft_a, stft_b = self.encode(wave)
1679
+ stft_a, stft_b = rearrange_many((stft_a, stft_b), "b c f l -> b (c f) l")
1680
+ return torch.cat((stft_a, stft_b), dim=1) if stacked else (stft_a, stft_b)
1681
+
1682
+ def decode1d(self, stft_pair: Tensor) -> Tensor:
1683
+ f = self.num_fft // 2 + 1
1684
+ stft_a, stft_b = stft_pair.chunk(chunks=2, dim=1)
1685
+ stft_a, stft_b = rearrange_many((stft_a, stft_b), "b (c f) l -> b c f l", f=f)
1686
+ return self.decode(stft_a, stft_b)
src/YingMusicSinger/utils/stable_audio_tools/autoencoders.py ADDED
@@ -0,0 +1,975 @@
1
+ import math
2
+ from typing import Any, Dict, Literal
3
+
4
+ import numpy as np
5
+ import torch
6
+ from alias_free_torch import Activation1d
7
+ from dac.nn.layers import WNConv1d, WNConvTranspose1d
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from torchaudio import transforms as T
11
+
12
+ # from ..inference.sampling import sample
13
+ # from ..inference.utils import prepare_audio
14
+ from .blocks import SnakeBeta
15
+ from .bottleneck import Bottleneck, DiscreteBottleneck
16
+ from .diffusion import (
17
+ ConditionedDiffusionModel,
18
+ DAU1DCondWrapper,
19
+ DiTWrapper,
20
+ UNet1DCondWrapper,
21
+ )
22
+ from .factory import create_bottleneck_from_config, create_pretransform_from_config
23
+ from .pretransforms import Pretransform
24
+
25
+
26
+ def checkpoint(function, *args, **kwargs):
27
+ kwargs.setdefault("use_reentrant", False)
28
+ return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)
29
+
30
+
31
+ def get_activation(
32
+ activation: Literal["elu", "snake", "none"], antialias=False, channels=None
33
+ ) -> nn.Module:
34
+ if activation == "elu":
35
+ act = nn.ELU()
36
+ elif activation == "snake":
37
+ act = SnakeBeta(channels)
38
+ elif activation == "none":
39
+ act = nn.Identity()
40
+ else:
41
+ raise ValueError(f"Unknown activation {activation}")
42
+
43
+ if antialias:
44
+ act = Activation1d(act)
45
+
46
+ return act
47
+
48
+
49
+ class ResidualUnit(nn.Module):
50
+ def __init__(
51
+ self,
52
+ in_channels,
53
+ out_channels,
54
+ dilation,
55
+ use_snake=False,
56
+ antialias_activation=False,
57
+ ):
58
+ super().__init__()
59
+
60
+ self.dilation = dilation
61
+
62
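+ # 'same' padding for the kernel_size-7 dilated convolution below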
+ padding = (dilation * (7 - 1)) // 2
63
+
64
+ self.layers = nn.Sequential(
65
+ get_activation(
66
+ "snake" if use_snake else "elu",
67
+ antialias=antialias_activation,
68
+ channels=out_channels,
69
+ ),
70
+ WNConv1d(
71
+ in_channels=in_channels,
72
+ out_channels=out_channels,
73
+ kernel_size=7,
74
+ dilation=dilation,
75
+ padding=padding,
76
+ ),
77
+ get_activation(
78
+ "snake" if use_snake else "elu",
79
+ antialias=antialias_activation,
80
+ channels=out_channels,
81
+ ),
82
+ WNConv1d(
83
+ in_channels=out_channels, out_channels=out_channels, kernel_size=1
84
+ ),
85
+ )
86
+
87
+ def forward(self, x):
88
+ res = x
89
+
90
+ # x = checkpoint(self.layers, x)
91
+ x = self.layers(x)
92
+
93
+ return x + res
94
+
95
+
96
+ class EncoderBlock(nn.Module):
97
+ def __init__(
98
+ self,
99
+ in_channels,
100
+ out_channels,
101
+ stride,
102
+ use_snake=False,
103
+ antialias_activation=False,
104
+ ):
105
+ super().__init__()
106
+
107
+ self.layers = nn.Sequential(
108
+ ResidualUnit(
109
+ in_channels=in_channels,
110
+ out_channels=in_channels,
111
+ dilation=1,
112
+ use_snake=use_snake,
113
+ ),
114
+ ResidualUnit(
115
+ in_channels=in_channels,
116
+ out_channels=in_channels,
117
+ dilation=3,
118
+ use_snake=use_snake,
119
+ ),
120
+ ResidualUnit(
121
+ in_channels=in_channels,
122
+ out_channels=in_channels,
123
+ dilation=9,
124
+ use_snake=use_snake,
125
+ ),
126
+ get_activation(
127
+ "snake" if use_snake else "elu",
128
+ antialias=antialias_activation,
129
+ channels=in_channels,
130
+ ),
131
+ WNConv1d(
132
+ in_channels=in_channels,
133
+ out_channels=out_channels,
134
+ kernel_size=2 * stride,
135
+ stride=stride,
136
+ padding=math.ceil(stride / 2),
137
+ ),
138
+ )
139
+
140
+ def forward(self, x):
141
+ return self.layers(x)
142
+
143
+
144
+ class DecoderBlock(nn.Module):
145
+ def __init__(
146
+ self,
147
+ in_channels,
148
+ out_channels,
149
+ stride,
150
+ use_snake=False,
151
+ antialias_activation=False,
152
+ use_nearest_upsample=False,
153
+ ):
154
+ super().__init__()
155
+
156
+ if use_nearest_upsample:
157
+ upsample_layer = nn.Sequential(
158
+ nn.Upsample(scale_factor=stride, mode="nearest"),
159
+ WNConv1d(
160
+ in_channels=in_channels,
161
+ out_channels=out_channels,
162
+ kernel_size=2 * stride,
163
+ stride=1,
164
+ bias=False,
165
+ padding="same",
166
+ ),
167
+ )
168
+ else:
169
+ upsample_layer = WNConvTranspose1d(
170
+ in_channels=in_channels,
171
+ out_channels=out_channels,
172
+ kernel_size=2 * stride,
173
+ stride=stride,
174
+ padding=math.ceil(stride / 2),
175
+ )
176
+
177
+ self.layers = nn.Sequential(
178
+ get_activation(
179
+ "snake" if use_snake else "elu",
180
+ antialias=antialias_activation,
181
+ channels=in_channels,
182
+ ),
183
+ upsample_layer,
184
+ ResidualUnit(
185
+ in_channels=out_channels,
186
+ out_channels=out_channels,
187
+ dilation=1,
188
+ use_snake=use_snake,
189
+ ),
190
+ ResidualUnit(
191
+ in_channels=out_channels,
192
+ out_channels=out_channels,
193
+ dilation=3,
194
+ use_snake=use_snake,
195
+ ),
196
+ ResidualUnit(
197
+ in_channels=out_channels,
198
+ out_channels=out_channels,
199
+ dilation=9,
200
+ use_snake=use_snake,
201
+ ),
202
+ )
203
+
204
+ def forward(self, x):
205
+ return self.layers(x)
206
+
207
+
208
+ class OobleckEncoder(nn.Module):
209
+ def __init__(
210
+ self,
211
+ in_channels=2,
212
+ channels=128,
213
+ latent_dim=32,
214
+ c_mults=[1, 2, 4, 8],
215
+ strides=[2, 4, 8, 8],
216
+ use_snake=False,
217
+ antialias_activation=False,
218
+ ):
219
+ super().__init__()
220
+
221
+ c_mults = [1] + c_mults
222
+
223
+ self.depth = len(c_mults)
224
+
225
+ layers = [
226
+ WNConv1d(
227
+ in_channels=in_channels,
228
+ out_channels=c_mults[0] * channels,
229
+ kernel_size=7,
230
+ padding=3,
231
+ )
232
+ ]
233
+
234
+ for i in range(self.depth - 1):
235
+ layers += [
236
+ EncoderBlock(
237
+ in_channels=c_mults[i] * channels,
238
+ out_channels=c_mults[i + 1] * channels,
239
+ stride=strides[i],
240
+ use_snake=use_snake,
241
+ )
242
+ ]
243
+
244
+ layers += [
245
+ get_activation(
246
+ "snake" if use_snake else "elu",
247
+ antialias=antialias_activation,
248
+ channels=c_mults[-1] * channels,
249
+ ),
250
+ WNConv1d(
251
+ in_channels=c_mults[-1] * channels,
252
+ out_channels=latent_dim,
253
+ kernel_size=3,
254
+ padding=1,
255
+ ),
256
+ ]
257
+
258
+ self.layers = nn.Sequential(*layers)
259
+
260
+ def forward(self, x):
261
+ return self.layers(x)
262
+
263
+
264
+ class OobleckDecoder(nn.Module):
265
+ def __init__(
266
+ self,
267
+ out_channels=2,
268
+ channels=128,
269
+ latent_dim=32,
270
+ c_mults=[1, 2, 4, 8],
271
+ strides=[2, 4, 8, 8],
272
+ use_snake=False,
273
+ antialias_activation=False,
274
+ use_nearest_upsample=False,
275
+ final_tanh=True,
276
+ ):
277
+ super().__init__()
278
+
279
+ c_mults = [1] + c_mults
280
+
281
+ self.depth = len(c_mults)
282
+
283
+ layers = [
284
+ WNConv1d(
285
+ in_channels=latent_dim,
286
+ out_channels=c_mults[-1] * channels,
287
+ kernel_size=7,
288
+ padding=3,
289
+ ),
290
+ ]
291
+
292
+ for i in range(self.depth - 1, 0, -1):
293
+ layers += [
294
+ DecoderBlock(
295
+ in_channels=c_mults[i] * channels,
296
+ out_channels=c_mults[i - 1] * channels,
297
+ stride=strides[i - 1],
298
+ use_snake=use_snake,
299
+ antialias_activation=antialias_activation,
300
+ use_nearest_upsample=use_nearest_upsample,
301
+ )
302
+ ]
303
+
304
+ layers += [
305
+ get_activation(
306
+ "snake" if use_snake else "elu",
307
+ antialias=antialias_activation,
308
+ channels=c_mults[0] * channels,
309
+ ),
310
+ WNConv1d(
311
+ in_channels=c_mults[0] * channels,
312
+ out_channels=out_channels,
313
+ kernel_size=7,
314
+ padding=3,
315
+ bias=False,
316
+ ),
317
+ nn.Tanh() if final_tanh else nn.Identity(),
318
+ ]
319
+
320
+ self.layers = nn.Sequential(*layers)
321
+
322
+ def forward(self, x):
323
+ return self.layers(x)
324
+
325
+
326
+ class DACEncoderWrapper(nn.Module):
327
+ def __init__(self, in_channels=1, **kwargs):
328
+ super().__init__()
329
+
330
+ from dac.model.dac import Encoder as DACEncoder
331
+
332
+ latent_dim = kwargs.pop("latent_dim", None)
333
+
334
+ encoder_out_dim = kwargs["d_model"] * (2 ** len(kwargs["strides"]))
335
+ self.encoder = DACEncoder(d_latent=encoder_out_dim, **kwargs)
336
+ self.latent_dim = latent_dim
337
+
338
+ # Latent-dim support was added to DAC after this was first written, and implemented differently, so this is for backwards compatibility
339
+ self.proj_out = (
340
+ nn.Conv1d(self.encoder.enc_dim, latent_dim, kernel_size=1)
341
+ if latent_dim is not None
342
+ else nn.Identity()
343
+ )
344
+
345
+ if in_channels != 1:
346
+ self.encoder.block[0] = WNConv1d(
347
+ in_channels, kwargs.get("d_model", 64), kernel_size=7, padding=3
348
+ )
349
+
350
+ def forward(self, x):
351
+ x = self.encoder(x)
352
+ x = self.proj_out(x)
353
+ return x
354
+
355
+
356
+ class DACDecoderWrapper(nn.Module):
357
+ def __init__(self, latent_dim, out_channels=1, **kwargs):
358
+ super().__init__()
359
+
360
+ from dac.model.dac import Decoder as DACDecoder
361
+
362
+ self.decoder = DACDecoder(
363
+ **kwargs, input_channel=latent_dim, d_out=out_channels
364
+ )
365
+
366
+ self.latent_dim = latent_dim
367
+
368
+ def forward(self, x):
369
+ return self.decoder(x)
370
+
371
+
372
+ class AudioAutoencoder(nn.Module):
373
+ def __init__(
374
+ self,
375
+ encoder,
376
+ decoder,
377
+ latent_dim,
378
+ downsampling_ratio,
379
+ sample_rate,
380
+ io_channels=2,
381
+ bottleneck: Bottleneck = None,
382
+ pretransform: Pretransform = None,
383
+ in_channels=None,
384
+ out_channels=None,
385
+ soft_clip=False,
386
+ ):
387
+ super().__init__()
388
+
389
+ self.downsampling_ratio = downsampling_ratio
390
+ self.sample_rate = sample_rate
391
+
392
+ self.latent_dim = latent_dim
393
+ self.io_channels = io_channels
394
+ self.in_channels = io_channels
395
+ self.out_channels = io_channels
396
+
397
+ self.min_length = self.downsampling_ratio
398
+
399
+ if in_channels is not None:
400
+ self.in_channels = in_channels
401
+
402
+ if out_channels is not None:
403
+ self.out_channels = out_channels
404
+
405
+ self.bottleneck = bottleneck
406
+
407
+ self.encoder = encoder
408
+
409
+ self.decoder = decoder
410
+
411
+ self.pretransform = pretransform
412
+
413
+ self.soft_clip = soft_clip
414
+
415
+ self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete
416
+
417
+ def encode(
418
+ self,
419
+ audio,
420
+ return_info=False,
421
+ skip_pretransform=False,
422
+ iterate_batch=False,
423
+ **kwargs,
424
+ ):
425
+ info = {}
426
+
427
+ if self.pretransform is not None and not skip_pretransform:
428
+ if self.pretransform.enable_grad:
429
+ if iterate_batch:
430
+ audios = []
431
+ for i in range(audio.shape[0]):
432
+ audios.append(self.pretransform.encode(audio[i : i + 1]))
433
+ audio = torch.cat(audios, dim=0)
434
+ else:
435
+ audio = self.pretransform.encode(audio)
436
+ else:
437
+ with torch.no_grad():
438
+ if iterate_batch:
439
+ audios = []
440
+ for i in range(audio.shape[0]):
441
+ audios.append(self.pretransform.encode(audio[i : i + 1]))
442
+ audio = torch.cat(audios, dim=0)
443
+ else:
444
+ audio = self.pretransform.encode(audio)
445
+
446
+ if self.encoder is not None:
447
+ if iterate_batch:
448
+ latents = []
449
+ for i in range(audio.shape[0]):
450
+ latents.append(self.encoder(audio[i : i + 1]))
451
+ latents = torch.cat(latents, dim=0)
452
+ else:
453
+ latents = self.encoder(audio)
454
+ else:
455
+ latents = audio
456
+
457
+ if self.bottleneck is not None:
458
+ # TODO: Add iterate batch logic, needs to merge the info dicts
459
+ latents, bottleneck_info = self.bottleneck.encode(
460
+ latents, return_info=True, **kwargs
461
+ )
462
+
463
+ info.update(bottleneck_info)
464
+
465
+ if return_info:
466
+ return latents, info
467
+
468
+ return latents
469
+
470
+ def decode(self, latents, iterate_batch=False, **kwargs):
471
+ if self.bottleneck is not None:
472
+ if iterate_batch:
473
+ decoded = []
474
+ for i in range(latents.shape[0]):
475
+ decoded.append(self.bottleneck.decode(latents[i : i + 1]))
476
+ latents = torch.cat(decoded, dim=0)
477
+ else:
478
+ latents = self.bottleneck.decode(latents)
479
+
480
+ if iterate_batch:
481
+ decoded = []
482
+ for i in range(latents.shape[0]):
483
+ decoded.append(self.decoder(latents[i : i + 1]))
484
+ decoded = torch.cat(decoded, dim=0)
485
+ else:
486
+ decoded = self.decoder(latents, **kwargs)
487
+
488
+ if self.pretransform is not None:
489
+ if self.pretransform.enable_grad:
490
+ if iterate_batch:
491
+ decodeds = []
492
+ for i in range(decoded.shape[0]):
493
+ decodeds.append(self.pretransform.decode(decoded[i : i + 1]))
494
+ decoded = torch.cat(decodeds, dim=0)
495
+ else:
496
+ decoded = self.pretransform.decode(decoded)
497
+ else:
498
+ with torch.no_grad():
499
+ if iterate_batch:
500
+ decodeds = []
501
+ for i in range(latents.shape[0]):
502
+ decodeds.append(
503
+ self.pretransform.decode(decoded[i : i + 1])
504
+ )
505
+ decoded = torch.cat(decodeds, dim=0)
506
+ else:
507
+ decoded = self.pretransform.decode(decoded)
508
+
509
+ if self.soft_clip:
510
+ decoded = torch.tanh(decoded)
511
+
512
+ return decoded
513
+
514
+ def decode_tokens(self, tokens, **kwargs):
515
+ """
516
+ Decode discrete tokens to audio
517
+ Only works with discrete autoencoders
518
+ """
519
+
520
+ assert isinstance(self.bottleneck, DiscreteBottleneck), (
521
+ "decode_tokens only works with discrete autoencoders"
522
+ )
523
+
524
+ latents = self.bottleneck.decode_tokens(tokens, **kwargs)
525
+
526
+ return self.decode(latents, **kwargs)
527
+
528
+ def preprocess_audio_for_encoder(self, audio, in_sr):
529
+ """
530
+ Preprocess a single audio tensor (Channels x Length) to be compatible with the encoder.
531
+ If the model is mono, stereo audio will be converted to mono.
532
+ Audio will be silence-padded to be a multiple of the model's downsampling ratio.
533
+ Audio will be resampled to the model's sample rate.
534
+ The output will have batch size 1 and be shape (1 x Channels x Length)
535
+ """
536
+ return self.preprocess_audio_list_for_encoder([audio], [in_sr])
537
+
538
+ def preprocess_audio_list_for_encoder(self, audio_list, in_sr_list):
539
+ """
540
+ Preprocess a [list] of audio (Channels x Length) into a batch tensor to be compatible with the encoder.
541
+ The audio in that list can be of different lengths and channels.
542
+ in_sr can be an integer or a list. If it's an integer, it is assumed to be the input sample_rate for every audio.
543
+ All audio will be resampled to the model's sample rate.
544
+ Audio will be silence-padded to the longest length, and further padded to be a multiple of the model's downsampling ratio.
545
+ If the model is mono, all audio will be converted to mono.
546
+ The output will be a tensor of shape (Batch x Channels x Length)
547
+ """
548
+ batch_size = len(audio_list)
549
+ if isinstance(in_sr_list, int):
550
+ in_sr_list = [in_sr_list] * batch_size
551
+ assert len(in_sr_list) == batch_size, (
552
+ "list of sample rates must be the same length of audio_list"
553
+ )
554
+ new_audio = []
555
+ max_length = 0
556
+ # resample & find the max length
557
+ for i in range(batch_size):
558
+ audio = audio_list[i]
559
+ in_sr = in_sr_list[i]
560
+ if len(audio.shape) == 3 and audio.shape[0] == 1:
561
+ # batchsize 1 was given by accident. Just squeeze it.
562
+ audio = audio.squeeze(0)
563
+ elif len(audio.shape) == 1:
564
+ # Mono signal, channel dimension is missing, unsqueeze it in
565
+ audio = audio.unsqueeze(0)
566
+ assert len(audio.shape) == 2, (
567
+ "Audio should be shape (Channels x Length) with no batch dimension"
568
+ )
569
+ # Resample audio
570
+ if in_sr != self.sample_rate:
571
+ resample_tf = T.Resample(in_sr, self.sample_rate).to(audio.device)
572
+ audio = resample_tf(audio)
573
+ new_audio.append(audio)
574
+ if audio.shape[-1] > max_length:
575
+ max_length = audio.shape[-1]
576
+ # Pad every audio to the same length, multiple of model's downsampling ratio
577
+ padded_audio_length = (
578
+ max_length
579
+ + (self.min_length - (max_length % self.min_length)) % self.min_length
580
+ )
581
+ for i in range(batch_size):
582
+ # Pad it & if necessary, mixdown/duplicate stereo/mono channels to support model
583
+ new_audio[i] = prepare_audio(
584
+ new_audio[i],
585
+ in_sr=in_sr,
586
+ target_sr=in_sr,
587
+ target_length=padded_audio_length,
588
+ target_channels=self.in_channels,
589
+ device=new_audio[i].device,
590
+ ).squeeze(0)
591
+ # convert to tensor
592
+ return torch.stack(new_audio)
593
+
594
+ def encode_audio(self, audio, chunked=False, overlap=32, chunk_size=128, **kwargs):
595
+ """
596
+ Encode audios into latents. Audios should already be preprocessed by preprocess_audio_for_encoder.
597
+ If chunked is True, split the audio into chunks of a given maximum size chunk_size, with given overlap.
598
+ Overlap and chunk_size params are both measured in number of latents (not audio samples)
599
+ and therefore you likely could use the same values with decode_audio.
600
+ An overlap of zero will cause discontinuity artefacts. Overlap should be >= the receptive field size.
601
+ Every autoencoder will have a different receptive field size, and thus ideal overlap.
602
+ You can determine it empirically by diffing unchunked vs chunked output and looking at maximum diff.
603
+ The final chunk may have a longer overlap in order to keep chunk_size consistent for all chunks.
604
+ Smaller chunk_size uses less memory, but more compute.
605
+ The chunk_size vs memory tradeoff isn't linear, and possibly depends on the GPU and CUDA version
606
+ For example, on an A6000 chunk_size 128 is overall faster than 256 and 512 even though it has more chunks
607
+ """
608
+ if not chunked:
609
+ # default behavior. Encode the entire audio in parallel
610
+ return self.encode(audio, **kwargs)
611
+ else:
612
+ # CHUNKED ENCODING
613
+ # samples_per_latent is just the downsampling ratio (which is also the upsampling ratio)
614
+ samples_per_latent = self.downsampling_ratio
615
+ total_size = audio.shape[2] # in samples
616
+ batch_size = audio.shape[0]
617
+ chunk_size *= samples_per_latent # converting metric in latents to samples
618
+ overlap *= samples_per_latent # converting metric in latents to samples
619
+ hop_size = chunk_size - overlap
620
+ chunks = []
621
+ for i in range(0, total_size - chunk_size + 1, hop_size):
622
+ chunk = audio[:, :, i : i + chunk_size]
623
+ chunks.append(chunk)
624
+ if i + chunk_size != total_size:
625
+ # Final chunk
626
+ chunk = audio[:, :, -chunk_size:]
627
+ chunks.append(chunk)
628
+ chunks = torch.stack(chunks)
629
+ num_chunks = chunks.shape[0]
630
+ # Note: y_size might be a different value from the latent length used in diffusion training
631
+ # because we can encode audio of varying lengths
632
+ # However, the audio should've been padded to a multiple of samples_per_latent by now.
633
+ y_size = total_size // samples_per_latent
634
+ # Create an empty latent, we will populate it with chunks as we encode them
635
+ y_final = torch.zeros((batch_size, self.latent_dim, y_size)).to(
636
+ audio.device
637
+ )
638
+ for i in range(num_chunks):
639
+ x_chunk = chunks[i, :]
640
+ # encode the chunk
641
+ y_chunk = self.encode(x_chunk)
642
+ # figure out where to put the audio along the time domain
643
+ if i == num_chunks - 1:
644
+ # final chunk always goes at the end
645
+ t_end = y_size
646
+ t_start = t_end - y_chunk.shape[2]
647
+ else:
648
+ t_start = i * hop_size // samples_per_latent
649
+ t_end = t_start + chunk_size // samples_per_latent
650
+ # remove the edges of the overlaps
651
+ ol = overlap // samples_per_latent // 2
652
+ chunk_start = 0
653
+ chunk_end = y_chunk.shape[2]
654
+ if i > 0:
655
+ # no overlap for the start of the first chunk
656
+ t_start += ol
657
+ chunk_start += ol
658
+ if i < num_chunks - 1:
659
+ # no overlap for the end of the last chunk
660
+ t_end -= ol
661
+ chunk_end -= ol
662
+ # paste the chunked audio into our y_final output audio
663
+ y_final[:, :, t_start:t_end] = y_chunk[:, :, chunk_start:chunk_end]
664
+ return y_final
665
+
666
+ def decode_audio(
667
+ self, latents, chunked=False, overlap=32, chunk_size=128, **kwargs
668
+ ):
669
+ """
670
+ Decode latents to audio.
671
+ If chunked is True, split the latents into chunks of a given maximum size chunk_size, with given overlap, both of which are measured in number of latents.
672
+ An overlap of zero will cause discontinuity artefacts. Overlap should be >= the receptive field size.
673
+ Every autoencoder will have a different receptive field size, and thus ideal overlap.
674
+ You can determine it empirically by diffing unchunked vs chunked audio and looking at maximum diff.
675
+ The final chunk may have a longer overlap in order to keep chunk_size consistent for all chunks.
676
+ Smaller chunk_size uses less memory, but more compute.
677
+ The chunk_size vs memory tradeoff isn't linear, and possibly depends on the GPU and CUDA version
678
+ For example, on an A6000 chunk_size 128 is overall faster than 256 and 512 even though it has more chunks
679
+ """
680
+ if not chunked:
681
+ # default behavior. Decode the entire latent in parallel
682
+ return self.decode(latents, **kwargs)
683
+ else:
684
+ # chunked decoding
685
+ hop_size = chunk_size - overlap
686
+ total_size = latents.shape[2]
687
+ batch_size = latents.shape[0]
688
+ chunks = []
689
+ if total_size < chunk_size:
690
+ # pad the latents to be at least chunk_size
691
+ # Note: if we pad here, the song generated afterwards turns into noise
692
+ pad_size = chunk_size - total_size + 1
693
+ latents = F.pad(latents, (0, pad_size), mode="replicate")
694
+ total_size = latents.shape[2]
695
+ # import pdb; pdb.set_trace()
696
+ for i in range(0, total_size - chunk_size + 1, hop_size):
697
+ chunk = latents[:, :, i : i + chunk_size]
698
+ chunks.append(chunk)
699
+ if i + chunk_size != total_size:
700
+ # Final chunk
701
+ chunk = latents[:, :, -chunk_size:]
702
+ chunks.append(chunk)
703
+ chunks = torch.stack(chunks)
704
+ num_chunks = chunks.shape[0]
705
+ # samples_per_latent is just the downsampling ratio
706
+ samples_per_latent = self.downsampling_ratio
707
+ # Create an empty waveform; we will populate it with chunks as we decode them
708
+ y_size = total_size * samples_per_latent
709
+ y_final = torch.zeros((batch_size, self.out_channels, y_size)).to(
710
+ latents.device
711
+ )
712
+ for i in range(num_chunks):
713
+ x_chunk = chunks[i, :]
714
+ # decode the chunk
715
+ y_chunk = self.decode(x_chunk)
716
+ # figure out where to put the audio along the time domain
717
+ if i == num_chunks - 1:
718
+ # final chunk always goes at the end
719
+ t_end = y_size
720
+ t_start = t_end - y_chunk.shape[2]
721
+ else:
722
+ t_start = i * hop_size * samples_per_latent
723
+ t_end = t_start + chunk_size * samples_per_latent
724
+ # remove the edges of the overlaps
725
+ ol = (overlap // 2) * samples_per_latent
726
+ chunk_start = 0
727
+ chunk_end = y_chunk.shape[2]
728
+ if i > 0:
729
+ # no overlap for the start of the first chunk
730
+ t_start += ol
731
+ chunk_start += ol
732
+ if i < num_chunks - 1:
733
+ # no overlap for the end of the last chunk
734
+ t_end -= ol
735
+ chunk_end -= ol
736
+ # paste the chunked audio into our y_final output audio
737
+ y_final[:, :, t_start:t_end] = y_chunk[:, :, chunk_start:chunk_end]
738
+ return y_final
739
+
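+ # Minimal usage sketch (illustrative only; assumes a valid model config dict and a preprocessed batch):
+ # ae = create_autoencoder_from_config(config)
+ # latents = ae.encode_audio(batch, chunked=True, overlap=32, chunk_size=128)
+ # audio = ae.decode_audio(latents, chunked=True, overlap=32, chunk_size=128)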
740
+
741
+ class DiffusionAutoencoder(AudioAutoencoder):
742
+ def __init__(
743
+ self,
744
+ diffusion: ConditionedDiffusionModel,
745
+ diffusion_downsampling_ratio,
746
+ *args,
747
+ **kwargs,
748
+ ):
749
+ super().__init__(*args, **kwargs)
750
+
751
+ self.diffusion = diffusion
752
+
753
+ self.min_length = self.downsampling_ratio * diffusion_downsampling_ratio
754
+
755
+ if self.encoder is not None:
756
+ # Shrink the initial encoder parameters to avoid saturated latents
757
+ with torch.no_grad():
758
+ for param in self.encoder.parameters():
759
+ param *= 0.5
760
+
761
+ def decode(self, latents, steps=100):
762
+ upsampled_length = latents.shape[2] * self.downsampling_ratio
763
+
764
+ if self.bottleneck is not None:
765
+ latents = self.bottleneck.decode(latents)
766
+
767
+ if self.decoder is not None:
768
+ latents = self.decoder(latents)
769
+
770
+ # Upsample latents to match diffusion length
771
+ if latents.shape[2] != upsampled_length:
772
+ latents = F.interpolate(latents, size=upsampled_length, mode="nearest")
773
+
774
+ noise = torch.randn(
775
+ latents.shape[0], self.io_channels, upsampled_length, device=latents.device
776
+ )
777
+ decoded = sample(self.diffusion, noise, steps, 0, input_concat_cond=latents)
778
+
779
+ if self.pretransform is not None:
780
+ if self.pretransform.enable_grad:
781
+ decoded = self.pretransform.decode(decoded)
782
+ else:
783
+ with torch.no_grad():
784
+ decoded = self.pretransform.decode(decoded)
785
+
786
+ return decoded
787
+
788
+
789
+ # AE factories
790
+
791
+
792
+ def create_encoder_from_config(encoder_config: Dict[str, Any]):
793
+ encoder_type = encoder_config.get("type", None)
794
+ assert encoder_type is not None, "Encoder type must be specified"
795
+
796
+ if encoder_type == "oobleck":
797
+ encoder = OobleckEncoder(**encoder_config["config"])
798
+
799
+ elif encoder_type == "seanet":
800
+ from encodec.modules import SEANetEncoder
801
+
802
+ seanet_encoder_config = encoder_config["config"]
803
+
804
+ # SEANet encoder expects strides in reverse order
805
+ seanet_encoder_config["ratios"] = list(
806
+ reversed(seanet_encoder_config.get("ratios", [2, 2, 2, 2, 2]))
807
+ )
808
+ encoder = SEANetEncoder(**seanet_encoder_config)
809
+ elif encoder_type == "dac":
810
+ dac_config = encoder_config["config"]
811
+
812
+ encoder = DACEncoderWrapper(**dac_config)
813
+ elif encoder_type == "local_attn":
814
+ from .local_attention import TransformerEncoder1D
815
+
816
+ local_attn_config = encoder_config["config"]
817
+
818
+ encoder = TransformerEncoder1D(**local_attn_config)
819
+ else:
820
+ raise ValueError(f"Unknown encoder type {encoder_type}")
821
+
822
+ requires_grad = encoder_config.get("requires_grad", True)
823
+ if not requires_grad:
824
+ for param in encoder.parameters():
825
+ param.requires_grad = False
826
+
827
+ return encoder
828
+
829
+
830
+ def create_decoder_from_config(decoder_config: Dict[str, Any]):
831
+ decoder_type = decoder_config.get("type", None)
832
+ assert decoder_type is not None, "Decoder type must be specified"
833
+
834
+ if decoder_type == "oobleck":
835
+ decoder = OobleckDecoder(**decoder_config["config"])
836
+ elif decoder_type == "seanet":
837
+ from encodec.modules import SEANetDecoder
838
+
839
+ decoder = SEANetDecoder(**decoder_config["config"])
840
+ elif decoder_type == "dac":
841
+ dac_config = decoder_config["config"]
842
+
843
+ decoder = DACDecoderWrapper(**dac_config)
844
+ elif decoder_type == "local_attn":
845
+ from .local_attention import TransformerDecoder1D
846
+
847
+ local_attn_config = decoder_config["config"]
848
+
849
+ decoder = TransformerDecoder1D(**local_attn_config)
850
+ else:
851
+ raise ValueError(f"Unknown decoder type {decoder_type}")
852
+
853
+ requires_grad = decoder_config.get("requires_grad", True)
854
+ if not requires_grad:
855
+ for param in decoder.parameters():
856
+ param.requires_grad = False
857
+
858
+ return decoder
859
+
860
+
861
+ def create_autoencoder_from_config(config: Dict[str, Any]):
862
+ ae_config = config["model"]
863
+
864
+ encoder = create_encoder_from_config(ae_config["encoder"])
865
+ decoder = create_decoder_from_config(ae_config["decoder"])
866
+
867
+ bottleneck = ae_config.get("bottleneck", None)
868
+
869
+ latent_dim = ae_config.get("latent_dim", None)
870
+ assert latent_dim is not None, "latent_dim must be specified in model config"
871
+ downsampling_ratio = ae_config.get("downsampling_ratio", None)
872
+ assert downsampling_ratio is not None, (
873
+ "downsampling_ratio must be specified in model config"
874
+ )
875
+ io_channels = ae_config.get("io_channels", None)
876
+ assert io_channels is not None, "io_channels must be specified in model config"
877
+ sample_rate = config.get("sample_rate", None)
878
+ assert sample_rate is not None, "sample_rate must be specified in model config"
879
+
880
+ in_channels = ae_config.get("in_channels", None)
881
+ out_channels = ae_config.get("out_channels", None)
882
+
883
+ pretransform = ae_config.get("pretransform", None)
884
+
885
+ if pretransform is not None:
886
+ pretransform = create_pretransform_from_config(pretransform, sample_rate)
887
+
888
+ if bottleneck is not None:
889
+ bottleneck = create_bottleneck_from_config(bottleneck)
890
+
891
+ soft_clip = ae_config["decoder"].get("soft_clip", False)
892
+
893
+ return AudioAutoencoder(
894
+ encoder,
895
+ decoder,
896
+ io_channels=io_channels,
897
+ latent_dim=latent_dim,
898
+ downsampling_ratio=downsampling_ratio,
899
+ sample_rate=sample_rate,
900
+ bottleneck=bottleneck,
901
+ pretransform=pretransform,
902
+ in_channels=in_channels,
903
+ out_channels=out_channels,
904
+ soft_clip=soft_clip,
905
+ )
906
+
907
+
908
+ def create_diffAE_from_config(config: Dict[str, Any]):
909
+ diffae_config = config["model"]
910
+
911
+ if "encoder" in diffae_config:
912
+ encoder = create_encoder_from_config(diffae_config["encoder"])
913
+ else:
914
+ encoder = None
915
+
916
+ if "decoder" in diffae_config:
917
+ decoder = create_decoder_from_config(diffae_config["decoder"])
918
+ else:
919
+ decoder = None
920
+
921
+ diffusion_model_type = diffae_config["diffusion"]["type"]
922
+
923
+ if diffusion_model_type == "DAU1d":
924
+ diffusion = DAU1DCondWrapper(**diffae_config["diffusion"]["config"])
925
+ elif diffusion_model_type == "adp_1d":
926
+ diffusion = UNet1DCondWrapper(**diffae_config["diffusion"]["config"])
927
+ elif diffusion_model_type == "dit":
928
+ diffusion = DiTWrapper(**diffae_config["diffusion"]["config"])
929
+
930
+ latent_dim = diffae_config.get("latent_dim", None)
931
+ assert latent_dim is not None, "latent_dim must be specified in model config"
932
+ downsampling_ratio = diffae_config.get("downsampling_ratio", None)
933
+ assert downsampling_ratio is not None, (
934
+ "downsampling_ratio must be specified in model config"
935
+ )
936
+ io_channels = diffae_config.get("io_channels", None)
937
+ assert io_channels is not None, "io_channels must be specified in model config"
938
+ sample_rate = config.get("sample_rate", None)
939
+ assert sample_rate is not None, "sample_rate must be specified in model config"
940
+
941
+ bottleneck = diffae_config.get("bottleneck", None)
942
+
943
+ pretransform = diffae_config.get("pretransform", None)
944
+
945
+ if pretransform is not None:
946
+ pretransform = create_pretransform_from_config(pretransform, sample_rate)
947
+
948
+ if bottleneck is not None:
949
+ bottleneck = create_bottleneck_from_config(bottleneck)
950
+
951
+ diffusion_downsampling_ratio = None
952
+
953
+ if diffusion_model_type == "DAU1d":
954
+ diffusion_downsampling_ratio = np.prod(
955
+ diffae_config["diffusion"]["config"]["strides"]
956
+ )
957
+ elif diffusion_model_type == "adp_1d":
958
+ diffusion_downsampling_ratio = np.prod(
959
+ diffae_config["diffusion"]["config"]["factors"]
960
+ )
961
+ elif diffusion_model_type == "dit":
962
+ diffusion_downsampling_ratio = 1
963
+
964
+ return DiffusionAutoencoder(
965
+ encoder=encoder,
966
+ decoder=decoder,
967
+ diffusion=diffusion,
968
+ io_channels=io_channels,
969
+ sample_rate=sample_rate,
970
+ latent_dim=latent_dim,
971
+ downsampling_ratio=downsampling_ratio,
972
+ diffusion_downsampling_ratio=diffusion_downsampling_ratio,
973
+ bottleneck=bottleneck,
974
+ pretransform=pretransform,
975
+ )
src/YingMusicSinger/utils/stable_audio_tools/blocks.py ADDED
@@ -0,0 +1,398 @@
1
+ import math
2
+ from functools import reduce
3
+
4
+ import numpy as np
5
+ import torch
6
+ from dac.nn.layers import Snake1d
7
+ from packaging import version
8
+ from torch import nn
9
+ from torch.backends.cuda import sdp_kernel
10
+ from torch.nn import functional as F
11
+
12
+
13
+ class ResidualBlock(nn.Module):
14
+ def __init__(self, main, skip=None):
15
+ super().__init__()
16
+ self.main = nn.Sequential(*main)
17
+ self.skip = skip if skip else nn.Identity()
18
+
19
+ def forward(self, input):
20
+ return self.main(input) + self.skip(input)
21
+
22
+
23
+ class ResConvBlock(ResidualBlock):
24
+ def __init__(
25
+ self,
26
+ c_in,
27
+ c_mid,
28
+ c_out,
29
+ is_last=False,
30
+ kernel_size=5,
31
+ conv_bias=True,
32
+ use_snake=False,
33
+ ):
34
+ skip = None if c_in == c_out else nn.Conv1d(c_in, c_out, 1, bias=False)
35
+ super().__init__(
36
+ [
37
+ nn.Conv1d(
38
+ c_in, c_mid, kernel_size, padding=kernel_size // 2, bias=conv_bias
39
+ ),
40
+ nn.GroupNorm(1, c_mid),
41
+ Snake1d(c_mid) if use_snake else nn.GELU(),
42
+ nn.Conv1d(
43
+ c_mid, c_out, kernel_size, padding=kernel_size // 2, bias=conv_bias
44
+ ),
45
+ nn.GroupNorm(1, c_out) if not is_last else nn.Identity(),
46
+ (Snake1d(c_out) if use_snake else nn.GELU())
47
+ if not is_last
48
+ else nn.Identity(),
49
+ ],
50
+ skip,
51
+ )
52
+
53
+
54
+ class SelfAttention1d(nn.Module):
55
+ def __init__(self, c_in, n_head=1, dropout_rate=0.0):
56
+ super().__init__()
57
+ assert c_in % n_head == 0
58
+ self.norm = nn.GroupNorm(1, c_in)
59
+ self.n_head = n_head
60
+ self.qkv_proj = nn.Conv1d(c_in, c_in * 3, 1)
61
+ self.out_proj = nn.Conv1d(c_in, c_in, 1)
62
+ self.dropout = nn.Dropout(dropout_rate, inplace=True)
63
+
64
+ self.use_flash = torch.cuda.is_available() and version.parse(
65
+ torch.__version__
66
+ ) >= version.parse("2.0.0")
67
+
68
+ if not self.use_flash:
69
+ return
70
+
71
+ device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
72
+
73
+ if device_properties.major == 8 and device_properties.minor == 0:
74
+ # Use flash attention for A100 GPUs
75
+ self.sdp_kernel_config = (True, False, False)
76
+ else:
77
+ # Don't use flash attention for other GPUs
78
+ self.sdp_kernel_config = (False, True, True)
79
+
80
+ def forward(self, input):
81
+ n, c, s = input.shape
82
+ qkv = self.qkv_proj(self.norm(input))
83
+ qkv = qkv.view([n, self.n_head * 3, c // self.n_head, s]).transpose(2, 3)
84
+ q, k, v = qkv.chunk(3, dim=1)
85
+ scale = k.shape[3] ** -0.25
86
+
87
+ if self.use_flash:
88
+ with sdp_kernel(*self.sdp_kernel_config):
89
+ y = (
90
+ F.scaled_dot_product_attention(q, k, v, is_causal=False)
91
+ .contiguous()
92
+ .view([n, c, s])
93
+ )
94
+ else:
95
+ att = ((q * scale) @ (k.transpose(2, 3) * scale)).softmax(3)
96
+ y = (att @ v).transpose(2, 3).contiguous().view([n, c, s])
97
+
98
+ return input + self.dropout(self.out_proj(y))
99
+
100
+
101
+ class SkipBlock(nn.Module):
102
+ def __init__(self, *main):
103
+ super().__init__()
104
+ self.main = nn.Sequential(*main)
105
+
106
+ def forward(self, input):
107
+ return torch.cat([self.main(input), input], dim=1)
108
+
109
+
110
+ class FourierFeatures(nn.Module):
111
+ def __init__(self, in_features, out_features, std=1.0):
112
+ super().__init__()
113
+ assert out_features % 2 == 0
114
+ self.weight = nn.Parameter(torch.randn([out_features // 2, in_features]) * std)
115
+
116
+ def forward(self, input):
117
+ f = 2 * math.pi * input @ self.weight.T
118
+ return torch.cat([f.cos(), f.sin()], dim=-1)
119
+
120
+
121
+ def expand_to_planes(input, shape):
122
+ return input[..., None].repeat([1, 1, shape[2]])
123
+
124
+
125
+ _kernels = {
126
+ "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8],
127
+ "cubic": [
128
+ -0.01171875,
129
+ -0.03515625,
130
+ 0.11328125,
131
+ 0.43359375,
132
+ 0.43359375,
133
+ 0.11328125,
134
+ -0.03515625,
135
+ -0.01171875,
136
+ ],
137
+ "lanczos3": [
138
+ 0.003689131001010537,
139
+ 0.015056144446134567,
140
+ -0.03399861603975296,
141
+ -0.066637322306633,
142
+ 0.13550527393817902,
143
+ 0.44638532400131226,
144
+ 0.44638532400131226,
145
+ 0.13550527393817902,
146
+ -0.066637322306633,
147
+ -0.03399861603975296,
148
+ 0.015056144446134567,
149
+ 0.003689131001010537,
150
+ ],
151
+ }
152
+
153
+
154
+ class Downsample1d(nn.Module):
155
+ def __init__(self, kernel="linear", pad_mode="reflect", channels_last=False):
156
+ super().__init__()
157
+ self.pad_mode = pad_mode
158
+ kernel_1d = torch.tensor(_kernels[kernel])
159
+ self.pad = kernel_1d.shape[0] // 2 - 1
160
+ self.register_buffer("kernel", kernel_1d)
161
+ self.channels_last = channels_last
162
+
163
+ def forward(self, x):
164
+ if self.channels_last:
165
+ x = x.permute(0, 2, 1)
166
+ x = F.pad(x, (self.pad,) * 2, self.pad_mode)
167
+ weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]])
168
+ indices = torch.arange(x.shape[1], device=x.device)
169
+ weight[indices, indices] = self.kernel.to(weight)
170
+ x = F.conv1d(x, weight, stride=2)
171
+ if self.channels_last:
172
+ x = x.permute(0, 2, 1)
173
+ return x
174
+
175
+
176
+ class Upsample1d(nn.Module):
177
+ def __init__(self, kernel="linear", pad_mode="reflect", channels_last=False):
178
+ super().__init__()
179
+ self.pad_mode = pad_mode
180
+ kernel_1d = torch.tensor(_kernels[kernel]) * 2
181
+ self.pad = kernel_1d.shape[0] // 2 - 1
182
+ self.register_buffer("kernel", kernel_1d)
183
+ self.channels_last = channels_last
184
+
185
+ def forward(self, x):
186
+ if self.channels_last:
187
+ x = x.permute(0, 2, 1)
188
+ x = F.pad(x, ((self.pad + 1) // 2,) * 2, self.pad_mode)
189
+ weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0]])
190
+ indices = torch.arange(x.shape[1], device=x.device)
191
+ weight[indices, indices] = self.kernel.to(weight)
192
+ x = F.conv_transpose1d(x, weight, stride=2, padding=self.pad * 2 + 1)
193
+ if self.channels_last:
194
+ x = x.permute(0, 2, 1)
195
+ return x
196
+
197
+
198
+ def Downsample1d_2(
199
+ in_channels: int, out_channels: int, factor: int, kernel_multiplier: int = 2
200
+ ) -> nn.Module:
201
+ assert kernel_multiplier % 2 == 0, "Kernel multiplier must be even"
202
+
203
+ return nn.Conv1d(
204
+ in_channels=in_channels,
205
+ out_channels=out_channels,
206
+ kernel_size=factor * kernel_multiplier + 1,
207
+ stride=factor,
208
+ padding=factor * (kernel_multiplier // 2),
209
+ )
210
+
211
+
212
+ def Upsample1d_2(
213
+ in_channels: int, out_channels: int, factor: int, use_nearest: bool = False
214
+ ) -> nn.Module:
215
+ if factor == 1:
216
+ return nn.Conv1d(
217
+ in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1
218
+ )
219
+
220
+ if use_nearest:
221
+ return nn.Sequential(
222
+ nn.Upsample(scale_factor=factor, mode="nearest"),
223
+ nn.Conv1d(
224
+ in_channels=in_channels,
225
+ out_channels=out_channels,
226
+ kernel_size=3,
227
+ padding=1,
228
+ ),
229
+ )
230
+ else:
231
+ return nn.ConvTranspose1d(
232
+ in_channels=in_channels,
233
+ out_channels=out_channels,
234
+ kernel_size=factor * 2,
235
+ stride=factor,
236
+ padding=factor // 2 + factor % 2,
237
+ output_padding=factor % 2,
238
+ )
239
+
240
+
241
+ def zero_init(layer):
242
+ nn.init.zeros_(layer.weight)
243
+ if layer.bias is not None:
244
+ nn.init.zeros_(layer.bias)
245
+ return layer
246
+
247
+
248
+ def rms_norm(x, scale, eps):
249
+ dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
250
+ mean_sq = torch.mean(x.to(dtype) ** 2, dim=-1, keepdim=True)
251
+ scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
252
+ return x * scale.to(x.dtype)
253
+
254
+
255
+ # rms_norm = torch.compile(rms_norm)
256
+
257
+
258
+ class AdaRMSNorm(nn.Module):
259
+ def __init__(self, features, cond_features, eps=1e-6):
260
+ super().__init__()
261
+ self.eps = eps
262
+ self.linear = zero_init(nn.Linear(cond_features, features, bias=False))
263
+
264
+ def extra_repr(self):
265
+ return f"eps={self.eps},"
266
+
267
+ def forward(self, x, cond):
268
+ return rms_norm(x, self.linear(cond)[:, None, :] + 1, self.eps)
269
+
270
+
271
+ def normalize(x, eps=1e-4):
272
+ dim = list(range(1, x.ndim))
273
+ n = torch.linalg.vector_norm(x, dim=dim, keepdim=True)
274
+ alpha = np.sqrt(n.numel() / x.numel())
275
+ return x / torch.add(eps, n, alpha=alpha)
276
+
277
+
278
+ class ForcedWNConv1d(nn.Module):
279
+ def __init__(self, in_channels, out_channels, kernel_size=1):
280
+ super().__init__()
281
+ self.weight = nn.Parameter(
282
+ torch.randn([out_channels, in_channels, kernel_size])
283
+ )
284
+
285
+ def forward(self, x):
286
+ if self.training:
287
+ with torch.no_grad():
288
+ self.weight.copy_(normalize(self.weight))
289
+
290
+ fan_in = self.weight[0].numel()
291
+
292
+ w = normalize(self.weight) / math.sqrt(fan_in)
293
+
294
+ return F.conv1d(x, w, padding="same")
295
+
296
+
297
+ # Kernels
298
+
299
+ use_compile = True
300
+
301
+
302
+ def compile(function, *args, **kwargs):
303
+ if not use_compile:
304
+ return function
305
+ try:
306
+ return torch.compile(function, *args, **kwargs)
307
+ except RuntimeError:
308
+ return function
309
+
310
+
311
+ @compile
312
+ def linear_geglu(x, weight, bias=None):
313
+ x = x @ weight.mT
314
+ if bias is not None:
315
+ x = x + bias
316
+ x, gate = x.chunk(2, dim=-1)
317
+ return x * F.gelu(gate)
318
+
319
+
320
+ @compile
321
+ def rms_norm(x, scale, eps):
322
+ dtype = reduce(torch.promote_types, (x.dtype, scale.dtype, torch.float32))
323
+ mean_sq = torch.mean(x.to(dtype) ** 2, dim=-1, keepdim=True)
324
+ scale = scale.to(dtype) * torch.rsqrt(mean_sq + eps)
325
+ return x * scale.to(x.dtype)
326
+
327
+
328
+ # Layers
329
+
330
+
331
+ class LinearGEGLU(nn.Linear):
332
+ def __init__(self, in_features, out_features, bias=True):
333
+ super().__init__(in_features, out_features * 2, bias=bias)
334
+ self.out_features = out_features
335
+
336
+ def forward(self, x):
337
+ return linear_geglu(x, self.weight, self.bias)
338
+
339
+
340
+ class RMSNorm(nn.Module):
341
+ def __init__(self, shape, fix_scale=False, eps=1e-6):
342
+ super().__init__()
343
+ self.eps = eps
344
+
345
+ if fix_scale:
346
+ self.register_buffer("scale", torch.ones(shape))
347
+ else:
348
+ self.scale = nn.Parameter(torch.ones(shape))
349
+
350
+ def extra_repr(self):
351
+ return f"shape={tuple(self.scale.shape)}, eps={self.eps}"
352
+
353
+ def forward(self, x):
354
+ return rms_norm(x, self.scale, self.eps)
355
+
356
+
357
+ def snake_beta(x, alpha, beta):
358
+ return x + (1.0 / (beta + 0.000000001)) * pow(torch.sin(x * alpha), 2)
359
+
360
+
361
+ # try:
362
+ # snake_beta = torch.compile(snake_beta)
363
+ # except RuntimeError:
364
+ # pass
365
+
366
+
367
+ # Adapted from https://github.com/NVIDIA/BigVGAN/blob/main/activations.py under MIT license
368
+ # License available in LICENSES/LICENSE_NVIDIA.txt
369
+ class SnakeBeta(nn.Module):
370
+ def __init__(
371
+ self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True
372
+ ):
373
+ super(SnakeBeta, self).__init__()
374
+ self.in_features = in_features
375
+
376
+ # initialize alpha
377
+ self.alpha_logscale = alpha_logscale
378
+ if self.alpha_logscale: # log scale alphas initialized to zeros
379
+ self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
380
+ self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
381
+ else: # linear scale alphas initialized to ones
382
+ self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
383
+ self.beta = nn.Parameter(torch.ones(in_features) * alpha)
384
+
385
+ self.alpha.requires_grad = alpha_trainable
386
+ self.beta.requires_grad = alpha_trainable
387
+
388
+ self.no_div_by_zero = 0.000000001
389
+
390
+ def forward(self, x):
391
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
392
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
393
+ if self.alpha_logscale:
394
+ alpha = torch.exp(alpha)
395
+ beta = torch.exp(beta)
396
+ x = snake_beta(x, alpha, beta)
397
+
398
+ return x
src/YingMusicSinger/utils/stable_audio_tools/bottleneck copy.py ADDED
@@ -0,0 +1,393 @@
1
+ import numpy as np
2
+ import torch
3
+ from dac.nn.quantize import ResidualVectorQuantize as DACResidualVQ
4
+ from einops import rearrange
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from vector_quantize_pytorch import FSQ, ResidualVQ
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ def __init__(self, is_discrete: bool = False):
12
+ super().__init__()
13
+
14
+ self.is_discrete = is_discrete
15
+
16
+ def encode(self, x, return_info=False, **kwargs):
17
+ raise NotImplementedError
18
+
19
+ def decode(self, x):
20
+ raise NotImplementedError
21
+
22
+
23
+ class DiscreteBottleneck(Bottleneck):
24
+ def __init__(self, num_quantizers, codebook_size, tokens_id):
25
+ super().__init__(is_discrete=True)
26
+
27
+ self.num_quantizers = num_quantizers
28
+ self.codebook_size = codebook_size
29
+ self.tokens_id = tokens_id
30
+
31
+ def decode_tokens(self, codes, **kwargs):
32
+ raise NotImplementedError
33
+
34
+
35
+ class TanhBottleneck(Bottleneck):
36
+ def __init__(self):
37
+ super().__init__(is_discrete=False)
38
+ self.tanh = nn.Tanh()
39
+
40
+ def encode(self, x, return_info=False):
41
+ info = {}
42
+
43
+ x = torch.tanh(x)
44
+
45
+ if return_info:
46
+ return x, info
47
+ else:
48
+ return x
49
+
50
+ def decode(self, x):
51
+ return x
52
+
53
+
54
+ def vae_sample(mean, scale):
55
+ stdev = nn.functional.softplus(scale) + 1e-4
56
+ var = stdev * stdev
57
+ logvar = torch.log(var)
58
+ latents = torch.randn_like(mean) * stdev + mean
59
+
60
+ kl = (mean * mean + var - logvar - 1).sum(1).mean()
61
+
62
+ return latents, kl
63
+
64
+
65
+ class VAEBottleneck(Bottleneck):
66
+ def __init__(self):
67
+ super().__init__(is_discrete=False)
68
+
69
+ def encode(self, x, return_info=False, **kwargs):
70
+ info = {}
71
+
72
+ mean, scale = x.chunk(2, dim=1)
73
+
74
+ x, kl = vae_sample(mean, scale)
75
+
76
+ info["kl"] = kl
77
+
78
+ if return_info:
79
+ return x, info
80
+ else:
81
+ return x
82
+
83
+ def decode(self, x):
84
+ return x
85
+
86
+
87
+ def compute_mean_kernel(x, y):
88
+ kernel_input = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
89
+ return torch.exp(-kernel_input).mean()
90
+
91
+
92
+ def compute_mmd(latents):
93
+ latents_reshaped = latents.permute(0, 2, 1).reshape(-1, latents.shape[1])
94
+ noise = torch.randn_like(latents_reshaped)
95
+
96
+ latents_kernel = compute_mean_kernel(latents_reshaped, latents_reshaped)
97
+ noise_kernel = compute_mean_kernel(noise, noise)
98
+ latents_noise_kernel = compute_mean_kernel(latents_reshaped, noise)
99
+
100
+ mmd = latents_kernel + noise_kernel - 2 * latents_noise_kernel
101
+ return mmd.mean()
102
+
103
+
104
+ class WassersteinBottleneck(Bottleneck):
105
+ def __init__(self, noise_augment_dim: int = 0, bypass_mmd: bool = False):
106
+ super().__init__(is_discrete=False)
107
+
108
+ self.noise_augment_dim = noise_augment_dim
109
+ self.bypass_mmd = bypass_mmd
110
+
111
+ def encode(self, x, return_info=False):
112
+ info = {}
113
+
114
+ if self.training and return_info:
115
+ if self.bypass_mmd:
116
+ mmd = torch.tensor(0.0)
117
+ else:
118
+ mmd = compute_mmd(x)
119
+
120
+ info["mmd"] = mmd
121
+
122
+ if return_info:
123
+ return x, info
124
+
125
+ return x
126
+
127
+ def decode(self, x):
128
+ if self.noise_augment_dim > 0:
129
+ noise = torch.randn(
130
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
131
+ ).type_as(x)
132
+ x = torch.cat([x, noise], dim=1)
133
+
134
+ return x
135
+
136
+
137
+ class L2Bottleneck(Bottleneck):
138
+ def __init__(self):
139
+ super().__init__(is_discrete=False)
140
+
141
+ def encode(self, x, return_info=False):
142
+ info = {}
143
+
144
+ x = F.normalize(x, dim=1)
145
+
146
+ if return_info:
147
+ return x, info
148
+ else:
149
+ return x
150
+
151
+ def decode(self, x):
152
+ return F.normalize(x, dim=1)
153
+
154
+
155
+ class RVQBottleneck(DiscreteBottleneck):
156
+ def __init__(self, **quantizer_kwargs):
157
+ super().__init__(
158
+ num_quantizers=quantizer_kwargs["num_quantizers"],
159
+ codebook_size=quantizer_kwargs["codebook_size"],
160
+ tokens_id="quantizer_indices",
161
+ )
162
+ self.quantizer = ResidualVQ(**quantizer_kwargs)
163
+ self.num_quantizers = quantizer_kwargs["num_quantizers"]
164
+
165
+ def encode(self, x, return_info=False, **kwargs):
166
+ info = {}
167
+
168
+ x = rearrange(x, "b c n -> b n c")
169
+ x, indices, loss = self.quantizer(x)
170
+ x = rearrange(x, "b n c -> b c n")
171
+
172
+ info["quantizer_indices"] = indices
173
+ info["quantizer_loss"] = loss.mean()
174
+
175
+ if return_info:
176
+ return x, info
177
+ else:
178
+ return x
179
+
180
+ def decode(self, x):
181
+ return x
182
+
183
+ def decode_tokens(self, codes, **kwargs):
184
+ latents = self.quantizer.get_outputs_from_indices(codes)
185
+
186
+ return self.decode(latents, **kwargs)
187
+
188
+
189
+ class RVQVAEBottleneck(DiscreteBottleneck):
190
+ def __init__(self, **quantizer_kwargs):
191
+ super().__init__(
192
+ num_quantizers=quantizer_kwargs["num_quantizers"],
193
+ codebook_size=quantizer_kwargs["codebook_size"],
194
+ tokens_id="quantizer_indices",
195
+ )
196
+ self.quantizer = ResidualVQ(**quantizer_kwargs)
197
+ self.num_quantizers = quantizer_kwargs["num_quantizers"]
198
+
199
+ def encode(self, x, return_info=False):
200
+ info = {}
201
+
202
+ x, kl = vae_sample(*x.chunk(2, dim=1))
203
+
204
+ info["kl"] = kl
205
+
206
+ x = rearrange(x, "b c n -> b n c")
207
+ x, indices, loss = self.quantizer(x)
208
+ x = rearrange(x, "b n c -> b c n")
209
+
210
+ info["quantizer_indices"] = indices
211
+ info["quantizer_loss"] = loss.mean()
212
+
213
+ if return_info:
214
+ return x, info
215
+ else:
216
+ return x
217
+
218
+ def decode(self, x):
219
+ return x
220
+
221
+ def decode_tokens(self, codes, **kwargs):
222
+ latents = self.quantizer.get_outputs_from_indices(codes)
223
+
224
+ return self.decode(latents, **kwargs)
225
+
226
+
227
+ class DACRVQBottleneck(DiscreteBottleneck):
228
+ def __init__(
229
+ self, quantize_on_decode=False, noise_augment_dim=0, **quantizer_kwargs
230
+ ):
231
+ super().__init__(
232
+ num_quantizers=quantizer_kwargs["n_codebooks"],
233
+ codebook_size=quantizer_kwargs["codebook_size"],
234
+ tokens_id="codes",
235
+ )
236
+ self.quantizer = DACResidualVQ(**quantizer_kwargs)
237
+ self.num_quantizers = quantizer_kwargs["n_codebooks"]
238
+ self.quantize_on_decode = quantize_on_decode
239
+ self.noise_augment_dim = noise_augment_dim
240
+
241
+ def encode(self, x, return_info=False, **kwargs):
242
+ info = {}
243
+
244
+ info["pre_quantizer"] = x
245
+
246
+ if self.quantize_on_decode:
247
+ return x, info if return_info else x
248
+
249
+ z, codes, latents, commitment_loss, codebook_loss = self.quantizer(x, **kwargs)
250
+
251
+ output = {
252
+ "z": z,
253
+ "codes": codes,
254
+ "latents": latents,
255
+ "vq/commitment_loss": commitment_loss,
256
+ "vq/codebook_loss": codebook_loss,
257
+ }
258
+
259
+ output["vq/commitment_loss"] /= self.num_quantizers
260
+ output["vq/codebook_loss"] /= self.num_quantizers
261
+
262
+ info.update(output)
263
+
264
+ if return_info:
265
+ return output["z"], info
266
+
267
+ return output["z"]
268
+
269
+ def decode(self, x):
270
+ if self.quantize_on_decode:
271
+ x = self.quantizer(x)[0]
272
+
273
+ if self.noise_augment_dim > 0:
274
+ noise = torch.randn(
275
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
276
+ ).type_as(x)
277
+ x = torch.cat([x, noise], dim=1)
278
+
279
+ return x
280
+
281
+ def decode_tokens(self, codes, **kwargs):
282
+ latents, _, _ = self.quantizer.from_codes(codes)
283
+
284
+ return self.decode(latents, **kwargs)
285
+
286
+
287
+ class DACRVQVAEBottleneck(DiscreteBottleneck):
288
+ def __init__(self, quantize_on_decode=False, **quantizer_kwargs):
289
+ super().__init__(
290
+ num_quantizers=quantizer_kwargs["n_codebooks"],
291
+ codebook_size=quantizer_kwargs["codebook_size"],
292
+ tokens_id="codes",
293
+ )
294
+ self.quantizer = DACResidualVQ(**quantizer_kwargs)
295
+ self.num_quantizers = quantizer_kwargs["n_codebooks"]
296
+ self.quantize_on_decode = quantize_on_decode
297
+
298
+ def encode(self, x, return_info=False, n_quantizers: int = None):
299
+ info = {}
300
+
301
+ mean, scale = x.chunk(2, dim=1)
302
+
303
+ x, kl = vae_sample(mean, scale)
304
+
305
+ info["pre_quantizer"] = x
306
+ info["kl"] = kl
307
+
308
+ if self.quantize_on_decode:
309
+ return x, info if return_info else x
310
+
311
+ z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
312
+ x, n_quantizers=n_quantizers
313
+ )
314
+
315
+ output = {
316
+ "z": z,
317
+ "codes": codes,
318
+ "latents": latents,
319
+ "vq/commitment_loss": commitment_loss,
320
+ "vq/codebook_loss": codebook_loss,
321
+ }
322
+
323
+ output["vq/commitment_loss"] /= self.num_quantizers
324
+ output["vq/codebook_loss"] /= self.num_quantizers
325
+
326
+ info.update(output)
327
+
328
+ if return_info:
329
+ return output["z"], info
330
+
331
+ return output["z"]
332
+
333
+ def decode(self, x):
334
+ if self.quantize_on_decode:
335
+ x = self.quantizer(x)[0]
336
+
337
+ return x
338
+
339
+ def decode_tokens(self, codes, **kwargs):
340
+ latents, _, _ = self.quantizer.from_codes(codes)
341
+
342
+ return self.decode(latents, **kwargs)
343
+
344
+
345
+ class FSQBottleneck(DiscreteBottleneck):
346
+ def __init__(self, noise_augment_dim=0, **kwargs):
347
+ super().__init__(
348
+ num_quantizers=kwargs.get("num_codebooks", 1),
349
+ codebook_size=np.prod(kwargs["levels"]),
350
+ tokens_id="quantizer_indices",
351
+ )
352
+
353
+ self.noise_augment_dim = noise_augment_dim
354
+
355
+ self.quantizer = FSQ(
356
+ **kwargs, allowed_dtypes=[torch.float16, torch.float32, torch.float64]
357
+ )
358
+
359
+ def encode(self, x, return_info=False):
360
+ info = {}
361
+
362
+ orig_dtype = x.dtype
363
+ x = x.float()
364
+
365
+ x = rearrange(x, "b c n -> b n c")
366
+ x, indices = self.quantizer(x)
367
+ x = rearrange(x, "b n c -> b c n")
368
+
369
+ x = x.to(orig_dtype)
370
+
371
+ # Reorder indices to match the expected format
372
+ indices = rearrange(indices, "b n q -> b q n")
373
+
374
+ info["quantizer_indices"] = indices
375
+
376
+ if return_info:
377
+ return x, info
378
+ else:
379
+ return x
380
+
381
+ def decode(self, x):
382
+ if self.noise_augment_dim > 0:
383
+ noise = torch.randn(
384
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
385
+ ).type_as(x)
386
+ x = torch.cat([x, noise], dim=1)
387
+
388
+ return x
389
+
390
+ def decode_tokens(self, tokens, **kwargs):
391
+ latents = self.quantizer.indices_to_codes(tokens)
392
+
393
+ return self.decode(latents, **kwargs)
src/YingMusicSinger/utils/stable_audio_tools/bottleneck.py ADDED
@@ -0,0 +1,393 @@
1
+ import numpy as np
2
+ import torch
3
+ from dac.nn.quantize import ResidualVectorQuantize as DACResidualVQ
4
+ from einops import rearrange
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from vector_quantize_pytorch import FSQ, ResidualVQ
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ def __init__(self, is_discrete: bool = False):
12
+ super().__init__()
13
+
14
+ self.is_discrete = is_discrete
15
+
16
+ def encode(self, x, return_info=False, **kwargs):
17
+ raise NotImplementedError
18
+
19
+ def decode(self, x):
20
+ raise NotImplementedError
21
+
22
+
23
+ class DiscreteBottleneck(Bottleneck):
24
+ def __init__(self, num_quantizers, codebook_size, tokens_id):
25
+ super().__init__(is_discrete=True)
26
+
27
+ self.num_quantizers = num_quantizers
28
+ self.codebook_size = codebook_size
29
+ self.tokens_id = tokens_id
30
+
31
+ def decode_tokens(self, codes, **kwargs):
32
+ raise NotImplementedError
33
+
34
+
35
+ class TanhBottleneck(Bottleneck):
36
+ def __init__(self):
37
+ super().__init__(is_discrete=False)
38
+ self.tanh = nn.Tanh()
39
+
40
+ def encode(self, x, return_info=False):
41
+ info = {}
42
+
43
+ x = torch.tanh(x)
44
+
45
+ if return_info:
46
+ return x, info
47
+ else:
48
+ return x
49
+
50
+ def decode(self, x):
51
+ return x
52
+
53
+
54
+ def vae_sample(mean, scale):
55
+ stdev = nn.functional.softplus(scale) + 1e-4
56
+ var = stdev * stdev
57
+ logvar = torch.log(var)
58
+ latents = torch.randn_like(mean) * stdev + mean
59
+
60
+ kl = (mean * mean + var - logvar - 1).sum(1).mean()
61
+
62
+ return latents, kl
63
+
64
+
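The vae_sample helper above draws latents with the reparameterization trick and returns a KL penalty against a standard normal prior. For reference, the closed-form KL it corresponds to is D_KL(N(mu, sigma^2) || N(0, 1)) = 1/2 * sum_i (mu_i^2 + sigma_i^2 - log sigma_i^2 - 1), summed over channels and averaged over the batch; the code keeps the same expression but omits the constant 1/2, which only rescales the weight of the KL term.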
65
+ class VAEBottleneck(Bottleneck):
66
+ def __init__(self):
67
+ super().__init__(is_discrete=False)
68
+
69
+ def encode(self, x, return_info=False, **kwargs):
70
+ info = {}
71
+
72
+ mean, scale = x.chunk(2, dim=1)
73
+
74
+ x, kl = vae_sample(mean, scale)
75
+
76
+ info["kl"] = kl
77
+
78
+ if return_info:
79
+ return x, info
80
+ else:
81
+ return x
82
+
83
+ def decode(self, x):
84
+ return x
85
+
86
+
87
+ def compute_mean_kernel(x, y):
88
+ kernel_input = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
89
+ return torch.exp(-kernel_input).mean()
90
+
91
+
92
+ def compute_mmd(latents):
93
+ latents_reshaped = latents.permute(0, 2, 1).reshape(-1, latents.shape[1])
94
+ noise = torch.randn_like(latents_reshaped)
95
+
96
+ latents_kernel = compute_mean_kernel(latents_reshaped, latents_reshaped)
97
+ noise_kernel = compute_mean_kernel(noise, noise)
98
+ latents_noise_kernel = compute_mean_kernel(latents_reshaped, noise)
99
+
100
+ mmd = latents_kernel + noise_kernel - 2 * latents_noise_kernel
101
+ return mmd.mean()
102
+
103
+
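compute_mmd above estimates the (biased) maximum mean discrepancy between the batch of latents and samples from a standard normal, using an RBF-style kernel: MMD^2 ~ E[k(z, z')] + E[k(n, n')] - 2 E[k(z, n)], where z are the flattened latent vectors and n are Gaussian noise vectors of the same shape. The WassersteinBottleneck below uses this as a training-time regularizer (reported under the "mmd" key of the info dict) instead of a sampled KL term.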
104
+ class WassersteinBottleneck(Bottleneck):
105
+ def __init__(self, noise_augment_dim: int = 0, bypass_mmd: bool = False):
106
+ super().__init__(is_discrete=False)
107
+
108
+ self.noise_augment_dim = noise_augment_dim
109
+ self.bypass_mmd = bypass_mmd
110
+
111
+ def encode(self, x, return_info=False):
112
+ info = {}
113
+
114
+ if self.training and return_info:
115
+ if self.bypass_mmd:
116
+ mmd = torch.tensor(0.0)
117
+ else:
118
+ mmd = compute_mmd(x)
119
+
120
+ info["mmd"] = mmd
121
+
122
+ if return_info:
123
+ return x, info
124
+
125
+ return x
126
+
127
+ def decode(self, x):
128
+ if self.noise_augment_dim > 0:
129
+ noise = torch.randn(
130
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
131
+ ).type_as(x)
132
+ x = torch.cat([x, noise], dim=1)
133
+
134
+ return x
135
+
136
+
137
+ class L2Bottleneck(Bottleneck):
138
+ def __init__(self):
139
+ super().__init__(is_discrete=False)
140
+
141
+ def encode(self, x, return_info=False):
142
+ info = {}
143
+
144
+ x = F.normalize(x, dim=1)
145
+
146
+ if return_info:
147
+ return x, info
148
+ else:
149
+ return x
150
+
151
+ def decode(self, x):
152
+ return F.normalize(x, dim=1)
153
+
154
+
155
+ class RVQBottleneck(DiscreteBottleneck):
156
+ def __init__(self, **quantizer_kwargs):
157
+ super().__init__(
158
+ num_quantizers=quantizer_kwargs["num_quantizers"],
159
+ codebook_size=quantizer_kwargs["codebook_size"],
160
+ tokens_id="quantizer_indices",
161
+ )
162
+ self.quantizer = ResidualVQ(**quantizer_kwargs)
163
+ self.num_quantizers = quantizer_kwargs["num_quantizers"]
164
+
165
+ def encode(self, x, return_info=False, **kwargs):
166
+ info = {}
167
+
168
+ x = rearrange(x, "b c n -> b n c")
169
+ x, indices, loss = self.quantizer(x)
170
+ x = rearrange(x, "b n c -> b c n")
171
+
172
+ info["quantizer_indices"] = indices
173
+ info["quantizer_loss"] = loss.mean()
174
+
175
+ if return_info:
176
+ return x, info
177
+ else:
178
+ return x
179
+
180
+ def decode(self, x):
181
+ return x
182
+
183
+ def decode_tokens(self, codes, **kwargs):
184
+ latents = self.quantizer.get_outputs_from_indices(codes)
185
+
186
+ return self.decode(latents, **kwargs)
187
+
188
+
189
+ class RVQVAEBottleneck(DiscreteBottleneck):
190
+ def __init__(self, **quantizer_kwargs):
191
+ super().__init__(
192
+ num_quantizers=quantizer_kwargs["num_quantizers"],
193
+ codebook_size=quantizer_kwargs["codebook_size"],
194
+ tokens_id="quantizer_indices",
195
+ )
196
+ self.quantizer = ResidualVQ(**quantizer_kwargs)
197
+ self.num_quantizers = quantizer_kwargs["num_quantizers"]
198
+
199
+ def encode(self, x, return_info=False):
200
+ info = {}
201
+
202
+ x, kl = vae_sample(*x.chunk(2, dim=1))
203
+
204
+ info["kl"] = kl
205
+
206
+ x = rearrange(x, "b c n -> b n c")
207
+ x, indices, loss = self.quantizer(x)
208
+ x = rearrange(x, "b n c -> b c n")
209
+
210
+ info["quantizer_indices"] = indices
211
+ info["quantizer_loss"] = loss.mean()
212
+
213
+ if return_info:
214
+ return x, info
215
+ else:
216
+ return x
217
+
218
+ def decode(self, x):
219
+ return x
220
+
221
+ def decode_tokens(self, codes, **kwargs):
222
+ latents = self.quantizer.get_outputs_from_indices(codes)
223
+
224
+ return self.decode(latents, **kwargs)
225
+
226
+
227
+ class DACRVQBottleneck(DiscreteBottleneck):
228
+ def __init__(
229
+ self, quantize_on_decode=False, noise_augment_dim=0, **quantizer_kwargs
230
+ ):
231
+ super().__init__(
232
+ num_quantizers=quantizer_kwargs["n_codebooks"],
233
+ codebook_size=quantizer_kwargs["codebook_size"],
234
+ tokens_id="codes",
235
+ )
236
+ self.quantizer = DACResidualVQ(**quantizer_kwargs)
237
+ self.num_quantizers = quantizer_kwargs["n_codebooks"]
238
+ self.quantize_on_decode = quantize_on_decode
239
+ self.noise_augment_dim = noise_augment_dim
240
+
241
+ def encode(self, x, return_info=False, **kwargs):
242
+ info = {}
243
+
244
+ info["pre_quantizer"] = x
245
+
246
+ if self.quantize_on_decode:
247
+ return (x, info) if return_info else x
248
+
249
+ z, codes, latents, commitment_loss, codebook_loss = self.quantizer(x, **kwargs)
250
+
251
+ output = {
252
+ "z": z,
253
+ "codes": codes,
254
+ "latents": latents,
255
+ "vq/commitment_loss": commitment_loss,
256
+ "vq/codebook_loss": codebook_loss,
257
+ }
258
+
259
+ output["vq/commitment_loss"] /= self.num_quantizers
260
+ output["vq/codebook_loss"] /= self.num_quantizers
261
+
262
+ info.update(output)
263
+
264
+ if return_info:
265
+ return output["z"], info
266
+
267
+ return output["z"]
268
+
269
+ def decode(self, x):
270
+ if self.quantize_on_decode:
271
+ x = self.quantizer(x)[0]
272
+
273
+ if self.noise_augment_dim > 0:
274
+ noise = torch.randn(
275
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
276
+ ).type_as(x)
277
+ x = torch.cat([x, noise], dim=1)
278
+
279
+ return x
280
+
281
+ def decode_tokens(self, codes, **kwargs):
282
+ latents, _, _ = self.quantizer.from_codes(codes)
283
+
284
+ return self.decode(latents, **kwargs)
285
+
286
+
287
+ class DACRVQVAEBottleneck(DiscreteBottleneck):
288
+ def __init__(self, quantize_on_decode=False, **quantizer_kwargs):
289
+ super().__init__(
290
+ num_quantizers=quantizer_kwargs["n_codebooks"],
291
+ codebook_size=quantizer_kwargs["codebook_size"],
292
+ tokens_id="codes",
293
+ )
294
+ self.quantizer = DACResidualVQ(**quantizer_kwargs)
295
+ self.num_quantizers = quantizer_kwargs["n_codebooks"]
296
+ self.quantize_on_decode = quantize_on_decode
297
+
298
+ def encode(self, x, return_info=False, n_quantizers: int = None):
299
+ info = {}
300
+
301
+ mean, scale = x.chunk(2, dim=1)
302
+
303
+ x, kl = vae_sample(mean, scale)
304
+
305
+ info["pre_quantizer"] = x
306
+ info["kl"] = kl
307
+
308
+ if self.quantize_on_decode:
309
+ return (x, info) if return_info else x
310
+
311
+ z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
312
+ x, n_quantizers=n_quantizers
313
+ )
314
+
315
+ output = {
316
+ "z": z,
317
+ "codes": codes,
318
+ "latents": latents,
319
+ "vq/commitment_loss": commitment_loss,
320
+ "vq/codebook_loss": codebook_loss,
321
+ }
322
+
323
+ output["vq/commitment_loss"] /= self.num_quantizers
324
+ output["vq/codebook_loss"] /= self.num_quantizers
325
+
326
+ info.update(output)
327
+
328
+ if return_info:
329
+ return output["z"], info
330
+
331
+ return output["z"]
332
+
333
+ def decode(self, x):
334
+ if self.quantize_on_decode:
335
+ x = self.quantizer(x)[0]
336
+
337
+ return x
338
+
339
+ def decode_tokens(self, codes, **kwargs):
340
+ latents, _, _ = self.quantizer.from_codes(codes)
341
+
342
+ return self.decode(latents, **kwargs)
343
+
344
+
345
+ class FSQBottleneck(DiscreteBottleneck):
346
+ def __init__(self, noise_augment_dim=0, **kwargs):
347
+ super().__init__(
348
+ num_quantizers=kwargs.get("num_codebooks", 1),
349
+ codebook_size=np.prod(kwargs["levels"]),
350
+ tokens_id="quantizer_indices",
351
+ )
352
+
353
+ self.noise_augment_dim = noise_augment_dim
354
+
355
+ self.quantizer = FSQ(
356
+ **kwargs, allowed_dtypes=[torch.float16, torch.float32, torch.float64]
357
+ )
358
+
359
+ def encode(self, x, return_info=False):
360
+ info = {}
361
+
362
+ orig_dtype = x.dtype
363
+ x = x.float()
364
+
365
+ x = rearrange(x, "b c n -> b n c")
366
+ x, indices = self.quantizer(x)
367
+ x = rearrange(x, "b n c -> b c n")
368
+
369
+ x = x.to(orig_dtype)
370
+
371
+ # Reorder indices to match the expected format
372
+ indices = rearrange(indices, "b n q -> b q n")
373
+
374
+ info["quantizer_indices"] = indices
375
+
376
+ if return_info:
377
+ return x, info
378
+ else:
379
+ return x
380
+
381
+ def decode(self, x):
382
+ if self.noise_augment_dim > 0:
383
+ noise = torch.randn(
384
+ x.shape[0], self.noise_augment_dim, x.shape[-1]
385
+ ).type_as(x)
386
+ x = torch.cat([x, noise], dim=1)
387
+
388
+ return x
389
+
390
+ def decode_tokens(self, tokens, **kwargs):
391
+ latents = self.quantizer.indices_to_codes(tokens)
392
+
393
+ return self.decode(latents, **kwargs)
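A minimal usage sketch for the continuous bottlenecks defined above; the import path, channel counts and sequence length are illustrative, and VAEBottleneck expects its input channels to split in half into mean and scale:

import torch
# assuming the src/ directory is on PYTHONPATH so the module imports as below
from YingMusicSinger.utils.stable_audio_tools.bottleneck import TanhBottleneck, VAEBottleneck

vae = VAEBottleneck()
x = torch.randn(2, 128, 256)                      # (batch, 2 * latent_dim, time)
latents, info = vae.encode(x, return_info=True)   # latents: (2, 64, 256)
print(latents.shape, float(info["kl"]))

tanh = TanhBottleneck()
y = tanh.decode(tanh.encode(torch.randn(2, 64, 256)))  # decode is the identity after tanh squashing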
src/YingMusicSinger/utils/stable_audio_tools/conditioners.py ADDED
@@ -0,0 +1,664 @@
1
+ # Heavily influenced by https://github.com/facebookresearch/audiocraft/blob/main/audiocraft/modules/conditioners.py
2
+
3
+ import gc
4
+ import logging
5
+ import string
6
+ import typing as tp
7
+ import warnings
8
+
9
+ import torch
10
+ from torch import nn
11
+
12
+ from .adp import NumberEmbedder
13
+
14
+ # from ..inference.utils import set_audio_channels
15
+ from .factory import create_pretransform_from_config
16
+ from .pretransforms import Pretransform
17
+
18
+ # from ..training.utils import copy_state_dict
19
+ from .utils import load_ckpt_state_dict
20
+
21
+
22
+ class Conditioner(nn.Module):
23
+ def __init__(self, dim: int, output_dim: int, project_out: bool = False):
24
+ super().__init__()
25
+
26
+ self.dim = dim
27
+ self.output_dim = output_dim
28
+ self.proj_out = (
29
+ nn.Linear(dim, output_dim)
30
+ if (dim != output_dim or project_out)
31
+ else nn.Identity()
32
+ )
33
+
34
+ def forward(self, x: tp.Any) -> tp.Any:
35
+ raise NotImplementedError()
36
+
37
+
38
+ class IntConditioner(Conditioner):
39
+ def __init__(self, output_dim: int, min_val: int = 0, max_val: int = 512):
40
+ super().__init__(output_dim, output_dim)
41
+
42
+ self.min_val = min_val
43
+ self.max_val = max_val
44
+ self.int_embedder = nn.Embedding(
45
+ max_val - min_val + 1, output_dim
46
+ ).requires_grad_(True)
47
+
48
+ def forward(self, ints: tp.List[int], device=None) -> tp.Any:
49
+ # self.int_embedder.to(device)
50
+
51
+ ints = torch.tensor(ints).to(device)
52
+ ints = ints.clamp(self.min_val, self.max_val)
53
+
54
+ int_embeds = self.int_embedder(ints).unsqueeze(1)
55
+
56
+ return [int_embeds, torch.ones(int_embeds.shape[0], 1).to(device)]
57
+
58
+
59
+ class NumberConditioner(Conditioner):
60
+ """
61
+ Conditioner that takes a list of floats, normalizes them for a given range, and returns a list of embeddings
62
+ """
63
+
64
+ def __init__(self, output_dim: int, min_val: float = 0, max_val: float = 1):
65
+ super().__init__(output_dim, output_dim)
66
+
67
+ self.min_val = min_val
68
+ self.max_val = max_val
69
+
70
+ self.embedder = NumberEmbedder(features=output_dim)
71
+
72
+ def forward(self, floats: tp.List[float], device=None) -> tp.Any:
73
+ # Cast the inputs to floats
74
+ floats = [float(x) for x in floats]
75
+
76
+ floats = torch.tensor(floats).to(device)
77
+
78
+ floats = floats.clamp(self.min_val, self.max_val)
79
+
80
+ normalized_floats = (floats - self.min_val) / (self.max_val - self.min_val)
81
+
82
+ # Cast floats to same type as embedder
83
+ embedder_dtype = next(self.embedder.parameters()).dtype
84
+ normalized_floats = normalized_floats.to(embedder_dtype)
85
+
86
+ float_embeds = self.embedder(normalized_floats).unsqueeze(1)
87
+
88
+ return [float_embeds, torch.ones(float_embeds.shape[0], 1).to(device)]
89
+
90
+
91
+ class CLAPTextConditioner(Conditioner):
92
+ def __init__(
93
+ self,
94
+ output_dim: int,
95
+ clap_ckpt_path,
96
+ use_text_features=False,
97
+ feature_layer_ix: int = -1,
98
+ audio_model_type="HTSAT-base",
99
+ enable_fusion=True,
100
+ project_out: bool = False,
101
+ finetune: bool = False,
102
+ ):
103
+ super().__init__(
104
+ 768 if use_text_features else 512, output_dim, project_out=project_out
105
+ )
106
+
107
+ self.use_text_features = use_text_features
108
+ self.feature_layer_ix = feature_layer_ix
109
+ self.finetune = finetune
110
+
111
+ # Suppress logging from transformers
112
+ previous_level = logging.root.manager.disable
113
+ logging.disable(logging.ERROR)
114
+ with warnings.catch_warnings():
115
+ warnings.simplefilter("ignore")
116
+ try:
117
+ import laion_clap
118
+ from laion_clap.clap_module.factory import (
119
+ load_state_dict as clap_load_state_dict,
120
+ )
121
+
122
+ model = laion_clap.CLAP_Module(
123
+ enable_fusion=enable_fusion, amodel=audio_model_type, device="cpu"
124
+ )
125
+
126
+ if self.finetune:
127
+ self.model = model
128
+ else:
129
+ self.__dict__["model"] = model
130
+
131
+ state_dict = clap_load_state_dict(clap_ckpt_path)
132
+ self.model.model.load_state_dict(state_dict, strict=False)
133
+
134
+ if self.finetune:
135
+ self.model.model.text_branch.requires_grad_(True)
136
+ self.model.model.text_branch.train()
137
+ else:
138
+ self.model.model.text_branch.requires_grad_(False)
139
+ self.model.model.text_branch.eval()
140
+
141
+ finally:
142
+ logging.disable(previous_level)
143
+
144
+ del self.model.model.audio_branch
145
+
146
+ gc.collect()
147
+ torch.cuda.empty_cache()
148
+
149
+ def get_clap_features(self, prompts, layer_ix=-2, device: tp.Any = "cuda"):
150
+ prompt_tokens = self.model.tokenizer(prompts)
151
+ attention_mask = prompt_tokens["attention_mask"].to(
152
+ device=device, non_blocking=True
153
+ )
154
+ prompt_features = self.model.model.text_branch(
155
+ input_ids=prompt_tokens["input_ids"].to(device=device, non_blocking=True),
156
+ attention_mask=attention_mask,
157
+ output_hidden_states=True,
158
+ )["hidden_states"][layer_ix]
159
+
160
+ return prompt_features, attention_mask
161
+
162
+ def forward(self, texts: tp.List[str], device: tp.Any = "cuda") -> tp.Any:
163
+ self.model.to(device)
164
+
165
+ if self.use_text_features:
166
+ if len(texts) == 1:
167
+ text_features, text_attention_mask = self.get_clap_features(
168
+ [texts[0], ""], layer_ix=self.feature_layer_ix, device=device
169
+ )
170
+ text_features = text_features[:1, ...]
171
+ text_attention_mask = text_attention_mask[:1, ...]
172
+ else:
173
+ text_features, text_attention_mask = self.get_clap_features(
174
+ texts, layer_ix=self.feature_layer_ix, device=device
175
+ )
176
+ return [self.proj_out(text_features), text_attention_mask]
177
+
178
+ # Fix for CLAP bug when only one text is passed
179
+ if len(texts) == 1:
180
+ text_embedding = self.model.get_text_embedding(
181
+ [texts[0], ""], use_tensor=True
182
+ )[:1, ...]
183
+ else:
184
+ text_embedding = self.model.get_text_embedding(texts, use_tensor=True)
185
+
186
+ text_embedding = text_embedding.unsqueeze(1).to(device)
187
+
188
+ return [
189
+ self.proj_out(text_embedding),
190
+ torch.ones(text_embedding.shape[0], 1).to(device),
191
+ ]
192
+
193
+
194
+ class CLAPAudioConditioner(Conditioner):
195
+ def __init__(
196
+ self,
197
+ output_dim: int,
198
+ clap_ckpt_path,
199
+ audio_model_type="HTSAT-base",
200
+ enable_fusion=True,
201
+ project_out: bool = False,
+ finetune: bool = False,
+ ):
+ super().__init__(512, output_dim, project_out=project_out)
+ self.finetune = finetune
204
+
205
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
206
+
207
+ # Suppress logging from transformers
208
+ previous_level = logging.root.manager.disable
209
+ logging.disable(logging.ERROR)
210
+ with warnings.catch_warnings():
211
+ warnings.simplefilter("ignore")
212
+ try:
213
+ import laion_clap
214
+ from laion_clap.clap_module.factory import (
215
+ load_state_dict as clap_load_state_dict,
216
+ )
217
+
218
+ model = laion_clap.CLAP_Module(
219
+ enable_fusion=enable_fusion, amodel=audio_model_type, device="cpu"
220
+ )
221
+
222
+ if self.finetune:
223
+ self.model = model
224
+ else:
225
+ self.__dict__["model"] = model
226
+
227
+ state_dict = clap_load_state_dict(clap_ckpt_path)
228
+ self.model.model.load_state_dict(state_dict, strict=False)
229
+
230
+ if self.finetune:
231
+ self.model.model.audio_branch.requires_grad_(True)
232
+ self.model.model.audio_branch.train()
233
+ else:
234
+ self.model.model.audio_branch.requires_grad_(False)
235
+ self.model.model.audio_branch.eval()
236
+
237
+ finally:
238
+ logging.disable(previous_level)
239
+
240
+ del self.model.model.text_branch
241
+
242
+ gc.collect()
243
+ torch.cuda.empty_cache()
244
+
245
+ def forward(
246
+ self,
247
+ audios: tp.Union[torch.Tensor, tp.List[torch.Tensor], tp.Tuple[torch.Tensor]],
248
+ device: tp.Any = "cuda",
249
+ ) -> tp.Any:
250
+ self.model.to(device)
251
+
252
+ if isinstance(audios, list) or isinstance(audios, tuple):
253
+ audios = torch.cat(audios, dim=0)
254
+
255
+ # Convert to mono
256
+ mono_audios = audios.mean(dim=1)
257
+
258
+ with torch.cuda.amp.autocast(enabled=False):
259
+ audio_embedding = self.model.get_audio_embedding_from_data(
260
+ mono_audios.float(), use_tensor=True
261
+ )
262
+
263
+ audio_embedding = audio_embedding.unsqueeze(1).to(device)
264
+
265
+ return [
266
+ self.proj_out(audio_embedding),
267
+ torch.ones(audio_embedding.shape[0], 1).to(device),
268
+ ]
269
+
270
+
271
+ class T5Conditioner(Conditioner):
272
+ T5_MODELS = [
273
+ "t5-small",
274
+ "t5-base",
275
+ "t5-large",
276
+ "t5-3b",
277
+ "t5-11b",
278
+ "google/flan-t5-small",
279
+ "google/flan-t5-base",
280
+ "google/flan-t5-large",
281
+ "google/flan-t5-xl",
282
+ "google/flan-t5-xxl",
283
+ ]
284
+
285
+ T5_MODEL_DIMS = {
286
+ "t5-small": 512,
287
+ "t5-base": 768,
288
+ "t5-large": 1024,
289
+ "t5-3b": 1024,
290
+ "t5-11b": 1024,
291
+ "t5-xl": 2048,
292
+ "t5-xxl": 4096,
293
+ "google/flan-t5-small": 512,
294
+ "google/flan-t5-base": 768,
295
+ "google/flan-t5-large": 1024,
296
+ "google/flan-t5-3b": 1024,
297
+ "google/flan-t5-11b": 1024,
298
+ "google/flan-t5-xl": 2048,
299
+ "google/flan-t5-xxl": 4096,
300
+ }
301
+
302
+ def __init__(
303
+ self,
304
+ output_dim: int,
305
+ t5_model_name: str = "t5-base",
306
+ max_length: str = 128,
307
+ enable_grad: bool = False,
308
+ project_out: bool = False,
309
+ ):
310
+ assert t5_model_name in self.T5_MODELS, (
311
+ f"Unknown T5 model name: {t5_model_name}"
312
+ )
313
+ super().__init__(
314
+ self.T5_MODEL_DIMS[t5_model_name], output_dim, project_out=project_out
315
+ )
316
+
317
+ from transformers import AutoTokenizer, T5EncoderModel
318
+
319
+ self.max_length = max_length
320
+ self.enable_grad = enable_grad
321
+
322
+ # Suppress logging from transformers
323
+ previous_level = logging.root.manager.disable
324
+ logging.disable(logging.ERROR)
325
+ with warnings.catch_warnings():
326
+ warnings.simplefilter("ignore")
327
+ try:
328
+ # self.tokenizer = T5Tokenizer.from_pretrained(t5_model_name, model_max_length = max_length)
329
+ # model = T5EncoderModel.from_pretrained(t5_model_name, max_length=max_length).train(enable_grad).requires_grad_(enable_grad)
330
+ self.tokenizer = AutoTokenizer.from_pretrained(t5_model_name)
331
+ model = (
332
+ T5EncoderModel.from_pretrained(t5_model_name)
333
+ .train(enable_grad)
334
+ .requires_grad_(enable_grad)
335
+ .to(torch.float16)
336
+ )
337
+ finally:
338
+ logging.disable(previous_level)
339
+
340
+ if self.enable_grad:
341
+ self.model = model
342
+ else:
343
+ self.__dict__["model"] = model
344
+
345
+ def forward(
346
+ self, texts: tp.List[str], device: tp.Union[torch.device, str]
347
+ ) -> tp.Tuple[torch.Tensor, torch.Tensor]:
348
+ self.model.to(device)
349
+ self.proj_out.to(device)
350
+
351
+ encoded = self.tokenizer(
352
+ texts,
353
+ truncation=True,
354
+ max_length=self.max_length,
355
+ padding="max_length",
356
+ return_tensors="pt",
357
+ )
358
+
359
+ input_ids = encoded["input_ids"].to(device)
360
+ attention_mask = encoded["attention_mask"].to(device).to(torch.bool)
361
+
362
+ self.model.eval()
363
+
364
+ with torch.cuda.amp.autocast(dtype=torch.float16), torch.set_grad_enabled(
+ self.enable_grad
+ ):
367
+ embeddings = self.model(input_ids=input_ids, attention_mask=attention_mask)[
368
+ "last_hidden_state"
369
+ ]
370
+
371
+ embeddings = self.proj_out(embeddings.float())
372
+
373
+ embeddings = embeddings * attention_mask.unsqueeze(-1).float()
374
+
375
+ return embeddings, attention_mask
376
+
377
+
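A brief usage sketch for T5Conditioner; the model name, batch and dimensions are illustrative, and since the encoder is cast to float16 in __init__, a CUDA device is assumed here:

import torch

t5 = T5Conditioner(output_dim=768, t5_model_name="t5-base", max_length=64)
embeddings, mask = t5(["warm analog synth pad", "soft female vocal"], device="cuda")
# embeddings: (2, 64, 768), already zeroed where the mask is False
# mask: (2, 64) boolean attention mask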
378
+ class PhonemeConditioner(Conditioner):
379
+ """
380
+ A conditioner that turns text into phonemes and embeds them using a lookup table
381
+ Only works for English text
382
+
383
+ Args:
384
+ output_dim: the dimension of the output embeddings
385
+ max_length: the maximum number of phonemes to embed
386
+ project_out: whether to add another linear projection to the output embeddings
387
+ """
388
+
389
+ def __init__(
390
+ self,
391
+ output_dim: int,
392
+ max_length: int = 1024,
393
+ project_out: bool = False,
394
+ ):
395
+ super().__init__(output_dim, output_dim, project_out=project_out)
396
+
397
+ from g2p_en import G2p
398
+
399
+ self.max_length = max_length
400
+
401
+ self.g2p = G2p()
402
+
403
+ # Reserving 0 for padding, 1 for ignored
404
+ self.phoneme_embedder = nn.Embedding(len(self.g2p.phonemes) + 2, output_dim)
405
+
406
+ def forward(
407
+ self, texts: tp.List[str], device: tp.Union[torch.device, str]
408
+ ) -> tp.Tuple[torch.Tensor, torch.Tensor]:
409
+ self.phoneme_embedder.to(device)
410
+ self.proj_out.to(device)
411
+
412
+ batch_phonemes = [
413
+ self.g2p(text) for text in texts
414
+ ] # shape [batch_size, length]
415
+
416
+ phoneme_ignore = [" ", *string.punctuation]
417
+
418
+ # Remove ignored phonemes and cut to max length
419
+ batch_phonemes = [
420
+ [p if p not in phoneme_ignore else "_" for p in phonemes]
421
+ for phonemes in batch_phonemes
422
+ ]
423
+
424
+ # Convert to ids
425
+ phoneme_ids = [
426
+ [self.g2p.p2idx[p] + 2 if p in self.g2p.p2idx else 1 for p in phonemes]
427
+ for phonemes in batch_phonemes
428
+ ]
429
+
430
+ # Pad to match longest and make a mask tensor for the padding
431
+ longest = max([len(ids) for ids in phoneme_ids])
432
+ phoneme_ids = [ids + [0] * (longest - len(ids)) for ids in phoneme_ids]
433
+
434
+ phoneme_ids = torch.tensor(phoneme_ids).to(device)
435
+
436
+ # Convert to embeddings
437
+ phoneme_embeds = self.phoneme_embedder(phoneme_ids)
438
+
439
+ phoneme_embeds = self.proj_out(phoneme_embeds)
440
+
441
+ return phoneme_embeds, torch.ones(
442
+ phoneme_embeds.shape[0], phoneme_embeds.shape[1]
443
+ ).to(device)
444
+
445
+
446
+ class TokenizerLUTConditioner(Conditioner):
447
+ """
448
+ A conditioner that embeds text using a lookup table on a pretrained tokenizer's vocabulary
449
+
450
+ Args:
451
+ tokenizer_name: the name of the tokenizer from the Hugging Face transformers library
452
+ output_dim: the dimension of the output embeddings
453
+ max_length: the maximum length of the text to embed
454
+ project_out: whether to add another linear projection to the output embeddings
455
+ """
456
+
457
+ def __init__(
458
+ self,
459
+ tokenizer_name: str, # Name of a tokenizer from the Hugging Face transformers library
460
+ output_dim: int,
461
+ max_length: int = 1024,
462
+ project_out: bool = False,
463
+ ):
464
+ super().__init__(output_dim, output_dim, project_out=project_out)
465
+
466
+ from transformers import AutoTokenizer
467
+
468
+ # Suppress logging from transformers
469
+ previous_level = logging.root.manager.disable
470
+ logging.disable(logging.ERROR)
471
+ with warnings.catch_warnings():
472
+ warnings.simplefilter("ignore")
473
+ try:
474
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
475
+ finally:
476
+ logging.disable(previous_level)
477
+
478
+ self.max_length = max_length
479
+
480
+ self.token_embedder = nn.Embedding(len(self.tokenizer), output_dim)
481
+
482
+ def forward(
483
+ self, texts: tp.List[str], device: tp.Union[torch.device, str]
484
+ ) -> tp.Tuple[torch.Tensor, torch.Tensor]:
485
+ self.proj_out.to(device)
486
+
487
+ encoded = self.tokenizer(
488
+ texts,
489
+ truncation=True,
490
+ max_length=self.max_length,
491
+ padding="max_length",
492
+ return_tensors="pt",
493
+ )
494
+
495
+ input_ids = encoded["input_ids"].to(device)
496
+ attention_mask = encoded["attention_mask"].to(device).to(torch.bool)
497
+
498
+ embeddings = self.token_embedder(input_ids)
499
+
500
+ embeddings = self.proj_out(embeddings)
501
+
502
+ embeddings = embeddings * attention_mask.unsqueeze(-1).float()
503
+
504
+ return embeddings, attention_mask
505
+
506
+
507
+ class PretransformConditioner(Conditioner):
508
+ """
509
+ A conditioner that uses a pretransform's encoder for conditioning
510
+
511
+ Args:
512
+ pretransform: an instantiated pretransform to use for conditioning
513
+ output_dim: the dimension of the output embeddings
514
+ """
515
+
516
+ def __init__(self, pretransform: Pretransform, output_dim: int):
517
+ super().__init__(pretransform.encoded_channels, output_dim)
518
+
519
+ self.pretransform = pretransform
520
+
521
+ def forward(
522
+ self,
523
+ audio: tp.Union[torch.Tensor, tp.List[torch.Tensor], tp.Tuple[torch.Tensor]],
524
+ device: tp.Union[torch.device, str],
525
+ ) -> tp.Tuple[torch.Tensor, torch.Tensor]:
526
+ self.pretransform.to(device)
527
+ self.proj_out.to(device)
528
+
529
+ if isinstance(audio, list) or isinstance(audio, tuple):
530
+ audio = torch.cat(audio, dim=0)
531
+
532
+ # Convert audio to pretransform input channels
533
+ audio = set_audio_channels(audio, self.pretransform.io_channels)
534
+
535
+ latents = self.pretransform.encode(audio)
536
+
537
+ latents = self.proj_out(latents)
538
+
539
+ return [
540
+ latents,
541
+ torch.ones(latents.shape[0], latents.shape[2]).to(latents.device),
542
+ ]
543
+
544
+
545
+ class MultiConditioner(nn.Module):
546
+ """
547
+ A module that applies multiple conditioners to an input dictionary based on the keys
548
+
549
+ Args:
550
+ conditioners: a dictionary of conditioners with keys corresponding to the keys of the conditioning input dictionary (e.g. "prompt")
551
+ default_keys: a dictionary of default keys to use if the key is not in the input dictionary (e.g. {"prompt_t5": "prompt"})
552
+ """
553
+
554
+ def __init__(
555
+ self,
556
+ conditioners: tp.Dict[str, Conditioner],
557
+ default_keys: tp.Dict[str, str] = {},
558
+ ):
559
+ super().__init__()
560
+
561
+ self.conditioners = nn.ModuleDict(conditioners)
562
+ self.default_keys = default_keys
563
+
564
+ def forward(
565
+ self,
566
+ batch_metadata: tp.List[tp.Dict[str, tp.Any]],
567
+ device: tp.Union[torch.device, str],
568
+ ) -> tp.Dict[str, tp.Any]:
569
+ output = {}
570
+
571
+ for key, conditioner in self.conditioners.items():
572
+ condition_key = key
573
+
574
+ conditioner_inputs = []
575
+
576
+ for x in batch_metadata:
577
+ if condition_key not in x:
578
+ if condition_key in self.default_keys:
579
+ condition_key = self.default_keys[condition_key]
580
+ else:
581
+ raise ValueError(
582
+ f"Conditioner key {condition_key} not found in batch metadata"
583
+ )
584
+
585
+ # Unwrap the condition info if it's a single-element list or tuple, this is to support collation functions that wrap everything in a list
586
+ if isinstance(x[condition_key], (list, tuple)) and len(x[condition_key]) == 1:
591
+ conditioner_input = x[condition_key][0]
592
+
593
+ else:
594
+ conditioner_input = x[condition_key]
595
+
596
+ conditioner_inputs.append(conditioner_input)
597
+
598
+ output[key] = conditioner(conditioner_inputs, device)
599
+
600
+ return output
601
+
602
+
603
+ def create_multi_conditioner_from_conditioning_config(
604
+ config: tp.Dict[str, tp.Any],
605
+ ) -> MultiConditioner:
606
+ """
607
+ Create a MultiConditioner from a conditioning config dictionary
608
+
609
+ Args:
610
+ config: the conditioning config dictionary
611
+ device: the device to put the conditioners on
612
+ """
613
+ conditioners = {}
614
+ cond_dim = config["cond_dim"]
615
+
616
+ default_keys = config.get("default_keys", {})
617
+
618
+ for conditioner_info in config["configs"]:
619
+ id = conditioner_info["id"]
620
+
621
+ conditioner_type = conditioner_info["type"]
622
+
623
+ conditioner_config = {"output_dim": cond_dim}
624
+
625
+ conditioner_config.update(conditioner_info["config"])
626
+
627
+ if conditioner_type == "t5":
628
+ conditioners[id] = T5Conditioner(**conditioner_config)
629
+ elif conditioner_type == "clap_text":
630
+ conditioners[id] = CLAPTextConditioner(**conditioner_config)
631
+ elif conditioner_type == "clap_audio":
632
+ conditioners[id] = CLAPAudioConditioner(**conditioner_config)
633
+ elif conditioner_type == "int":
634
+ conditioners[id] = IntConditioner(**conditioner_config)
635
+ elif conditioner_type == "number":
636
+ conditioners[id] = NumberConditioner(**conditioner_config)
637
+ elif conditioner_type == "phoneme":
638
+ conditioners[id] = PhonemeConditioner(**conditioner_config)
639
+ elif conditioner_type == "lut":
640
+ conditioners[id] = TokenizerLUTConditioner(**conditioner_config)
641
+ elif conditioner_type == "pretransform":
642
+ sample_rate = conditioner_config.pop("sample_rate", None)
643
+ assert sample_rate is not None, (
644
+ "Sample rate must be specified for pretransform conditioners"
645
+ )
646
+
647
+ pretransform = create_pretransform_from_config(
648
+ conditioner_config.pop("pretransform_config"), sample_rate=sample_rate
649
+ )
650
+
651
+ if conditioner_config.get("pretransform_ckpt_path", None) is not None:
652
+ pretransform.load_state_dict(
653
+ load_ckpt_state_dict(
654
+ conditioner_config.pop("pretransform_ckpt_path")
655
+ )
656
+ )
657
+
658
+ conditioners[id] = PretransformConditioner(
659
+ pretransform, **conditioner_config
660
+ )
661
+ else:
662
+ raise ValueError(f"Unknown conditioner type: {conditioner_type}")
663
+
664
+ return MultiConditioner(conditioners, default_keys=default_keys)
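A sketch of the conditioning config consumed by create_multi_conditioner_from_conditioning_config; the ids, dimensions and ranges are illustrative, but the keys ("cond_dim", "default_keys", "configs", "id", "type", "config") match the parsing logic above:

conditioning_config = {
    "cond_dim": 768,
    "default_keys": {"prompt_t5": "prompt"},
    "configs": [
        {
            "id": "prompt_t5",
            "type": "t5",
            "config": {"t5_model_name": "t5-base", "max_length": 128},
        },
        {
            "id": "seconds_total",
            "type": "number",
            "config": {"min_val": 0, "max_val": 512},
        },
    ],
}

conditioner = create_multi_conditioner_from_conditioning_config(conditioning_config)
# Calling conditioner(batch_metadata, device) then returns a dict of
# (embedding, mask) pairs keyed by the conditioner ids above.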
src/YingMusicSinger/utils/stable_audio_tools/diffusion.py ADDED
@@ -0,0 +1,740 @@
1
+ import typing as tp
2
+ from functools import partial
3
+ from time import time
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ # from ..inference.generation import generate_diffusion_cond
11
+ from .adp import UNet1d, UNetCFG1d
12
+ from .blocks import (
13
+ Downsample1d,
14
+ Downsample1d_2,
15
+ FourierFeatures,
16
+ ResConvBlock,
17
+ SelfAttention1d,
18
+ SkipBlock,
19
+ Upsample1d,
20
+ Upsample1d_2,
21
+ expand_to_planes,
22
+ )
23
+ from .conditioners import (
24
+ MultiConditioner,
25
+ create_multi_conditioner_from_conditioning_config,
26
+ )
27
+ from .dit import DiffusionTransformer
28
+ from .factory import create_pretransform_from_config
29
+ from .pretransforms import Pretransform
30
+
31
+
32
+ class Profiler:
33
+ def __init__(self):
34
+ self.ticks = [[time(), None]]
35
+
36
+ def tick(self, msg):
37
+ self.ticks.append([time(), msg])
38
+
39
+ def __repr__(self):
40
+ rep = 80 * "=" + "\n"
41
+ for i in range(1, len(self.ticks)):
42
+ msg = self.ticks[i][1]
43
+ elapsed = self.ticks[i][0] - self.ticks[i - 1][0]
+ rep += msg + f": {elapsed * 1000:.2f}ms\n"
45
+ rep += 80 * "=" + "\n\n\n"
46
+ return rep
47
+
48
+
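The Profiler above is a simple wall-clock helper; a usage sketch:

p = Profiler()
# ... run the denoiser forward pass ...
p.tick("forward")
# ... compute the loss ...
p.tick("loss")
print(p)   # prints the elapsed milliseconds between consecutive ticks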
49
+ class DiffusionModel(nn.Module):
50
+ def __init__(self, *args, **kwargs):
51
+ super().__init__(*args, **kwargs)
52
+
53
+ def forward(self, x, t, **kwargs):
54
+ raise NotImplementedError()
55
+
56
+
57
+ class DiffusionModelWrapper(nn.Module):
58
+ def __init__(
59
+ self,
60
+ model: DiffusionModel,
61
+ io_channels,
62
+ sample_size,
63
+ sample_rate,
64
+ min_input_length,
65
+ pretransform: tp.Optional[Pretransform] = None,
66
+ ):
67
+ super().__init__()
68
+ self.io_channels = io_channels
69
+ self.sample_size = sample_size
70
+ self.sample_rate = sample_rate
71
+ self.min_input_length = min_input_length
72
+
73
+ self.model = model
74
+
75
+ if pretransform is not None:
76
+ self.pretransform = pretransform
77
+ else:
78
+ self.pretransform = None
79
+
80
+ def forward(self, x, t, **kwargs):
81
+ return self.model(x, t, **kwargs)
82
+
83
+
84
+ class ConditionedDiffusionModel(nn.Module):
85
+ def __init__(
86
+ self,
87
+ *args,
88
+ supports_cross_attention: bool = False,
89
+ supports_input_concat: bool = False,
90
+ supports_global_cond: bool = False,
91
+ supports_prepend_cond: bool = False,
92
+ **kwargs,
93
+ ):
94
+ super().__init__(*args, **kwargs)
95
+ self.supports_cross_attention = supports_cross_attention
96
+ self.supports_input_concat = supports_input_concat
97
+ self.supports_global_cond = supports_global_cond
98
+ self.supports_prepend_cond = supports_prepend_cond
99
+
100
+ def forward(
101
+ self,
102
+ x: torch.Tensor,
103
+ t: torch.Tensor,
104
+ cross_attn_cond: torch.Tensor = None,
105
+ cross_attn_mask: torch.Tensor = None,
106
+ input_concat_cond: torch.Tensor = None,
107
+ global_embed: torch.Tensor = None,
108
+ prepend_cond: torch.Tensor = None,
109
+ prepend_cond_mask: torch.Tensor = None,
110
+ cfg_scale: float = 1.0,
111
+ cfg_dropout_prob: float = 0.0,
112
+ batch_cfg: bool = False,
113
+ rescale_cfg: bool = False,
114
+ **kwargs,
115
+ ):
116
+ raise NotImplementedError()
117
+
118
+
119
+ class ConditionedDiffusionModelWrapper(nn.Module):
120
+ """
121
+ A diffusion model that takes in conditioning
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ model: ConditionedDiffusionModel,
127
+ conditioner: MultiConditioner,
128
+ io_channels,
129
+ sample_rate,
130
+ min_input_length: int,
131
+ diffusion_objective: tp.Literal["v", "rectified_flow"] = "v",
132
+ pretransform: tp.Optional[Pretransform] = None,
133
+ cross_attn_cond_ids: tp.List[str] = [],
134
+ global_cond_ids: tp.List[str] = [],
135
+ input_concat_ids: tp.List[str] = [],
136
+ prepend_cond_ids: tp.List[str] = [],
137
+ ):
138
+ super().__init__()
139
+
140
+ self.model = model
141
+ self.conditioner = conditioner
142
+ self.io_channels = io_channels
143
+ self.sample_rate = sample_rate
144
+ self.diffusion_objective = diffusion_objective
145
+ self.pretransform = pretransform
146
+ self.cross_attn_cond_ids = cross_attn_cond_ids
147
+ self.global_cond_ids = global_cond_ids
148
+ self.input_concat_ids = input_concat_ids
149
+ self.prepend_cond_ids = prepend_cond_ids
150
+ self.min_input_length = min_input_length
151
+
152
+ def get_conditioning_inputs(
153
+ self, conditioning_tensors: tp.Dict[str, tp.Any], negative=False
154
+ ):
155
+ cross_attention_input = None
156
+ cross_attention_masks = None
157
+ global_cond = None
158
+ input_concat_cond = None
159
+ prepend_cond = None
160
+ prepend_cond_mask = None
161
+
162
+ if len(self.cross_attn_cond_ids) > 0:
163
+ # Concatenate all cross-attention inputs over the sequence dimension
164
+ # Assumes that the cross-attention inputs are of shape (batch, seq, channels)
165
+ cross_attention_input = []
166
+ cross_attention_masks = []
167
+
168
+ for key in self.cross_attn_cond_ids:
169
+ cross_attn_in, cross_attn_mask = conditioning_tensors[key]
170
+
171
+ # Add sequence dimension if it's not there
172
+ if len(cross_attn_in.shape) == 2:
173
+ cross_attn_in = cross_attn_in.unsqueeze(1)
174
+ cross_attn_mask = cross_attn_mask.unsqueeze(1)
175
+
176
+ cross_attention_input.append(cross_attn_in)
177
+ cross_attention_masks.append(cross_attn_mask)
178
+
179
+ cross_attention_input = torch.cat(cross_attention_input, dim=1)
180
+ cross_attention_masks = torch.cat(cross_attention_masks, dim=1)
181
+
182
+ if len(self.global_cond_ids) > 0:
183
+ # Concatenate all global conditioning inputs over the channel dimension
184
+ # Assumes that the global conditioning inputs are of shape (batch, channels)
185
+ global_conds = []
186
+ for key in self.global_cond_ids:
187
+ global_cond_input = conditioning_tensors[key][0]
188
+
189
+ global_conds.append(global_cond_input)
190
+
191
+ # Concatenate over the channel dimension
192
+ global_cond = torch.cat(global_conds, dim=-1)
193
+
194
+ if len(global_cond.shape) == 3:
195
+ global_cond = global_cond.squeeze(1)
196
+
197
+ if len(self.input_concat_ids) > 0:
198
+ # Concatenate all input concat conditioning inputs over the channel dimension
199
+ # Assumes that the input concat conditioning inputs are of shape (batch, channels, seq)
200
+ input_concat_cond = torch.cat(
201
+ [conditioning_tensors[key][0] for key in self.input_concat_ids], dim=1
202
+ )
203
+
204
+ if len(self.prepend_cond_ids) > 0:
205
+ # Concatenate all prepend conditioning inputs over the sequence dimension
206
+ # Assumes that the prepend conditioning inputs are of shape (batch, seq, channels)
207
+ prepend_conds = []
208
+ prepend_cond_masks = []
209
+
210
+ for key in self.prepend_cond_ids:
211
+ prepend_cond_input, prepend_cond_mask = conditioning_tensors[key]
212
+ prepend_conds.append(prepend_cond_input)
213
+ prepend_cond_masks.append(prepend_cond_mask)
214
+
215
+ prepend_cond = torch.cat(prepend_conds, dim=1)
216
+ prepend_cond_mask = torch.cat(prepend_cond_masks, dim=1)
217
+
218
+ if negative:
219
+ return {
220
+ "negative_cross_attn_cond": cross_attention_input,
221
+ "negative_cross_attn_mask": cross_attention_masks,
222
+ "negative_global_cond": global_cond,
223
+ "negative_input_concat_cond": input_concat_cond,
224
+ }
225
+ else:
226
+ return {
227
+ "cross_attn_cond": cross_attention_input,
228
+ "cross_attn_mask": cross_attention_masks,
229
+ "global_cond": global_cond,
230
+ "input_concat_cond": input_concat_cond,
231
+ "prepend_cond": prepend_cond,
232
+ "prepend_cond_mask": prepend_cond_mask,
233
+ }
234
+
235
+ def forward(
236
+ self, x: torch.Tensor, t: torch.Tensor, cond: tp.Dict[str, tp.Any], **kwargs
237
+ ):
238
+ return self.model(x, t, **self.get_conditioning_inputs(cond), **kwargs)
239
+
240
+ def generate(self, *args, **kwargs):
241
+ return generate_diffusion_cond(self, *args, **kwargs)
242
+
243
+
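The generate method above delegates to generate_diffusion_cond, whose import is commented out at the top of this file, so sampling has to be wired up by the caller's environment. The conditioning path itself is self-contained; a hypothetical call pattern (keys, shapes and device are illustrative) would be:

import torch

# `wrapper` is an already-built ConditionedDiffusionModelWrapper
metadata = [{"prompt": "warm analog synth pad", "seconds_total": 30}]
cond = wrapper.conditioner(metadata, device="cuda")

x = torch.randn(1, wrapper.io_channels, 1024, device="cuda")   # noisy latents
t = torch.rand(1, device="cuda")                               # diffusion timestep
out = wrapper(x, t, cond=cond)  # routes cond into cross-attn / global / concat / prepend inputs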
244
+ class UNetCFG1DWrapper(ConditionedDiffusionModel):
245
+ def __init__(self, *args, **kwargs):
246
+ super().__init__(
247
+ supports_cross_attention=True,
248
+ supports_global_cond=True,
249
+ supports_input_concat=True,
250
+ )
251
+
252
+ self.model = UNetCFG1d(*args, **kwargs)
253
+
254
+ with torch.no_grad():
255
+ for param in self.model.parameters():
256
+ param *= 0.5
257
+
258
+ def forward(
259
+ self,
260
+ x,
261
+ t,
262
+ cross_attn_cond=None,
263
+ cross_attn_mask=None,
264
+ input_concat_cond=None,
265
+ global_cond=None,
266
+ cfg_scale=1.0,
267
+ cfg_dropout_prob: float = 0.0,
268
+ batch_cfg: bool = False,
269
+ rescale_cfg: bool = False,
270
+ negative_cross_attn_cond=None,
271
+ negative_cross_attn_mask=None,
272
+ negative_global_cond=None,
273
+ negative_input_concat_cond=None,
274
+ prepend_cond=None,
275
+ prepend_cond_mask=None,
276
+ **kwargs,
277
+ ):
278
+ p = Profiler()
279
+
280
+ p.tick("start")
281
+
282
+ channels_list = None
283
+ if input_concat_cond is not None:
284
+ channels_list = [input_concat_cond]
285
+
286
+ outputs = self.model(
287
+ x,
288
+ t,
289
+ embedding=cross_attn_cond,
290
+ embedding_mask=cross_attn_mask,
291
+ features=global_cond,
292
+ channels_list=channels_list,
293
+ embedding_scale=cfg_scale,
294
+ embedding_mask_proba=cfg_dropout_prob,
295
+ batch_cfg=batch_cfg,
296
+ rescale_cfg=rescale_cfg,
297
+ negative_embedding=negative_cross_attn_cond,
298
+ negative_embedding_mask=negative_cross_attn_mask,
299
+ **kwargs,
300
+ )
301
+
302
+ p.tick("UNetCFG1D forward")
303
+
304
+ # print(f"Profiler: {p}")
305
+ return outputs
306
+
307
+
308
+ class UNet1DCondWrapper(ConditionedDiffusionModel):
309
+ def __init__(self, *args, **kwargs):
310
+ super().__init__(
311
+ supports_cross_attention=False,
312
+ supports_global_cond=True,
313
+ supports_input_concat=True,
314
+ )
315
+
316
+ self.model = UNet1d(*args, **kwargs)
317
+
318
+ with torch.no_grad():
319
+ for param in self.model.parameters():
320
+ param *= 0.5
321
+
322
+ def forward(
323
+ self,
324
+ x,
325
+ t,
326
+ input_concat_cond=None,
327
+ global_cond=None,
328
+ cross_attn_cond=None,
329
+ cross_attn_mask=None,
330
+ prepend_cond=None,
331
+ prepend_cond_mask=None,
332
+ cfg_scale=1.0,
333
+ cfg_dropout_prob: float = 0.0,
334
+ batch_cfg: bool = False,
335
+ rescale_cfg: bool = False,
336
+ negative_cross_attn_cond=None,
337
+ negative_cross_attn_mask=None,
338
+ negative_global_cond=None,
339
+ negative_input_concat_cond=None,
340
+ **kwargs,
341
+ ):
342
+ channels_list = None
343
+ if input_concat_cond is not None:
344
+ # Interpolate input_concat_cond to the same length as x
345
+ if input_concat_cond.shape[2] != x.shape[2]:
346
+ input_concat_cond = F.interpolate(
347
+ input_concat_cond, (x.shape[2],), mode="nearest"
348
+ )
349
+
350
+ channels_list = [input_concat_cond]
351
+
352
+ outputs = self.model(
353
+ x, t, features=global_cond, channels_list=channels_list, **kwargs
354
+ )
355
+
356
+ return outputs
357
+
358
+
359
+ class UNet1DUncondWrapper(DiffusionModel):
360
+ def __init__(self, in_channels, *args, **kwargs):
361
+ super().__init__()
362
+
363
+ self.model = UNet1d(in_channels=in_channels, *args, **kwargs)
364
+
365
+ self.io_channels = in_channels
366
+
367
+ with torch.no_grad():
368
+ for param in self.model.parameters():
369
+ param *= 0.5
370
+
371
+ def forward(self, x, t, **kwargs):
372
+ return self.model(x, t, **kwargs)
373
+
374
+
375
+ class DAU1DCondWrapper(ConditionedDiffusionModel):
376
+ def __init__(self, *args, **kwargs):
377
+ super().__init__(
378
+ supports_cross_attention=False,
379
+ supports_global_cond=False,
380
+ supports_input_concat=True,
381
+ )
382
+
383
+ self.model = DiffusionAttnUnet1D(*args, **kwargs)
384
+
385
+ with torch.no_grad():
386
+ for param in self.model.parameters():
387
+ param *= 0.5
388
+
389
+ def forward(
390
+ self,
391
+ x,
392
+ t,
393
+ input_concat_cond=None,
394
+ cross_attn_cond=None,
395
+ cross_attn_mask=None,
396
+ global_cond=None,
397
+ cfg_scale=1.0,
398
+ cfg_dropout_prob: float = 0.0,
399
+ batch_cfg: bool = False,
400
+ rescale_cfg: bool = False,
401
+ negative_cross_attn_cond=None,
402
+ negative_cross_attn_mask=None,
403
+ negative_global_cond=None,
404
+ negative_input_concat_cond=None,
405
+ prepend_cond=None,
406
+ **kwargs,
407
+ ):
408
+ return self.model(x, t, cond=input_concat_cond)
409
+
410
+
411
+ class DiffusionAttnUnet1D(nn.Module):
412
+ def __init__(
413
+ self,
414
+ io_channels=2,
415
+ depth=14,
416
+ n_attn_layers=6,
417
+ channels=[128, 128, 256, 256] + [512] * 10,
418
+ cond_dim=0,
419
+ cond_noise_aug=False,
420
+ kernel_size=5,
421
+ learned_resample=False,
422
+ strides=[2] * 13,
423
+ conv_bias=True,
424
+ use_snake=False,
425
+ ):
426
+ super().__init__()
427
+
428
+ self.cond_noise_aug = cond_noise_aug
429
+
430
+ self.io_channels = io_channels
431
+
432
+ if self.cond_noise_aug:
433
+ self.rng = torch.quasirandom.SobolEngine(1, scramble=True)
434
+
435
+ self.timestep_embed = FourierFeatures(1, 16)
436
+
437
+ attn_layer = depth - n_attn_layers
438
+
439
+ strides = [1] + strides
440
+
441
+ block = nn.Identity()
442
+
443
+ conv_block = partial(
444
+ ResConvBlock,
445
+ kernel_size=kernel_size,
446
+ conv_bias=conv_bias,
447
+ use_snake=use_snake,
448
+ )
449
+
450
+ for i in range(depth, 0, -1):
451
+ c = channels[i - 1]
452
+ stride = strides[i - 1]
453
+ if stride > 2 and not learned_resample:
454
+ raise ValueError("Must have stride 2 without learned resampling")
455
+
456
+ if i > 1:
457
+ c_prev = channels[i - 2]
458
+ add_attn = i >= attn_layer and n_attn_layers > 0
459
+ block = SkipBlock(
460
+ Downsample1d_2(c_prev, c_prev, stride)
461
+ if (learned_resample or stride == 1)
462
+ else Downsample1d("cubic"),
463
+ conv_block(c_prev, c, c),
464
+ SelfAttention1d(c, c // 32) if add_attn else nn.Identity(),
465
+ conv_block(c, c, c),
466
+ SelfAttention1d(c, c // 32) if add_attn else nn.Identity(),
467
+ conv_block(c, c, c),
468
+ SelfAttention1d(c, c // 32) if add_attn else nn.Identity(),
469
+ block,
470
+ conv_block(c * 2 if i != depth else c, c, c),
471
+ SelfAttention1d(c, c // 32) if add_attn else nn.Identity(),
472
+ conv_block(c, c, c),
473
+ SelfAttention1d(c, c // 32) if add_attn else nn.Identity(),
474
+ conv_block(c, c, c_prev),
475
+ SelfAttention1d(c_prev, c_prev // 32)
476
+ if add_attn
477
+ else nn.Identity(),
478
+ Upsample1d_2(c_prev, c_prev, stride)
479
+ if learned_resample
480
+ else Upsample1d(kernel="cubic"),
481
+ )
482
+ else:
483
+ cond_embed_dim = 16 if not self.cond_noise_aug else 32
484
+ block = nn.Sequential(
485
+ conv_block((io_channels + cond_dim) + cond_embed_dim, c, c),
486
+ conv_block(c, c, c),
487
+ conv_block(c, c, c),
488
+ block,
489
+ conv_block(c * 2, c, c),
490
+ conv_block(c, c, c),
491
+ conv_block(c, c, io_channels, is_last=True),
492
+ )
493
+ self.net = block
494
+
495
+ with torch.no_grad():
496
+ for param in self.net.parameters():
497
+ param *= 0.5
498
+
499
+ def forward(self, x, t, cond=None, cond_aug_scale=None):
500
+ timestep_embed = expand_to_planes(self.timestep_embed(t[:, None]), x.shape)
501
+
502
+ inputs = [x, timestep_embed]
503
+
504
+ if cond is not None:
505
+ if cond.shape[2] != x.shape[2]:
506
+ cond = F.interpolate(
507
+ cond, (x.shape[2],), mode="linear", align_corners=False
508
+ )
509
+
510
+ if self.cond_noise_aug:
511
+ # Draw a quasi-random augmentation level between 0 and 1 from the Sobol sequence
512
+ if cond_aug_scale is None:
513
+ aug_level = self.rng.draw(cond.shape[0])[:, 0].to(cond)
514
+ else:
515
+ aug_level = (
516
+ torch.tensor([cond_aug_scale]).repeat([cond.shape[0]]).to(cond)
517
+ )
518
+
519
+ # Add noise to the conditioning signal
520
+ cond = cond + torch.randn_like(cond) * aug_level[:, None, None]
521
+
522
+ # Get embedding for the noise cond level, reusing timestep_embed
523
+ aug_level_embed = expand_to_planes(
524
+ self.timestep_embed(aug_level[:, None]), x.shape
525
+ )
526
+
527
+ inputs.append(aug_level_embed)
528
+
529
+ inputs.append(cond)
530
+
531
+ outputs = self.net(torch.cat(inputs, dim=1))
532
+
533
+ return outputs
534
+
535
+
536
+ class DiTWrapper(ConditionedDiffusionModel):
537
+ def __init__(self, *args, **kwargs):
538
+ super().__init__(
539
+ supports_cross_attention=True,
540
+ supports_global_cond=False,
541
+ supports_input_concat=False,
542
+ )
543
+
544
+ self.model = DiffusionTransformer(*args, **kwargs)
545
+
546
+ with torch.no_grad():
547
+ for param in self.model.parameters():
548
+ param *= 0.5
549
+
550
+ def forward(
551
+ self,
552
+ x,
553
+ t,
554
+ cross_attn_cond=None,
555
+ cross_attn_mask=None,
556
+ negative_cross_attn_cond=None,
557
+ negative_cross_attn_mask=None,
558
+ input_concat_cond=None,
559
+ negative_input_concat_cond=None,
560
+ global_cond=None,
561
+ negative_global_cond=None,
562
+ prepend_cond=None,
563
+ prepend_cond_mask=None,
564
+ cfg_scale=1.0,
565
+ cfg_dropout_prob: float = 0.0,
566
+ batch_cfg: bool = True,
567
+ rescale_cfg: bool = False,
568
+ scale_phi: float = 0.0,
569
+ **kwargs,
570
+ ):
571
+ assert batch_cfg, "batch_cfg must be True for DiTWrapper"
572
+ # assert negative_input_concat_cond is None, "negative_input_concat_cond is not supported for DiTWrapper"
573
+
574
+ return self.model(
575
+ x,
576
+ t,
577
+ cross_attn_cond=cross_attn_cond,
578
+ cross_attn_cond_mask=cross_attn_mask,
579
+ negative_cross_attn_cond=negative_cross_attn_cond,
580
+ negative_cross_attn_mask=negative_cross_attn_mask,
581
+ input_concat_cond=input_concat_cond,
582
+ prepend_cond=prepend_cond,
583
+ prepend_cond_mask=prepend_cond_mask,
584
+ cfg_scale=cfg_scale,
585
+ cfg_dropout_prob=cfg_dropout_prob,
586
+ scale_phi=scale_phi,
587
+ global_embed=global_cond,
588
+ **kwargs,
589
+ )
590
+
591
+
592
+ class DiTUncondWrapper(DiffusionModel):
593
+ def __init__(self, in_channels, *args, **kwargs):
594
+ super().__init__()
595
+
596
+ self.model = DiffusionTransformer(io_channels=in_channels, *args, **kwargs)
597
+
598
+ self.io_channels = in_channels
599
+
600
+ with torch.no_grad():
601
+ for param in self.model.parameters():
602
+ param *= 0.5
603
+
604
+ def forward(self, x, t, **kwargs):
605
+ return self.model(x, t, **kwargs)
606
+
607
+
608
+ def create_diffusion_uncond_from_config(config: tp.Dict[str, tp.Any]):
609
+ diffusion_uncond_config = config["model"]
610
+
611
+ model_type = diffusion_uncond_config.get("type", None)
612
+
613
+ diffusion_config = diffusion_uncond_config.get("config", {})
614
+
615
+ assert model_type is not None, "Must specify model type in config"
616
+
617
+ pretransform = diffusion_uncond_config.get("pretransform", None)
618
+
619
+ sample_size = config.get("sample_size", None)
620
+ assert sample_size is not None, "Must specify sample size in config"
621
+
622
+ sample_rate = config.get("sample_rate", None)
623
+ assert sample_rate is not None, "Must specify sample rate in config"
624
+
625
+ if pretransform is not None:
626
+ pretransform = create_pretransform_from_config(pretransform, sample_rate)
627
+ min_input_length = pretransform.downsampling_ratio
628
+ else:
629
+ min_input_length = 1
630
+
631
+ if model_type == "DAU1d":
632
+ model = DiffusionAttnUnet1D(**diffusion_config)
633
+
634
+ elif model_type == "adp_uncond_1d":
635
+ model = UNet1DUncondWrapper(**diffusion_config)
636
+
637
+ elif model_type == "dit":
638
+ model = DiTUncondWrapper(**diffusion_config)
639
+
640
+ else:
641
+ raise NotImplementedError(f"Unknown model type: {model_type}")
642
+
643
+ return DiffusionModelWrapper(
644
+ model,
645
+ io_channels=model.io_channels,
646
+ sample_size=sample_size,
647
+ sample_rate=sample_rate,
648
+ pretransform=pretransform,
649
+ min_input_length=min_input_length,
650
+ )
651
+
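+ # For reference, create_diffusion_uncond_from_config above expects a dict shaped
+ # roughly like the following (hypothetical illustrative values, not from this repo):
+ #   {
+ #       "sample_size": 65536,
+ #       "sample_rate": 44100,
+ #       "model": {
+ #           "type": "dit",               # "DAU1d", "adp_uncond_1d", or "dit"
+ #           "config": {"in_channels": 64},
+ #           "pretransform": None,        # or a pretransform config dict
+ #       },
+ #   }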
652
+
653
+ def create_diffusion_cond_from_config(config: tp.Dict[str, tp.Any]):
654
+ model_config = config["model"]
655
+
656
+ model_type = config["model_type"]
657
+
658
+ diffusion_config = model_config.get("diffusion", None)
659
+ assert diffusion_config is not None, "Must specify diffusion config"
660
+
661
+ diffusion_model_type = diffusion_config.get("type", None)
662
+ assert diffusion_model_type is not None, "Must specify diffusion model type"
663
+
664
+ diffusion_model_config = diffusion_config.get("config", None)
665
+ assert diffusion_model_config is not None, "Must specify diffusion model config"
666
+
667
+ if diffusion_model_type == "adp_cfg_1d":
668
+ diffusion_model = UNetCFG1DWrapper(**diffusion_model_config)
669
+ elif diffusion_model_type == "adp_1d":
670
+ diffusion_model = UNet1DCondWrapper(**diffusion_model_config)
671
+ elif diffusion_model_type == "dit":
672
+ diffusion_model = DiTWrapper(**diffusion_model_config)
673
+
674
+ io_channels = model_config.get("io_channels", None)
675
+ assert io_channels is not None, "Must specify io_channels in model config"
676
+
677
+ sample_rate = config.get("sample_rate", None)
678
+ assert sample_rate is not None, "Must specify sample_rate in config"
679
+
680
+ diffusion_objective = diffusion_config.get("diffusion_objective", "v")
681
+
682
+ conditioning_config = model_config.get("conditioning", None)
683
+
684
+ conditioner = None
685
+ if conditioning_config is not None:
686
+ conditioner = create_multi_conditioner_from_conditioning_config(
687
+ conditioning_config
688
+ )
689
+
690
+ cross_attention_ids = diffusion_config.get("cross_attention_cond_ids", [])
691
+ global_cond_ids = diffusion_config.get("global_cond_ids", [])
692
+ input_concat_ids = diffusion_config.get("input_concat_ids", [])
693
+ prepend_cond_ids = diffusion_config.get("prepend_cond_ids", [])
694
+
695
+ pretransform = model_config.get("pretransform", None)
696
+
697
+ if pretransform is not None:
698
+ pretransform = create_pretransform_from_config(pretransform, sample_rate)
699
+ min_input_length = pretransform.downsampling_ratio
700
+ else:
701
+ min_input_length = 1
702
+
703
+ if diffusion_model_type == "adp_cfg_1d" or diffusion_model_type == "adp_1d":
704
+ min_input_length *= np.prod(diffusion_model_config["factors"])
705
+ elif diffusion_model_type == "dit":
706
+ min_input_length *= diffusion_model.model.patch_size
707
+
708
+ # Get the proper wrapper class
709
+
710
+ extra_kwargs = {}
711
+
712
+ if model_type == "diffusion_cond" or model_type == "diffusion_cond_inpaint":
713
+ wrapper_fn = ConditionedDiffusionModelWrapper
714
+
715
+ extra_kwargs["diffusion_objective"] = diffusion_objective
716
+
717
+ elif model_type == "diffusion_prior":
718
+ prior_type = model_config.get("prior_type", None)
719
+ assert prior_type is not None, (
720
+ "Must specify prior_type in diffusion prior model config"
721
+ )
722
+
723
+ if prior_type == "mono_stereo":
724
+ from .diffusion_prior import MonoToStereoDiffusionPrior
725
+
726
+ wrapper_fn = MonoToStereoDiffusionPrior
727
+
728
+ return wrapper_fn(
729
+ diffusion_model,
730
+ conditioner,
731
+ min_input_length=min_input_length,
732
+ sample_rate=sample_rate,
733
+ cross_attn_cond_ids=cross_attention_ids,
734
+ global_cond_ids=global_cond_ids,
735
+ input_concat_ids=input_concat_ids,
736
+ prepend_cond_ids=prepend_cond_ids,
737
+ pretransform=pretransform,
738
+ io_channels=io_channels,
739
+ **extra_kwargs,
740
+ )
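For reference, a minimal sketch of the config dict that create_diffusion_cond_from_config above reads. Only the key names come from the lookups in the function; all values (including the conditioner id "prompt") are hypothetical illustrations.

# Hypothetical example values; only the key names mirror the function above.
example_cond_config = {
    "model_type": "diffusion_cond",
    "sample_rate": 44100,
    "model": {
        "io_channels": 64,
        "pretransform": None,   # or a pretransform config dict
        "conditioning": None,   # or a config for create_multi_conditioner_from_conditioning_config
        "diffusion": {
            "type": "dit",      # "adp_cfg_1d", "adp_1d", or "dit"
            "diffusion_objective": "v",
            "cross_attention_cond_ids": ["prompt"],
            "global_cond_ids": [],
            "input_concat_ids": [],
            "prepend_cond_ids": [],
            "config": {"io_channels": 64, "embed_dim": 768, "depth": 12, "num_heads": 8},
        },
    },
}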
src/YingMusicSinger/utils/stable_audio_tools/dit.py ADDED
@@ -0,0 +1,451 @@
1
+ import typing as tp
2
+
3
+ import torch
4
+ from einops import rearrange
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+ from x_transformers import ContinuousTransformerWrapper, Encoder
8
+
9
+ from .blocks import FourierFeatures
10
+ from .transformer import ContinuousTransformer
11
+
12
+
13
+ class DiffusionTransformer(nn.Module):
14
+ def __init__(
15
+ self,
16
+ io_channels=32,
17
+ patch_size=1,
18
+ embed_dim=768,
19
+ cond_token_dim=0,
20
+ project_cond_tokens=True,
21
+ global_cond_dim=0,
22
+ project_global_cond=True,
23
+ input_concat_dim=0,
24
+ prepend_cond_dim=0,
25
+ depth=12,
26
+ num_heads=8,
27
+ transformer_type: tp.Literal[
28
+ "x-transformers", "continuous_transformer"
29
+ ] = "x-transformers",
30
+ global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
31
+ **kwargs,
32
+ ):
33
+ super().__init__()
34
+
35
+ self.cond_token_dim = cond_token_dim
36
+
37
+ # Timestep embeddings
38
+ timestep_features_dim = 256
39
+
40
+ self.timestep_features = FourierFeatures(1, timestep_features_dim)
41
+
42
+ self.to_timestep_embed = nn.Sequential(
43
+ nn.Linear(timestep_features_dim, embed_dim, bias=True),
44
+ nn.SiLU(),
45
+ nn.Linear(embed_dim, embed_dim, bias=True),
46
+ )
47
+
48
+ if cond_token_dim > 0:
49
+ # Conditioning tokens
50
+
51
+ cond_embed_dim = cond_token_dim if not project_cond_tokens else embed_dim
52
+ self.to_cond_embed = nn.Sequential(
53
+ nn.Linear(cond_token_dim, cond_embed_dim, bias=False),
54
+ nn.SiLU(),
55
+ nn.Linear(cond_embed_dim, cond_embed_dim, bias=False),
56
+ )
57
+ else:
58
+ cond_embed_dim = 0
59
+
60
+ if global_cond_dim > 0:
61
+ # Global conditioning
62
+ global_embed_dim = global_cond_dim if not project_global_cond else embed_dim
63
+ self.to_global_embed = nn.Sequential(
64
+ nn.Linear(global_cond_dim, global_embed_dim, bias=False),
65
+ nn.SiLU(),
66
+ nn.Linear(global_embed_dim, global_embed_dim, bias=False),
67
+ )
68
+
69
+ if prepend_cond_dim > 0:
70
+ # Prepend conditioning
71
+ self.to_prepend_embed = nn.Sequential(
72
+ nn.Linear(prepend_cond_dim, embed_dim, bias=False),
73
+ nn.SiLU(),
74
+ nn.Linear(embed_dim, embed_dim, bias=False),
75
+ )
76
+
77
+ self.input_concat_dim = input_concat_dim
78
+
79
+ dim_in = io_channels + self.input_concat_dim
80
+
81
+ self.patch_size = patch_size
82
+
83
+ # Transformer
84
+
85
+ self.transformer_type = transformer_type
86
+
87
+ self.global_cond_type = global_cond_type
88
+
89
+ if self.transformer_type == "x-transformers":
90
+ self.transformer = ContinuousTransformerWrapper(
91
+ dim_in=dim_in * patch_size,
92
+ dim_out=io_channels * patch_size,
93
+ max_seq_len=0, # Not relevant without absolute positional embeds
94
+ attn_layers=Encoder(
95
+ dim=embed_dim,
96
+ depth=depth,
97
+ heads=num_heads,
98
+ attn_flash=True,
99
+ cross_attend=cond_token_dim > 0,
100
+ dim_context=None if cond_embed_dim == 0 else cond_embed_dim,
101
+ zero_init_branch_output=True,
102
+ use_abs_pos_emb=False,
103
+ rotary_pos_emb=True,
104
+ ff_swish=True,
105
+ ff_glu=True,
106
+ **kwargs,
107
+ ),
108
+ )
109
+
110
+ elif self.transformer_type == "continuous_transformer":
111
+ global_dim = None
112
+
113
+ if self.global_cond_type == "adaLN":
114
+ # The global conditioning is projected to the embed_dim already at this point
115
+ global_dim = embed_dim
116
+
117
+ self.transformer = ContinuousTransformer(
118
+ dim=embed_dim,
119
+ depth=depth,
120
+ dim_heads=embed_dim // num_heads,
121
+ dim_in=dim_in * patch_size,
122
+ dim_out=io_channels * patch_size,
123
+ cross_attend=cond_token_dim > 0,
124
+ cond_token_dim=cond_embed_dim,
125
+ global_cond_dim=global_dim,
126
+ **kwargs,
127
+ )
128
+
129
+ else:
130
+ raise ValueError(f"Unknown transformer type: {self.transformer_type}")
131
+
132
+ self.preprocess_conv = nn.Conv1d(dim_in, dim_in, 1, bias=False)
133
+ nn.init.zeros_(self.preprocess_conv.weight)
134
+ self.postprocess_conv = nn.Conv1d(io_channels, io_channels, 1, bias=False)
135
+ nn.init.zeros_(self.postprocess_conv.weight)
136
+
137
+ def _forward(
138
+ self,
139
+ x,
140
+ t,
141
+ mask=None,
142
+ cross_attn_cond=None,
143
+ cross_attn_cond_mask=None,
144
+ input_concat_cond=None,
145
+ global_embed=None,
146
+ prepend_cond=None,
147
+ prepend_cond_mask=None,
148
+ return_info=False,
149
+ **kwargs,
150
+ ):
151
+ if cross_attn_cond is not None:
152
+ cross_attn_cond = self.to_cond_embed(cross_attn_cond)
153
+
154
+ if global_embed is not None:
155
+ # Project the global conditioning to the embedding dimension
156
+ global_embed = self.to_global_embed(global_embed)
157
+
158
+ prepend_inputs = None
159
+ prepend_mask = None
160
+ prepend_length = 0
161
+ if prepend_cond is not None:
162
+ # Project the prepend conditioning to the embedding dimension
163
+ prepend_cond = self.to_prepend_embed(prepend_cond)
164
+
165
+ prepend_inputs = prepend_cond
166
+ if prepend_cond_mask is not None:
167
+ prepend_mask = prepend_cond_mask
168
+
169
+ if input_concat_cond is not None:
170
+ # Interpolate input_concat_cond to the same length as x
171
+ if input_concat_cond.shape[2] != x.shape[2]:
172
+ input_concat_cond = F.interpolate(
173
+ input_concat_cond, (x.shape[2],), mode="nearest"
174
+ )
175
+
176
+ x = torch.cat([x, input_concat_cond], dim=1)
177
+
178
+ # Get the batch of timestep embeddings
179
+ timestep_embed = self.to_timestep_embed(
180
+ self.timestep_features(t[:, None])
181
+ ) # (b, embed_dim)
182
+
183
+ # Timestep embedding is considered a global embedding. Add to the global conditioning if it exists
184
+ if global_embed is not None:
185
+ global_embed = global_embed + timestep_embed
186
+ else:
187
+ global_embed = timestep_embed
188
+
189
+ # Add the global_embed to the prepend inputs if there is no global conditioning support in the transformer
190
+ if self.global_cond_type == "prepend":
191
+ if prepend_inputs is None:
192
+ # Prepend inputs are just the global embed, and the mask is all ones
193
+ prepend_inputs = global_embed.unsqueeze(1)
194
+ prepend_mask = torch.ones(
195
+ (x.shape[0], 1), device=x.device, dtype=torch.bool
196
+ )
197
+ else:
198
+ # Prepend inputs are the prepend conditioning + the global embed
199
+ prepend_inputs = torch.cat(
200
+ [prepend_inputs, global_embed.unsqueeze(1)], dim=1
201
+ )
202
+ prepend_mask = torch.cat(
203
+ [
204
+ prepend_mask,
205
+ torch.ones((x.shape[0], 1), device=x.device, dtype=torch.bool),
206
+ ],
207
+ dim=1,
208
+ )
209
+
210
+ prepend_length = prepend_inputs.shape[1]
211
+
212
+ x = self.preprocess_conv(x) + x
213
+
214
+ x = rearrange(x, "b c t -> b t c")
215
+
216
+ extra_args = {}
217
+
218
+ if self.global_cond_type == "adaLN":
219
+ extra_args["global_cond"] = global_embed
220
+
221
+ if self.patch_size > 1:
222
+ x = rearrange(x, "b (t p) c -> b t (c p)", p=self.patch_size)
223
+
224
+ if self.transformer_type == "x-transformers":
225
+ output = self.transformer(
226
+ x,
227
+ prepend_embeds=prepend_inputs,
228
+ context=cross_attn_cond,
229
+ context_mask=cross_attn_cond_mask,
230
+ mask=mask,
231
+ prepend_mask=prepend_mask,
232
+ **extra_args,
233
+ **kwargs,
234
+ )
235
+ elif self.transformer_type == "continuous_transformer":
236
+ output = self.transformer(
237
+ x,
238
+ prepend_embeds=prepend_inputs,
239
+ context=cross_attn_cond,
240
+ context_mask=cross_attn_cond_mask,
241
+ mask=mask,
242
+ prepend_mask=prepend_mask,
243
+ return_info=return_info,
244
+ **extra_args,
245
+ **kwargs,
246
+ )
247
+
248
+ if return_info:
249
+ output, info = output
250
+ elif self.transformer_type == "mm_transformer":
251
+ output = self.transformer(
252
+ x,
253
+ context=cross_attn_cond,
254
+ mask=mask,
255
+ context_mask=cross_attn_cond_mask,
256
+ **extra_args,
257
+ **kwargs,
258
+ )
259
+
260
+ output = rearrange(output, "b t c -> b c t")[:, :, prepend_length:]
261
+
262
+ if self.patch_size > 1:
263
+ output = rearrange(output, "b (c p) t -> b c (t p)", p=self.patch_size)
264
+
265
+ output = self.postprocess_conv(output) + output
266
+
267
+ if return_info:
268
+ return output, info
269
+
270
+ return output
271
+
272
+ def forward(
273
+ self,
274
+ x,
275
+ t,
276
+ cross_attn_cond=None,
277
+ cross_attn_cond_mask=None,
278
+ negative_cross_attn_cond=None,
279
+ negative_cross_attn_mask=None,
280
+ input_concat_cond=None,
281
+ global_embed=None,
282
+ negative_global_embed=None,
283
+ prepend_cond=None,
284
+ prepend_cond_mask=None,
285
+ cfg_scale=1.0,
286
+ cfg_dropout_prob=0.0,
287
+ causal=False,
288
+ scale_phi=0.0,
289
+ mask=None,
290
+ return_info=False,
291
+ **kwargs,
292
+ ):
293
+ assert causal == False, "Causal mode is not supported for DiffusionTransformer"
294
+
295
+ if cross_attn_cond_mask is not None:
296
+ cross_attn_cond_mask = cross_attn_cond_mask.bool()
297
+
298
+ cross_attn_cond_mask = None # Temporarily disabling conditioning masks due to kernel issue for flash attention
299
+
300
+ if prepend_cond_mask is not None:
301
+ prepend_cond_mask = prepend_cond_mask.bool()
302
+
303
+ # CFG dropout
304
+ if cfg_dropout_prob > 0.0:
305
+ if cross_attn_cond is not None:
306
+ null_embed = torch.zeros_like(
307
+ cross_attn_cond, device=cross_attn_cond.device
308
+ )
309
+ dropout_mask = torch.bernoulli(
310
+ torch.full(
311
+ (cross_attn_cond.shape[0], 1, 1),
312
+ cfg_dropout_prob,
313
+ device=cross_attn_cond.device,
314
+ )
315
+ ).to(torch.bool)
316
+ cross_attn_cond = torch.where(dropout_mask, null_embed, cross_attn_cond)
317
+
318
+ if prepend_cond is not None:
319
+ null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device)
320
+ dropout_mask = torch.bernoulli(
321
+ torch.full(
322
+ (prepend_cond.shape[0], 1, 1),
323
+ cfg_dropout_prob,
324
+ device=prepend_cond.device,
325
+ )
326
+ ).to(torch.bool)
327
+ prepend_cond = torch.where(dropout_mask, null_embed, prepend_cond)
328
+
329
+ if cfg_scale != 1.0 and (
330
+ cross_attn_cond is not None or prepend_cond is not None
331
+ ):
332
+ # Classifier-free guidance
333
+ # Concatenate conditioned and unconditioned inputs on the batch dimension
334
+ batch_inputs = torch.cat([x, x], dim=0)
335
+ batch_timestep = torch.cat([t, t], dim=0)
336
+
337
+ if global_embed is not None:
338
+ batch_global_cond = torch.cat([global_embed, global_embed], dim=0)
339
+ else:
340
+ batch_global_cond = None
341
+
342
+ if input_concat_cond is not None:
343
+ batch_input_concat_cond = torch.cat(
344
+ [input_concat_cond, input_concat_cond], dim=0
345
+ )
346
+ else:
347
+ batch_input_concat_cond = None
348
+
349
+ batch_cond = None
350
+ batch_cond_masks = None
351
+
352
+ # Handle CFG for cross-attention conditioning
353
+ if cross_attn_cond is not None:
354
+ null_embed = torch.zeros_like(
355
+ cross_attn_cond, device=cross_attn_cond.device
356
+ )
357
+
358
+ # For negative cross-attention conditioning, replace the null embed with the negative cross-attention conditioning
359
+ if negative_cross_attn_cond is not None:
360
+ # If there's a negative cross-attention mask, set the masked tokens to the null embed
361
+ if negative_cross_attn_mask is not None:
362
+ negative_cross_attn_mask = negative_cross_attn_mask.to(
363
+ torch.bool
364
+ ).unsqueeze(2)
365
+
366
+ negative_cross_attn_cond = torch.where(
367
+ negative_cross_attn_mask,
368
+ negative_cross_attn_cond,
369
+ null_embed,
370
+ )
371
+
372
+ batch_cond = torch.cat(
373
+ [cross_attn_cond, negative_cross_attn_cond], dim=0
374
+ )
375
+
376
+ else:
377
+ batch_cond = torch.cat([cross_attn_cond, null_embed], dim=0)
378
+
379
+ if cross_attn_cond_mask is not None:
380
+ batch_cond_masks = torch.cat(
381
+ [cross_attn_cond_mask, cross_attn_cond_mask], dim=0
382
+ )
383
+
384
+ batch_prepend_cond = None
385
+ batch_prepend_cond_mask = None
386
+
387
+ if prepend_cond is not None:
388
+ null_embed = torch.zeros_like(prepend_cond, device=prepend_cond.device)
389
+
390
+ batch_prepend_cond = torch.cat([prepend_cond, null_embed], dim=0)
391
+
392
+ if prepend_cond_mask is not None:
393
+ batch_prepend_cond_mask = torch.cat(
394
+ [prepend_cond_mask, prepend_cond_mask], dim=0
395
+ )
396
+
397
+ if mask is not None:
398
+ batch_masks = torch.cat([mask, mask], dim=0)
399
+ else:
400
+ batch_masks = None
401
+
402
+ batch_output = self._forward(
403
+ batch_inputs,
404
+ batch_timestep,
405
+ cross_attn_cond=batch_cond,
406
+ cross_attn_cond_mask=batch_cond_masks,
407
+ mask=batch_masks,
408
+ input_concat_cond=batch_input_concat_cond,
409
+ global_embed=batch_global_cond,
410
+ prepend_cond=batch_prepend_cond,
411
+ prepend_cond_mask=batch_prepend_cond_mask,
412
+ return_info=return_info,
413
+ **kwargs,
414
+ )
415
+
416
+ if return_info:
417
+ batch_output, info = batch_output
418
+
419
+ cond_output, uncond_output = torch.chunk(batch_output, 2, dim=0)
420
+ cfg_output = uncond_output + (cond_output - uncond_output) * cfg_scale
421
+
422
+ # CFG Rescale
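+ # Equivalent formula (std taken over the channel dim, dim=1):
+ #   output = scale_phi * cfg_output * std(cond_output) / std(cfg_output)
+ #            + (1 - scale_phi) * cfg_output
+ # i.e. the guided output is partially rescaled back toward the standard deviation
+ # of the conditioned prediction.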
423
+ if scale_phi != 0.0:
424
+ cond_out_std = cond_output.std(dim=1, keepdim=True)
425
+ out_cfg_std = cfg_output.std(dim=1, keepdim=True)
426
+ output = (
427
+ scale_phi * (cfg_output * (cond_out_std / out_cfg_std))
428
+ + (1 - scale_phi) * cfg_output
429
+ )
430
+ else:
431
+ output = cfg_output
432
+
433
+ if return_info:
434
+ return output, info
435
+
436
+ return output
437
+
438
+ else:
439
+ return self._forward(
440
+ x,
441
+ t,
442
+ cross_attn_cond=cross_attn_cond,
443
+ cross_attn_cond_mask=cross_attn_cond_mask,
444
+ input_concat_cond=input_concat_cond,
445
+ global_embed=global_embed,
446
+ prepend_cond=prepend_cond,
447
+ prepend_cond_mask=prepend_cond_mask,
448
+ mask=mask,
449
+ return_info=return_info,
450
+ **kwargs,
451
+ )
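For reference, the classifier-free guidance branch in DiffusionTransformer.forward above reduces to the following arithmetic. This is a minimal standalone sketch with a hypothetical model_fn, not code from this repository:

import torch

def cfg_step(model_fn, x, t, cond, null_cond, cfg_scale):
    # Double the batch so the conditioned and unconditioned passes run together,
    # as in DiffusionTransformer.forward when cfg_scale != 1.0.
    out = model_fn(torch.cat([x, x]), torch.cat([t, t]), torch.cat([cond, null_cond]))
    cond_out, uncond_out = torch.chunk(out, 2, dim=0)
    # Linear guidance away from the unconditioned prediction.
    return uncond_out + (cond_out - uncond_out) * cfg_scale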
src/YingMusicSinger/utils/stable_audio_tools/factory.py ADDED
@@ -0,0 +1,185 @@
1
+ import json
2
+
3
+
4
+ def create_model_from_config(model_config):
5
+ model_type = model_config.get("model_type", None)
6
+
7
+ assert model_type is not None, "model_type must be specified in model config"
8
+
9
+ if model_type == "autoencoder":
10
+ from .autoencoders import create_autoencoder_from_config
11
+
12
+ return create_autoencoder_from_config(model_config)
13
+ elif model_type == "diffusion_uncond":
14
+ from .diffusion import create_diffusion_uncond_from_config
15
+
16
+ return create_diffusion_uncond_from_config(model_config)
17
+ elif (
18
+ model_type == "diffusion_cond"
19
+ or model_type == "diffusion_cond_inpaint"
20
+ or model_type == "diffusion_prior"
21
+ ):
22
+ from .diffusion import create_diffusion_cond_from_config
23
+
24
+ return create_diffusion_cond_from_config(model_config)
25
+ elif model_type == "diffusion_autoencoder":
26
+ from .autoencoders import create_diffAE_from_config
27
+
28
+ return create_diffAE_from_config(model_config)
29
+ elif model_type == "lm":
30
+ from .lm import create_audio_lm_from_config
31
+
32
+ return create_audio_lm_from_config(model_config)
33
+ else:
34
+ raise NotImplementedError(f"Unknown model type: {model_type}")
35
+
36
+
37
+ def create_model_from_config_path(model_config_path):
38
+ with open(model_config_path) as f:
39
+ model_config = json.load(f)
40
+
41
+ return create_model_from_config(model_config)
42
+
43
+
44
+ def create_pretransform_from_config(pretransform_config, sample_rate):
45
+ pretransform_type = pretransform_config.get("type", None)
46
+
47
+ assert pretransform_type is not None, (
48
+ "type must be specified in pretransform config"
49
+ )
50
+
51
+ if pretransform_type == "autoencoder":
52
+ from .autoencoders import create_autoencoder_from_config
53
+ from .pretransforms import AutoencoderPretransform
54
+
55
+ # Create fake top-level config to pass sample rate to autoencoder constructor
56
+ # This is a bit of a hack but it keeps us from re-defining the sample rate in the config
57
+ autoencoder_config = {
58
+ "sample_rate": sample_rate,
59
+ "model": pretransform_config["config"],
60
+ }
61
+ autoencoder = create_autoencoder_from_config(autoencoder_config)
62
+
63
+ scale = pretransform_config.get("scale", 1.0)
64
+ model_half = pretransform_config.get("model_half", False)
65
+ iterate_batch = pretransform_config.get("iterate_batch", False)
66
+ chunked = pretransform_config.get("chunked", False)
67
+
68
+ pretransform = AutoencoderPretransform(
69
+ autoencoder,
70
+ scale=scale,
71
+ model_half=model_half,
72
+ iterate_batch=iterate_batch,
73
+ chunked=chunked,
74
+ )
75
+ elif pretransform_type == "wavelet":
76
+ from .pretransforms import WaveletPretransform
77
+
78
+ wavelet_config = pretransform_config["config"]
79
+ channels = wavelet_config["channels"]
80
+ levels = wavelet_config["levels"]
81
+ wavelet = wavelet_config["wavelet"]
82
+
83
+ pretransform = WaveletPretransform(channels, levels, wavelet)
84
+ elif pretransform_type == "pqmf":
85
+ from .pretransforms import PQMFPretransform
86
+
87
+ pqmf_config = pretransform_config["config"]
88
+ pretransform = PQMFPretransform(**pqmf_config)
89
+ elif pretransform_type == "dac_pretrained":
90
+ from .pretransforms import PretrainedDACPretransform
91
+
92
+ pretrained_dac_config = pretransform_config["config"]
93
+ pretransform = PretrainedDACPretransform(**pretrained_dac_config)
94
+ elif pretransform_type == "audiocraft_pretrained":
95
+ from .pretransforms import AudiocraftCompressionPretransform
96
+
97
+ audiocraft_config = pretransform_config["config"]
98
+ pretransform = AudiocraftCompressionPretransform(**audiocraft_config)
99
+ else:
100
+ raise NotImplementedError(f"Unknown pretransform type: {pretransform_type}")
101
+
102
+ enable_grad = pretransform_config.get("enable_grad", False)
103
+ pretransform.enable_grad = enable_grad
104
+
105
+ pretransform.eval().requires_grad_(pretransform.enable_grad)
106
+
107
+ return pretransform
108
+
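+ # For reference, an autoencoder pretransform config consumed above is shaped roughly
+ # like this (hypothetical illustrative values; the nested "config" is an autoencoder
+ # model config for create_autoencoder_from_config):
+ #   {
+ #       "type": "autoencoder",   # or "wavelet", "pqmf", "dac_pretrained", "audiocraft_pretrained"
+ #       "scale": 1.0,
+ #       "model_half": False,
+ #       "iterate_batch": False,
+ #       "chunked": False,
+ #       "enable_grad": False,
+ #       "config": { ... },
+ #   }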
109
+
110
+ def create_bottleneck_from_config(bottleneck_config):
111
+ bottleneck_type = bottleneck_config.get("type", None)
112
+
113
+ assert bottleneck_type is not None, "type must be specified in bottleneck config"
114
+
115
+ if bottleneck_type == "tanh":
116
+ from .bottleneck import TanhBottleneck
117
+
118
+ bottleneck = TanhBottleneck()
119
+ elif bottleneck_type == "vae":
120
+ from .bottleneck import VAEBottleneck
121
+
122
+ bottleneck = VAEBottleneck()
123
+ elif bottleneck_type == "rvq":
124
+ from .bottleneck import RVQBottleneck
125
+
126
+ quantizer_params = {
127
+ "dim": 128,
128
+ "codebook_size": 1024,
129
+ "num_quantizers": 8,
130
+ "decay": 0.99,
131
+ "kmeans_init": True,
132
+ "kmeans_iters": 50,
133
+ "threshold_ema_dead_code": 2,
134
+ }
135
+
136
+ quantizer_params.update(bottleneck_config["config"])
137
+
138
+ bottleneck = RVQBottleneck(**quantizer_params)
139
+ elif bottleneck_type == "dac_rvq":
140
+ from .bottleneck import DACRVQBottleneck
141
+
142
+ bottleneck = DACRVQBottleneck(**bottleneck_config["config"])
143
+
144
+ elif bottleneck_type == "rvq_vae":
145
+ from .bottleneck import RVQVAEBottleneck
146
+
147
+ quantizer_params = {
148
+ "dim": 128,
149
+ "codebook_size": 1024,
150
+ "num_quantizers": 8,
151
+ "decay": 0.99,
152
+ "kmeans_init": True,
153
+ "kmeans_iters": 50,
154
+ "threshold_ema_dead_code": 2,
155
+ }
156
+
157
+ quantizer_params.update(bottleneck_config["config"])
158
+
159
+ bottleneck = RVQVAEBottleneck(**quantizer_params)
160
+
161
+ elif bottleneck_type == "dac_rvq_vae":
162
+ from .bottleneck import DACRVQVAEBottleneck
163
+
164
+ bottleneck = DACRVQVAEBottleneck(**bottleneck_config["config"])
165
+ elif bottleneck_type == "l2_norm":
166
+ from .bottleneck import L2Bottleneck
167
+
168
+ bottleneck = L2Bottleneck()
169
+ elif bottleneck_type == "wasserstein":
170
+ from .bottleneck import WassersteinBottleneck
171
+
172
+ bottleneck = WassersteinBottleneck(**bottleneck_config.get("config", {}))
173
+ elif bottleneck_type == "fsq":
174
+ from .bottleneck import FSQBottleneck
175
+
176
+ bottleneck = FSQBottleneck(**bottleneck_config["config"])
177
+ else:
178
+ raise NotImplementedError(f"Unknown bottleneck type: {bottleneck_type}")
179
+
180
+ requires_grad = bottleneck_config.get("requires_grad", True)
181
+ if not requires_grad:
182
+ for param in bottleneck.parameters():
183
+ param.requires_grad = False
184
+
185
+ return bottleneck
src/YingMusicSinger/utils/stable_audio_tools/pretransforms.py ADDED
@@ -0,0 +1,425 @@
1
+ import torch
2
+ from einops import rearrange
3
+ from torch import nn
4
+
5
+
6
+ class Pretransform(nn.Module):
7
+ def __init__(self, enable_grad, io_channels, is_discrete):
8
+ super().__init__()
9
+
10
+ self.is_discrete = is_discrete
11
+ self.io_channels = io_channels
12
+ self.encoded_channels = None
13
+ self.downsampling_ratio = None
14
+
15
+ self.enable_grad = enable_grad
16
+
17
+ def encode(self, x):
18
+ raise NotImplementedError
19
+
20
+ def decode(self, z):
21
+ raise NotImplementedError
22
+
23
+ def tokenize(self, x):
24
+ raise NotImplementedError
25
+
26
+ def decode_tokens(self, tokens):
27
+ raise NotImplementedError
28
+
29
+
30
+ class AutoencoderPretransform(Pretransform):
31
+ def __init__(
32
+ self, model, scale=1.0, model_half=False, iterate_batch=False, chunked=False
33
+ ):
34
+ super().__init__(
35
+ enable_grad=False,
36
+ io_channels=model.io_channels,
37
+ is_discrete=model.bottleneck is not None and model.bottleneck.is_discrete,
38
+ )
39
+ self.model = model
40
+ self.model.requires_grad_(False).eval()
41
+ self.scale = scale
42
+ self.downsampling_ratio = model.downsampling_ratio
43
+ self.io_channels = model.io_channels
44
+ self.sample_rate = model.sample_rate
45
+
46
+ self.model_half = model_half
47
+ self.iterate_batch = iterate_batch
48
+
49
+ self.encoded_channels = model.latent_dim
50
+ self.latent_dim = model.latent_dim
51
+
52
+ self.chunked = chunked
53
+ self.num_quantizers = (
54
+ model.bottleneck.num_quantizers
55
+ if model.bottleneck is not None and model.bottleneck.is_discrete
56
+ else None
57
+ )
58
+ self.codebook_size = (
59
+ model.bottleneck.codebook_size
60
+ if model.bottleneck is not None and model.bottleneck.is_discrete
61
+ else None
62
+ )
63
+
64
+ if self.model_half:
65
+ self.model.half()
66
+
67
+ def encode(self, x, **kwargs):
68
+ if self.model_half:
69
+ x = x.half()
70
+ self.model.to(torch.float16)
71
+
72
+ encoded = self.model.encode_audio(
73
+ x, chunked=self.chunked, iterate_batch=self.iterate_batch, **kwargs
74
+ )
75
+
76
+ if self.model_half:
77
+ encoded = encoded.float()
78
+
79
+ return encoded / self.scale
80
+
81
+ def encode_audio(self, audio, chunked=False, overlap=32, chunk_size=128, **kwargs):
82
+ """
83
+ Encode audios into latents. Audios should already be preprocessed by preprocess_audio_for_encoder.
84
+ If chunked is True, split the audio into chunks of a given maximum size chunk_size, with given overlap.
85
+ Overlap and chunk_size params are both measured in number of latents (not audio samples)
86
+ and therefore you likely could use the same values with decode_audio.
87
+ An overlap of zero will cause discontinuity artefacts. Overlap should be >= the receptive field size.
88
+ Every autoencoder will have a different receptive field size, and thus ideal overlap.
89
+ You can determine it empirically by diffing unchunked vs chunked output and looking at maximum diff.
90
+ The final chunk may have a longer overlap in order to keep chunk_size consistent for all chunks.
91
+ Smaller chunk_size uses less memory, but more compute.
92
+ The chunk_size vs memory tradeoff isn't linear, and possibly depends on the GPU and CUDA version
93
+ For example, on an A6000, chunk_size 128 is overall faster than 256 and 512 even though it produces more chunks (a usage sketch follows this method).
94
+ """
95
+ if not chunked:
96
+ # default behavior. Encode the entire audio in parallel
97
+ return self.encode(audio, **kwargs)
98
+ else:
99
+ # CHUNKED ENCODING
100
+ # samples_per_latent is just the downsampling ratio (which is also the upsampling ratio)
101
+ samples_per_latent = self.downsampling_ratio
102
+ total_size = audio.shape[2] # in samples
103
+ batch_size = audio.shape[0]
104
+ chunk_size *= samples_per_latent # converting metric in latents to samples
105
+ overlap *= samples_per_latent # converting metric in latents to samples
106
+ hop_size = chunk_size - overlap
107
+ chunks = []
108
+ for i in range(0, total_size - chunk_size + 1, hop_size):
109
+ chunk = audio[:, :, i : i + chunk_size]
110
+ chunks.append(chunk)
111
+ if i + chunk_size != total_size:
112
+ # Final chunk
113
+ chunk = audio[:, :, -chunk_size:]
114
+ chunks.append(chunk)
115
+ chunks = torch.stack(chunks)
116
+ num_chunks = chunks.shape[0]
117
+ # Note: y_size might be a different value from the latent length used in diffusion training
118
+ # because we can encode audio of varying lengths
119
+ # However, the audio should've been padded to a multiple of samples_per_latent by now.
120
+ y_size = total_size // samples_per_latent
121
+ # Create an empty latent, we will populate it with chunks as we encode them
122
+ y_final = torch.zeros((batch_size, self.latent_dim, y_size)).to(
123
+ audio.device
124
+ )
125
+ for i in range(num_chunks):
126
+ x_chunk = chunks[i, :]
127
+ # encode the chunk
128
+ y_chunk = self.encode(x_chunk)
129
+ # figure out where to put the audio along the time domain
130
+ if i == num_chunks - 1:
131
+ # final chunk always goes at the end
132
+ t_end = y_size
133
+ t_start = t_end - y_chunk.shape[2]
134
+ else:
135
+ t_start = i * hop_size // samples_per_latent
136
+ t_end = t_start + chunk_size // samples_per_latent
137
+ # remove the edges of the overlaps
138
+ ol = overlap // samples_per_latent // 2
139
+ chunk_start = 0
140
+ chunk_end = y_chunk.shape[2]
141
+ if i > 0:
142
+ # no overlap for the start of the first chunk
143
+ t_start += ol
144
+ chunk_start += ol
145
+ if i < num_chunks - 1:
146
+ # no overlap for the end of the last chunk
147
+ t_end -= ol
148
+ chunk_end -= ol
149
+ # paste the chunked audio into our y_final output audio
150
+ y_final[:, :, t_start:t_end] = y_chunk[:, :, chunk_start:chunk_end]
151
+ return y_final
152
+
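+ # Usage sketch for the chunked path above (hypothetical names; the audio length is
+ # assumed to already be padded to a multiple of the downsampling ratio):
+ #   latents = pretransform.encode_audio(audio, chunked=True, overlap=32, chunk_size=128)
+ #   recon = pretransform.decode_audio(latents, chunked=True, overlap=32, chunk_size=128)
+ # With hop_size = chunk_size - overlap, interior chunks are trimmed by overlap // 2
+ # latents on each shared edge before being pasted into y_final, and the final chunk
+ # is always aligned to the end of the signal.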
153
+ def decode(self, z, **kwargs):
154
+ z = z * self.scale
155
+
156
+ if self.model_half:
157
+ z = z.half()
158
+ self.model.to(torch.float16)
159
+
160
+ decoded = self.model.decode_audio(
161
+ z, chunked=self.chunked, iterate_batch=self.iterate_batch, **kwargs
162
+ )
163
+
164
+ if self.model_half:
165
+ decoded = decoded.float()
166
+
167
+ return decoded
168
+
169
+ def decode_audio(
170
+ self, latents, chunked=False, overlap=32, chunk_size=128, **kwargs
171
+ ):
172
+ if not chunked:
173
+ # default behavior. Decode the entire latent in parallel
174
+ return self.decode(latents, **kwargs)
175
+ else:
176
+ # chunked decoding
177
+ hop_size = chunk_size - overlap
178
+ total_size = latents.shape[2]
179
+ batch_size = latents.shape[0]
180
+ chunks = []
181
+ i = 0
182
+ for i in range(0, total_size - chunk_size + 1, hop_size):
183
+ chunk = latents[:, :, i : i + chunk_size]
184
+ chunks.append(chunk)
185
+ if i + chunk_size != total_size:
186
+ # Final chunk
187
+ chunk = latents[:, :, -chunk_size:]
188
+ chunks.append(chunk)
189
+ chunks = torch.stack(chunks)
190
+ num_chunks = chunks.shape[0]
191
+ # samples_per_latent is just the downsampling ratio
192
+ samples_per_latent = self.downsampling_ratio
193
+ # Create an empty waveform; we will populate it with chunks as we decode them
194
+ y_size = total_size * samples_per_latent
195
+ y_final = torch.zeros((batch_size, self.io_channels, y_size)).to(
196
+ latents.device
197
+ )
198
+ for i in range(num_chunks):
199
+ x_chunk = chunks[i, :]
200
+ # decode the chunk
201
+ y_chunk = self.decode(x_chunk)
202
+ # figure out where to put the audio along the time domain
203
+ if i == num_chunks - 1:
204
+ # final chunk always goes at the end
205
+ t_end = y_size
206
+ t_start = t_end - y_chunk.shape[2]
207
+ else:
208
+ t_start = i * hop_size * samples_per_latent
209
+ t_end = t_start + chunk_size * samples_per_latent
210
+ # remove the edges of the overlaps
211
+ ol = (overlap // 2) * samples_per_latent
212
+ chunk_start = 0
213
+ chunk_end = y_chunk.shape[2]
214
+ if i > 0:
215
+ # no overlap for the start of the first chunk
216
+ t_start += ol
217
+ chunk_start += ol
218
+ if i < num_chunks - 1:
219
+ # no overlap for the end of the last chunk
220
+ t_end -= ol
221
+ chunk_end -= ol
222
+ # paste the chunked audio into our y_final output audio
223
+ y_final[:, :, t_start:t_end] = y_chunk[:, :, chunk_start:chunk_end]
224
+ return y_final
225
+
226
+ def tokenize(self, x, **kwargs):
227
+ assert self.model.is_discrete, "Cannot tokenize with a continuous model"
228
+
229
+ _, info = self.model.encode(x, return_info=True, **kwargs)
230
+
231
+ return info[self.model.bottleneck.tokens_id]
232
+
233
+ def decode_tokens(self, tokens, **kwargs):
234
+ assert self.model.is_discrete, "Cannot decode tokens with a continuous model"
235
+
236
+ return self.model.decode_tokens(tokens, **kwargs)
237
+
238
+ def load_state_dict(self, state_dict, strict=True):
239
+ self.model.load_state_dict(state_dict, strict=strict)
240
+
241
+
242
+ class WaveletPretransform(Pretransform):
243
+ def __init__(self, channels, levels, wavelet):
244
+ super().__init__(enable_grad=False, io_channels=channels, is_discrete=False)
245
+
246
+ from .wavelets import WaveletDecode1d, WaveletEncode1d
247
+
248
+ self.encoder = WaveletEncode1d(channels, levels, wavelet)
249
+ self.decoder = WaveletDecode1d(channels, levels, wavelet)
250
+
251
+ self.downsampling_ratio = 2**levels
252
+ self.io_channels = channels
253
+ self.encoded_channels = channels * self.downsampling_ratio
254
+
255
+ def encode(self, x):
256
+ return self.encoder(x)
257
+
258
+ def decode(self, z):
259
+ return self.decoder(z)
260
+
261
+
262
+ class PQMFPretransform(Pretransform):
263
+ def __init__(self, attenuation=100, num_bands=16):
264
+ # TODO: Fix PQMF to take in in-channels
265
+ super().__init__(enable_grad=False, io_channels=1, is_discrete=False)
266
+ from .pqmf import PQMF
267
+
268
+ self.pqmf = PQMF(attenuation, num_bands)
269
+
270
+ def encode(self, x):
271
+ # x is (Batch x Channels x Time)
272
+ x = self.pqmf.forward(x)
273
+ # pqmf.forward returns (Batch x Channels x Bands x Time)
274
+ # but Pretransform needs Batch x Channels x Time
275
+ # so concatenate channels and bands into one axis
276
+ return rearrange(x, "b c n t -> b (c n) t")
277
+
278
+ def decode(self, x):
279
+ # x is (Batch x (Channels Bands) x Time), convert back to (Batch x Channels x Bands x Time)
280
+ x = rearrange(x, "b (c n) t -> b c n t", n=self.pqmf.num_bands)
281
+ # returns (Batch x Channels x Time)
282
+ return self.pqmf.inverse(x)
283
+
284
+
285
+ class PretrainedDACPretransform(Pretransform):
286
+ def __init__(
287
+ self,
288
+ model_type="44khz",
289
+ model_bitrate="8kbps",
290
+ scale=1.0,
291
+ quantize_on_decode: bool = True,
292
+ chunked=True,
293
+ ):
294
+ super().__init__(enable_grad=False, io_channels=1, is_discrete=True)
295
+
296
+ import dac
297
+
298
+ model_path = dac.utils.download(
299
+ model_type=model_type, model_bitrate=model_bitrate
300
+ )
301
+
302
+ self.model = dac.DAC.load(model_path)
303
+
304
+ self.quantize_on_decode = quantize_on_decode
305
+
306
+ if model_type == "44khz":
307
+ self.downsampling_ratio = 512
308
+ else:
309
+ self.downsampling_ratio = 320
310
+
311
+ self.io_channels = 1
312
+
313
+ self.scale = scale
314
+
315
+ self.chunked = chunked
316
+
317
+ self.encoded_channels = self.model.latent_dim
318
+
319
+ self.num_quantizers = self.model.n_codebooks
320
+
321
+ self.codebook_size = self.model.codebook_size
322
+
323
+ def encode(self, x):
324
+ latents = self.model.encoder(x)
325
+
326
+ if self.quantize_on_decode:
327
+ output = latents
328
+ else:
329
+ z, _, _, _, _ = self.model.quantizer(
330
+ latents, n_quantizers=self.model.n_codebooks
331
+ )
332
+ output = z
333
+
334
+ if self.scale != 1.0:
335
+ output = output / self.scale
336
+
337
+ return output
338
+
339
+ def decode(self, z):
340
+ if self.scale != 1.0:
341
+ z = z * self.scale
342
+
343
+ if self.quantize_on_decode:
344
+ z, _, _, _, _ = self.model.quantizer(z, n_quantizers=self.model.n_codebooks)
345
+
346
+ return self.model.decode(z)
347
+
348
+ def tokenize(self, x):
349
+ return self.model.encode(x)[1]
350
+
351
+ def decode_tokens(self, tokens):
352
+ latents = self.model.quantizer.from_codes(tokens)
353
+ return self.model.decode(latents)
354
+
355
+
356
+ class AudiocraftCompressionPretransform(Pretransform):
357
+ def __init__(
358
+ self,
359
+ model_type="facebook/encodec_32khz",
360
+ scale=1.0,
361
+ quantize_on_decode: bool = True,
362
+ ):
363
+ super().__init__(enable_grad=False, io_channels=1, is_discrete=True)
364
+
365
+ try:
366
+ from audiocraft.models import CompressionModel
367
+ except ImportError:
368
+ raise ImportError(
369
+ "Audiocraft is not installed. Please install audiocraft to use Audiocraft models."
370
+ )
371
+
372
+ self.model = CompressionModel.get_pretrained(model_type)
373
+
374
+ self.quantize_on_decode = quantize_on_decode
375
+
376
+ self.downsampling_ratio = round(self.model.sample_rate / self.model.frame_rate)
377
+
378
+ self.sample_rate = self.model.sample_rate
379
+
380
+ self.io_channels = self.model.channels
381
+
382
+ self.scale = scale
383
+
384
+ # self.encoded_channels = self.model.latent_dim
385
+
386
+ self.num_quantizers = self.model.num_codebooks
387
+
388
+ self.codebook_size = self.model.cardinality
389
+
390
+ self.model.to(torch.float16).eval().requires_grad_(False)
391
+
392
+ def encode(self, x):
393
+ assert False, "Audiocraft compression models do not support continuous encoding"
394
+
395
+ # latents = self.model.encoder(x)
396
+
397
+ # if self.quantize_on_decode:
398
+ # output = latents
399
+ # else:
400
+ # z, _, _, _, _ = self.model.quantizer(latents, n_quantizers=self.model.n_codebooks)
401
+ # output = z
402
+
403
+ # if self.scale != 1.0:
404
+ # output = output / self.scale
405
+
406
+ # return output
407
+
408
+ def decode(self, z):
409
+ assert False, "Audiocraft compression models do not support continuous decoding"
410
+
411
+ # if self.scale != 1.0:
412
+ # z = z * self.scale
413
+
414
+ # if self.quantize_on_decode:
415
+ # z, _, _, _, _ = self.model.quantizer(z, n_quantizers=self.model.n_codebooks)
416
+
417
+ # return self.model.decode(z)
418
+
419
+ def tokenize(self, x):
420
+ with torch.cuda.amp.autocast(enabled=False):
421
+ return self.model.encode(x.to(torch.float16))[0]
422
+
423
+ def decode_tokens(self, tokens):
424
+ with torch.cuda.amp.autocast(enabled=False):
425
+ return self.model.decode(tokens)