CocoBro commited on
Commit
c14d03d
·
1 Parent(s): cdbb4cf

init space

Browse files
Files changed (43) hide show
  1. app.py +358 -63
  2. example/content.jsonl +1 -0
  3. losses/base.py +22 -0
  4. models/__pycache__/common.cpython-310.pyc +0 -0
  5. models/__pycache__/content_adapter.cpython-310.pyc +0 -0
  6. models/__pycache__/diffusion.cpython-310.pyc +0 -0
  7. models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc +0 -0
  8. models/autoencoder/autoencoder_base.py +22 -0
  9. models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc +0 -0
  10. models/autoencoder/waveform/dac.py +0 -0
  11. models/autoencoder/waveform/stable_vae.py +586 -0
  12. models/common.py +79 -0
  13. models/content_adapter.py +430 -0
  14. models/content_encoder/__pycache__/content_encoder.cpython-310.pyc +0 -0
  15. models/content_encoder/__pycache__/llm_encoder.cpython-310.pyc +0 -0
  16. models/content_encoder/content_encoder.py +133 -0
  17. models/content_encoder/llm_encoder.py +215 -0
  18. models/content_encoder/text_encoder.py +76 -0
  19. models/diffusion.py +401 -0
  20. models/dit/__init__.py +0 -0
  21. models/dit/__pycache__/__init__.cpython-310.pyc +0 -0
  22. models/dit/__pycache__/mmdit_back.cpython-310.pyc +0 -0
  23. models/dit/__pycache__/mmdit_layers.cpython-310.pyc +0 -0
  24. models/dit/__pycache__/modules.cpython-310.pyc +0 -0
  25. models/dit/attention.py +350 -0
  26. models/dit/mmdit_back.py +346 -0
  27. models/dit/mmdit_layers.py +421 -0
  28. models/dit/modules.py +445 -0
  29. models/dit/rotary.py +88 -0
  30. models/dit/span_mask.py +149 -0
  31. models/flow_matching.py +1082 -0
  32. requirements.txt +28 -0
  33. stabilityai/stable-diffusion-2-1/scheduler/scheduler_config.json +14 -0
  34. utils/__pycache__/config.cpython-310.pyc +0 -0
  35. utils/__pycache__/torch_utilities.cpython-310.pyc +0 -0
  36. utils/accelerate_utilities.py +13 -0
  37. utils/audio.py +58 -0
  38. utils/config.py +53 -0
  39. utils/diffsinger_utilities.py +551 -0
  40. utils/general.py +68 -0
  41. utils/logging.py +23 -0
  42. utils/lr_scheduler_utilities.py +154 -0
  43. utils/torch_utilities.py +288 -0
app.py CHANGED
@@ -1,70 +1,365 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
-
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
 
16
  """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
-
19
- messages = [{"role": "system", "content": system_message}]
20
-
21
- messages.extend(history)
22
-
23
- messages.append({"role": "user", "content": message})
24
-
25
- response = ""
26
-
27
- for message in client.chat_completion(
28
- messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- type="messages",
49
- additional_inputs=[
50
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
- gr.Slider(
54
- minimum=0.1,
55
- maximum=1.0,
56
- value=0.95,
57
- step=0.05,
58
- label="Top-p (nucleus sampling)",
59
- ),
60
- ],
61
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
- with gr.Blocks() as demo:
64
- with gr.Sidebar():
65
- gr.LoginButton()
66
- chatbot.render()
67
 
68
 
69
  if __name__ == "__main__":
70
- demo.launch()
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import time
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Tuple, Optional, Dict, Any
9
+
10
  import gradio as gr
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import torchaudio
15
+ import librosa
16
+
17
+ import hydra
18
+ from omegaconf import OmegaConf
19
+ from safetensors.torch import load_file
20
+ import diffusers.schedulers as noise_schedulers
21
+ from huggingface_hub import snapshot_download
22
+
23
+ from models.common import LoadPretrainedBase
24
+ from utils.config import register_omegaconf_resolvers
25
+
26
+
27
+ # -----------------------------
28
+ # Logging
29
+ # -----------------------------
30
+ logging.basicConfig(
31
+ level=logging.INFO,
32
+ format="%(asctime)s - %(levelname)s - %(message)s"
33
+ )
34
+ logger = logging.getLogger("mmedit_space")
35
+
36
+ register_omegaconf_resolvers()
37
+
38
+
39
+ # ---------------------------------------------------------
40
+ # HF Repo IDs(按你的默认需求)
41
+ # ---------------------------------------------------------
42
+ MMEDIT_REPO_ID = os.environ.get("MMEDIT_REPO_ID", "CocoBro/MMEdit")
43
+ MMEDIT_REVISION = os.environ.get("MMEDIT_REVISION", None)
44
+
45
+ QWEN_REPO_ID = os.environ.get("QWEN_REPO_ID", "Qwen/Qwen2-Audio-7B-Instruct")
46
+ QWEN_REVISION = os.environ.get("QWEN_REVISION", None)
47
+
48
+ OUTPUT_DIR = Path(os.environ.get("OUTPUT_DIR", "./outputs"))
49
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
50
+
51
+ USE_AMP = os.environ.get("USE_AMP", "0") == "1"
52
+ AMP_DTYPE = os.environ.get("AMP_DTYPE", "bf16") # "bf16" or "fp16"
53
+
54
+ _PIPELINE_CACHE: Dict[str, Tuple[LoadPretrainedBase, object, int, torch.device]] = {}
55
+
56
+
57
+ # ---------------------------------------------------------
58
+ # 下载 repo
59
+ # ---------------------------------------------------------
60
def resolve_model_dirs() -> Tuple[Path, Path]:
    """Download (or reuse the cached snapshots of) both model repos.

    Returns:
        repo_root: local directory of the MMEdit repo
            (contains config.yaml / model.safetensors / vae/).
        qwen_root: local directory of the Qwen2-Audio repo.
    """
    logger.info(f"Downloading MMEdit repo: {MMEDIT_REPO_ID} (revision={MMEDIT_REVISION})")
    mmedit_dir = snapshot_download(
        repo_id=MMEDIT_REPO_ID,
        revision=MMEDIT_REVISION,
        local_dir=None,
        local_dir_use_symlinks=False,
    )

    logger.info(f"Downloading Qwen repo: {QWEN_REPO_ID} (revision={QWEN_REVISION})")
    qwen_dir = snapshot_download(
        repo_id=QWEN_REPO_ID,
        revision=QWEN_REVISION,
        local_dir=None,
        local_dir_use_symlinks=False,
    )

    return Path(mmedit_dir).resolve(), Path(qwen_dir).resolve()
85
+
86
+
87
+ # ---------------------------------------------------------
88
+ # 你的音频加载(按你要求:orig -> 16k -> target_sr)
89
+ # ---------------------------------------------------------
90
def load_and_process_audio(audio_path: str, target_sr: int) -> torch.Tensor:
    """Load audio as a mono 1-D float tensor resampled to ``target_sr``.

    Resampling deliberately goes through an intermediate 16 kHz stage
    (orig_sr -> 16k -> target_sr) to mirror the training preprocessing.

    Raises:
        FileNotFoundError: if ``audio_path`` does not exist.
    """
    path = Path(audio_path)
    if not path.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    waveform, orig_sr = torchaudio.load(str(path))  # (C, T)

    # Down-mix to mono.
    if waveform.ndim == 2:
        waveform = waveform.mean(dim=0)  # (T,)
    elif waveform.ndim > 2:
        waveform = waveform.reshape(-1)

    if target_sr and int(target_sr) != int(orig_sr):
        samples = waveform.cpu().numpy()

        # Stage 1: bring everything to 16 kHz first.
        mid_sr = 16000
        if int(orig_sr) != mid_sr:
            samples = librosa.resample(
                samples,
                orig_sr=int(orig_sr),
                target_sr=mid_sr
            )
            current_sr = mid_sr
        else:
            current_sr = int(orig_sr)

        # Stage 2: 16 kHz -> target_sr (e.g. 24 kHz).
        if int(target_sr) != current_sr:
            samples = librosa.resample(
                samples,
                orig_sr=current_sr,
                target_sr=int(target_sr)
            )

        waveform = torch.from_numpy(samples)

    return waveform
129
+
130
+
131
+ # ---------------------------------------------------------
132
+ # 校验 repo 结构
133
+ # ---------------------------------------------------------
134
def assert_repo_layout(repo_root: Path) -> None:
    """Fail fast if the downloaded MMEdit repo is missing required files.

    Raises:
        FileNotFoundError: when config.yaml, model.safetensors, the vae/
            directory, or any vae/*.ckpt file is absent.
    """
    required = (
        repo_root / "config.yaml",
        repo_root / "model.safetensors",
        repo_root / "vae",
    )
    for item in required:
        if not item.exists():
            raise FileNotFoundError(f"Missing required path: {item}")

    if not list((repo_root / "vae").glob("*.ckpt")):
        raise FileNotFoundError(f"No .ckpt found under: {repo_root/'vae'}")
147
+
148
+
149
+ # ---------------------------------------------------------
150
+ # 关键:适配你这个 config.yaml 的路径写法
151
+ # ---------------------------------------------------------
152
def patch_paths_in_exp_config(exp_cfg: Dict[str, Any], repo_root: Path, qwen_root: Path) -> None:
    """Rewrite local checkpoint paths in the experiment config for this Space.

    - model.autoencoder.pretrained_ckpt: "ckpt/mmedit/vae/epoch=xx.ckpt"
      is remapped to repo_root/vae/epoch=xx.ckpt.
    - model.content_encoder.text_encoder.model_path is pointed at the
      locally downloaded Qwen2-Audio snapshot.

    Mutates ``exp_cfg`` in place.

    Raises:
        FileNotFoundError: if the remapped VAE checkpoint does not exist.
    """
    # ---- 1) VAE ckpt ----
    vae_ckpt = exp_cfg["model"]["autoencoder"].get("pretrained_ckpt", None)
    if vae_ckpt:
        vae_ckpt = str(vae_ckpt).replace("\\", "/")

        # Most robust mapping: keep everything from the "vae/" component on.
        # e.g. ckpt/mmedit/vae/epoch=13-step=1000000.ckpt -> vae/epoch=13-step=1000000.ckpt
        marker = vae_ckpt.find("vae/")
        if marker != -1:
            vae_rel = vae_ckpt[marker:]  # truncate from "vae/"
        elif vae_ckpt.endswith(".ckpt") and "/" not in vae_ckpt:
            # Fallback: a bare filename is assumed to live under repo_root/vae/.
            vae_rel = f"vae/{vae_ckpt}"
        else:
            vae_rel = vae_ckpt

        vae_path = (repo_root / vae_rel).resolve()
        exp_cfg["model"]["autoencoder"]["pretrained_ckpt"] = str(vae_path)

        if not vae_path.exists():
            raise FileNotFoundError(
                f"VAE ckpt not found after patch:\n"
                f" original: {vae_ckpt}\n"
                f" patched : {vae_path}\n"
                f"Repo root: {repo_root}\n"
                f"Expected: {repo_root/'vae'/'*.ckpt'}"
            )

    # ---- 2) Qwen2-Audio model_path ----
    # The config references a local ckpt dir; on the Space we substitute the
    # snapshot_download result instead.
    exp_cfg["model"]["content_encoder"]["text_encoder"]["model_path"] = str(qwen_root)
193
+
194
+
195
+ # ---------------------------------------------------------
196
+ # Scheduler(与你 exp_cfg.model.noise_scheduler_name 对齐)
197
+ # ---------------------------------------------------------
198
def build_scheduler(exp_cfg: Dict[str, Any]):
    """Build the DDIM noise scheduler named by ``exp_cfg.model.noise_scheduler_name``."""
    scheduler_name = exp_cfg["model"].get(
        "noise_scheduler_name", "stabilityai/stable-diffusion-2-1"
    )
    return noise_schedulers.DDIMScheduler.from_pretrained(
        scheduler_name, subfolder="scheduler"
    )
202
+
203
+
204
def _amp_ctx(device: torch.device):
    """Return an autocast context honoring the USE_AMP / AMP_DTYPE env settings.

    Autocast is enabled only when USE_AMP is set AND the device is CUDA;
    in every other case a disabled context is returned.
    """
    if not USE_AMP:
        return torch.autocast("cuda", enabled=False)
    if device.type != "cuda":
        return torch.autocast("cpu", enabled=False)
    amp_dtype = torch.bfloat16 if AMP_DTYPE.lower() == "bf16" else torch.float16
    return torch.autocast("cuda", dtype=amp_dtype, enabled=True)
211
+
212
+
213
+ # ---------------------------------------------------------
214
+ # 冷启动:load+cache pipeline
215
+ # ---------------------------------------------------------
216
def load_pipeline() -> Tuple[LoadPretrainedBase, object, int, torch.device]:
    """Cold-start helper: download, build, and cache the inference pipeline.

    Returns (model, scheduler, target_sample_rate, device).  The tuple is
    memoized in the module-level _PIPELINE_CACHE keyed by repo ids and
    revisions, so calls after the first are cheap.
    """
    cache_key = f"{MMEDIT_REPO_ID}@{MMEDIT_REVISION}::{QWEN_REPO_ID}@{QWEN_REVISION}"
    if cache_key in _PIPELINE_CACHE:
        return _PIPELINE_CACHE[cache_key]

    repo_root, qwen_root = resolve_model_dirs()
    assert_repo_layout(repo_root)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"repo_root = {repo_root}")
    logger.info(f"device = {device}")
    logger.info(f"qwen_root = {qwen_root}")

    # Load the experiment config, resolve interpolations, and rewrite its
    # checkpoint paths to point at the downloaded snapshots.
    exp_cfg = OmegaConf.load(repo_root / "config.yaml")
    exp_cfg = OmegaConf.to_container(exp_cfg, resolve=True)

    patch_paths_in_exp_config(exp_cfg, repo_root, qwen_root)
    logger.info(f"patched pretrained_ckpt = {exp_cfg['model']['autoencoder'].get('pretrained_ckpt')}")
    logger.info(f"patched qwen model_path = {exp_cfg['model']['content_encoder']['text_encoder'].get('model_path')}")

    # Instantiate the model tree declaratively from the config via hydra.
    model: LoadPretrainedBase = hydra.utils.instantiate(exp_cfg["model"], _convert_="all")

    # Load the main diffusion weights (safetensors state dict).
    ckpt_path = repo_root / "model.safetensors"
    sd = load_file(str(ckpt_path))
    model.load_pretrained(sd)

    model = model.to(device).eval()

    scheduler = build_scheduler(exp_cfg)
    # Sample rate used for both input resampling and output writing.
    target_sr = int(exp_cfg.get("sample_rate", 24000))

    _PIPELINE_CACHE[cache_key] = (model, scheduler, target_sr, device)
    logger.info("Pipeline loaded and cached.")
    return model, scheduler, target_sr, device
250
+
251
+
252
+ # ---------------------------------------------------------
253
+ # 推理:audio + caption -> edited audio
254
+ # ---------------------------------------------------------
255
@torch.no_grad()
def run_edit(
    audio_file: str,
    caption: str,
    num_steps: int,
    guidance_scale: float,
    guidance_rescale: float,
    seed: int,
) -> Tuple[Optional[str], str]:
    """Run audio editing: (input audio, caption) -> path of the edited wav.

    Returns (output_path, status_message); on user error output_path is
    None and the message describes the problem.
    """
    if audio_file is None or not Path(audio_file).exists():
        return None, "Error: please upload an audio file."

    caption = (caption or "").strip()
    if not caption:
        return None, "Error: caption is empty."

    model, scheduler, target_sr, device = load_pipeline()

    # Seed CPU torch and NumPy RNGs for reproducible sampling.
    seed = int(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    wav = load_and_process_audio(audio_file, target_sr=target_sr).to(device)

    batch = {
        "audio_id": [Path(audio_file).stem],
        "content": [{"audio": wav, "caption": caption}],
        "task": ["audio_editing"],
    }

    # Keep these kwargs aligned with the offline infer.config.
    kwargs = {
        "num_steps": int(num_steps),
        "guidance_scale": float(guidance_scale),
        "guidance_rescale": float(guidance_rescale),
        "use_gt_duration": False,
        "mask_time_aligned_content": False,
    }
    kwargs.update(batch)

    t0 = time.time()
    with _amp_ctx(device):
        out = model.inference(scheduler=scheduler, **kwargs)
    dt = time.time() - t0

    # NOTE(review): indexing [0, 0] assumes out is (batch, channel, time)
    # — confirm against model.inference's return contract.
    out_audio = out[0, 0].detach().float().cpu().numpy()
    out_path = OUTPUT_DIR / f"{Path(audio_file).stem}_edited.wav"
    sf.write(str(out_path), out_audio, samplerate=target_sr)

    return str(out_path), f"OK | saved={out_path.name} | time={dt:.2f}s | sr={target_sr} | seed={seed}"
305
+
306
+
307
+ # ---------------------------------------------------------
308
+ # UI
309
+ # ---------------------------------------------------------
310
def build_demo():
    """Construct the Gradio Blocks UI: inputs, sampling controls, outputs."""
    with gr.Blocks(title="MMEdit Space Simulator") as demo:
        gr.Markdown("# MMEdit Space 模拟(audio + caption → edited audio)")
        gr.Markdown(
            "点下面的示例即可自动填充音频路径与编辑指令,然后点击 Run Editing。"
        )

        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="Input Audio", type="filepath")
                caption = gr.Textbox(label="Caption (Edit Instruction)", lines=3)

                # One-click example: selecting it fills audio_in + caption together.
                gr.Examples(
                    label="example inputs",
                    examples=[
                        ["example/Ym8O802VvJes.wav", "Mix in dog barking in the middle."],
                    ],
                    inputs=[audio_in, caption],
                    cache_examples=False,  # more robust locally and on Spaces: no pre-caching
                )

                with gr.Row():
                    num_steps = gr.Slider(1, 100, value=50, step=1, label="num_steps")
                    guidance_scale = gr.Slider(1.0, 12.0, value=5.0, step=0.5, label="guidance_scale")

                with gr.Row():
                    guidance_rescale = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="guidance_rescale")
                    seed = gr.Number(value=42, precision=0, label="seed")

                run_btn = gr.Button("Run Editing", variant="primary")

            with gr.Column():
                audio_out = gr.Audio(label="Edited Audio", type="filepath")
                status = gr.Textbox(label="Status")

        run_btn.click(
            fn=run_edit,
            inputs=[audio_in, caption, num_steps, guidance_scale, guidance_rescale, seed],
            outputs=[audio_out, status],
        )

        gr.Markdown(
            "## 注意事项\n"
            "- 首次加载较慢\n"
            "- Space 上有一些bug,某些情况会损失原始音频\n"
        )

    return demo
359
 
 
 
 
 
360
 
361
 
362
if __name__ == "__main__":
    # Spaces serve on port 7860 by default; allow an override via $PORT.
    app = build_demo()
    serve_port = int(os.environ.get("PORT", "7860"))
    app.launch(server_name="0.0.0.0", server_port=serve_port, share=False)
example/content.jsonl ADDED
@@ -0,0 +1 @@
 
 
1
+ {"audio_id": "add_audiocaps_1", "content": "example/Ym8O802VvJes.wav", "caption": "Mix in dog barking in the middle."}
losses/base.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class IndentityWrapper(nn.Module):
    """Wrap a raw loss tensor into the {"loss": ...} dict the trainer expects.

    NOTE(review): the class name is a typo for "IdentityWrapper"; it is kept
    unchanged because configs may instantiate it by name.
    """

    def forward(self, loss: torch.Tensor) -> dict[str, torch.Tensor]:
        return {"loss": loss}
8
+
9
+
10
class LossSumWrapper(nn.Module):
    """Combine named losses into a weighted total.

    The output dict carries "loss" (the weighted sum) plus every entry of
    the input dict, so individual terms can still be logged.
    """

    def __init__(self, weights: dict[str, float]):
        super().__init__()
        self.weights = weights

    def forward(self,
                loss_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        # sum() starts from 0 and accumulates in insertion order,
        # matching a manual loop.
        total = sum(
            value * self.weights[name] for name, value in loss_dict.items()
        )
        output = {"loss": total}
        output.update(loss_dict)
        return output
models/__pycache__/common.cpython-310.pyc ADDED
Binary file (3.32 kB). View file
 
models/__pycache__/content_adapter.cpython-310.pyc ADDED
Binary file (12 kB). View file
 
models/__pycache__/diffusion.cpython-310.pyc ADDED
Binary file (9.77 kB). View file
 
models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc ADDED
Binary file (1.04 kB). View file
 
models/autoencoder/autoencoder_base.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod, ABC
2
+ from typing import Sequence
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
class AutoEncoderBase(ABC):
    """Interface for waveform autoencoders that emit fixed-rate latents.

    Attributes:
        downsampling_ratio: waveform samples per latent frame.
        sample_rate: waveform sample rate in Hz.
        latent_token_rate: latent frames per second (sample_rate // ratio).
        latent_shape: per-sample latent shape, with None marking the
            variable-length time axis.
        time_dim: index of the time axis in a *batched* latent tensor.
    """
    def __init__(
        self, downsampling_ratio: int, sample_rate: int,
        latent_shape: Sequence[int | None]
    ):
        self.downsampling_ratio = downsampling_ratio
        self.sample_rate = sample_rate
        self.latent_token_rate = sample_rate // downsampling_ratio
        self.latent_shape = latent_shape
        # +1 because the first dim of a batched tensor is the batch axis.
        self.time_dim = latent_shape.index(None) + 1  # the first dim is batch

    @abstractmethod
    def encode(
        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        ...
models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc ADDED
Binary file (13.4 kB). View file
 
models/autoencoder/waveform/dac.py ADDED
File without changes
models/autoencoder/waveform/stable_vae.py ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Literal, Callable
2
+ import math
3
+ from pathlib import Path
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils import weight_norm
8
+ import torchaudio
9
+ from alias_free_torch import Activation1d
10
+
11
+ from models.common import LoadPretrainedBase
12
+ from models.autoencoder.autoencoder_base import AutoEncoderBase
13
+ from utils.torch_utilities import remove_key_prefix_factory, create_mask_from_length
14
+
15
+
16
# jit script make it 1.4x faster and save GPU memory
@torch.jit.script
def snake_beta(x, alpha, beta):
    """Snake-beta activation: x + sin^2(alpha * x) / beta (guarded against beta == 0)."""
    sin_term = torch.sin(x * alpha)
    return x + sin_term * sin_term * (1.0 / (beta + 0.000000001))
20
+
21
+
22
class SnakeBeta(nn.Module):
    """Snake-beta activation with learnable per-channel alpha (frequency)
    and beta (magnitude) parameters.

    When ``alpha_logscale`` is True, parameters are stored in log space
    (initialized to zero) and exponentiated in forward; otherwise they are
    stored linearly (initialized to one).
    """

    def __init__(
        self,
        in_features,
        alpha=1.0,
        alpha_trainable=True,
        alpha_logscale=True
    ):
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale

        # Log-scale parameters start at zero, linear-scale at one.
        init = torch.zeros if alpha_logscale else torch.ones
        self.alpha = nn.Parameter(init(in_features) * alpha)
        self.beta = nn.Parameter(init(in_features) * alpha)

        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

    def forward(self, x):
        # Broadcast (C,) -> (1, C, 1) to line up with x of shape [B, C, T].
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        return snake_beta(x, alpha, beta)
59
+
60
+
61
def WNConv1d(*args, **kwargs):
    """Conv1d wrapped with weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
63
+
64
+
65
def WNConvTranspose1d(*args, **kwargs):
    """ConvTranspose1d wrapped with weight normalization."""
    deconv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(deconv)
67
+
68
+
69
def get_activation(
    activation: Literal["elu", "snake", "none"],
    antialias=False,
    channels=None
) -> nn.Module:
    """Build the requested activation, optionally wrapped for anti-aliasing.

    Args:
        activation: one of "elu", "snake", "none".
        antialias: wrap the activation in an alias-free ``Activation1d``.
        channels: channel count, required by the "snake" activation.

    Raises:
        ValueError: for any other ``activation`` value.
    """
    if activation == "snake":
        act: nn.Module = SnakeBeta(channels)
    elif activation == "none":
        act = nn.Identity()
    elif activation == "elu":
        act = nn.ELU()
    else:
        raise ValueError(f"Unknown activation {activation}")

    return Activation1d(act) if antialias else act
87
+
88
+
89
class ResidualUnit(nn.Module):
    """Dilated residual block: act -> dilated 7-tap conv -> act -> 1x1 conv,
    with an identity skip connection.

    The module layout (a single ``layers`` Sequential) must stay stable:
    its names define the checkpoint state-dict keys.
    """
    def __init__(
        self,
        in_channels,
        out_channels,
        dilation,
        use_snake=False,
        antialias_activation=False
    ):
        super().__init__()

        self.dilation = dilation

        # "same" padding for kernel_size=7 at this dilation.
        padding = (dilation * (7 - 1)) // 2

        self.layers = nn.Sequential(
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=out_channels
            ),
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=7,
                dilation=dilation,
                padding=padding
            ),
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=out_channels
            ),
            WNConv1d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=1
            )
        )

    def forward(self, x):
        res = x

        #x = checkpoint(self.layers, x)
        x = self.layers(x)

        return x + res
136
+
137
+
138
class EncoderBlock(nn.Module):
    """Encoder stage: three ResidualUnits (dilations 1, 3, 9) followed by an
    activation and a strided downsampling convolution."""
    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False
    ):
        super().__init__()

        self.layers = nn.Sequential(
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=1,
                use_snake=use_snake
            ),
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=3,
                use_snake=use_snake
            ),
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=9,
                use_snake=use_snake
            ),
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=in_channels
            ),
            # Downsample by `stride`; kernel 2*stride keeps window overlap.
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2)
            ),
        )

    def forward(self, x):
        return self.layers(x)
184
+
185
+
186
class DecoderBlock(nn.Module):
    """Decoder stage: activation, upsampling (nearest+conv or transposed
    conv), then three ResidualUnits (dilations 1, 3, 9)."""
    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False
    ):
        super().__init__()

        if use_nearest_upsample:
            # Nearest-neighbor upsample + conv avoids transposed-conv
            # checkerboard artifacts.
            upsample_layer = nn.Sequential(
                nn.Upsample(scale_factor=stride, mode="nearest"),
                WNConv1d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=2 * stride,
                    stride=1,
                    bias=False,
                    padding='same'
                )
            )
        else:
            upsample_layer = WNConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2)
            )

        self.layers = nn.Sequential(
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=in_channels
            ),
            upsample_layer,
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=1,
                use_snake=use_snake
            ),
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=3,
                use_snake=use_snake
            ),
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=9,
                use_snake=use_snake
            ),
        )

    def forward(self, x):
        return self.layers(x)
248
+
249
+
250
class OobleckEncoder(nn.Module):
    """Convolutional waveform encoder (Stability "Oobleck" architecture).

    Channel widths are ``c_mults[i] * channels``; each EncoderBlock
    downsamples time by the matching entry of ``strides``.  A final conv
    projects to ``latent_dim`` channels.
    """
    def __init__(
        self,
        in_channels=2,
        channels=128,
        latent_dim=32,
        c_mults=[1, 2, 4, 8],
        strides=[2, 4, 8, 8],
        use_snake=False,
        antialias_activation=False
    ):
        super().__init__()

        # Rebinding (not mutating) the default list, so the shared mutable
        # default is safe here.
        c_mults = [1] + c_mults

        self.depth = len(c_mults)

        # Input projection: kernel 7 with "same" padding.
        layers = [
            WNConv1d(
                in_channels=in_channels,
                out_channels=c_mults[0] * channels,
                kernel_size=7,
                padding=3
            )
        ]

        for i in range(self.depth - 1):
            layers += [
                EncoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i + 1] * channels,
                    stride=strides[i],
                    use_snake=use_snake
                )
            ]

        # Final activation + projection into the latent space.
        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[-1] * channels
            ),
            WNConv1d(
                in_channels=c_mults[-1] * channels,
                out_channels=latent_dim,
                kernel_size=3,
                padding=1
            )
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)
304
+
305
+
306
class OobleckDecoder(nn.Module):
    """Convolutional waveform decoder mirroring OobleckEncoder.

    Walks ``c_mults`` in reverse, upsampling by the matching ``strides``
    entries, and ends with a conv back to ``out_channels`` waveform
    channels (optionally squashed by tanh).
    """
    def __init__(
        self,
        out_channels=2,
        channels=128,
        latent_dim=32,
        c_mults=[1, 2, 4, 8],
        strides=[2, 4, 8, 8],
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False,
        final_tanh=True
    ):
        super().__init__()

        # Rebinding (not mutating) the default list keeps the shared
        # mutable default safe.
        c_mults = [1] + c_mults

        self.depth = len(c_mults)

        # Latent projection up to the widest channel count.
        layers = [
            WNConv1d(
                in_channels=latent_dim,
                out_channels=c_mults[-1] * channels,
                kernel_size=7,
                padding=3
            ),
        ]

        # Upsample from widest to narrowest, mirroring the encoder.
        for i in range(self.depth - 1, 0, -1):
            layers += [
                DecoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i - 1] * channels,
                    stride=strides[i - 1],
                    use_snake=use_snake,
                    antialias_activation=antialias_activation,
                    use_nearest_upsample=use_nearest_upsample
                )
            ]

        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[0] * channels
            ),
            WNConv1d(
                in_channels=c_mults[0] * channels,
                out_channels=out_channels,
                kernel_size=7,
                padding=3,
                bias=False
            ),
            # Clamp output to [-1, 1] when requested.
            nn.Tanh() if final_tanh else nn.Identity()
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)
366
+
367
+
368
class Bottleneck(nn.Module):
    """Base class for latent bottlenecks (VAE, VQ, ...).

    Subclasses implement ``encode``/``decode``; ``is_discrete`` tells
    callers whether the bottleneck emits discrete tokens.
    """

    def __init__(self, is_discrete: bool = False):
        super().__init__()
        self.is_discrete = is_discrete

    def encode(self, x, return_info=False, **kwargs):
        raise NotImplementedError

    def decode(self, x):
        raise NotImplementedError
379
+
380
+
381
@torch.jit.script
def vae_sample(mean, scale) -> dict[str, torch.Tensor]:
    """Reparameterized sample from N(mean, softplus(scale)^2) plus its KL term.

    Returns a dict with "latents" (same shape as mean) and "kl", the KL
    divergence to the standard normal summed over dim 1 and averaged over
    the batch (no 1/2 factor, matching the original training objective).
    """
    stdev = nn.functional.softplus(scale) + 1e-4
    noise = torch.randn_like(mean)
    latents = mean + noise * stdev
    var = stdev * stdev
    kl = (mean * mean + var - torch.log(var) - 1).sum(1).mean()
    return {"latents": latents, "kl": kl}
390
+
391
+
392
class VAEBottleneck(Bottleneck):
    """Continuous VAE bottleneck: splits the channel dim into (mean, scale)
    and draws a reparameterized sample via ``vae_sample``."""

    def __init__(self):
        super().__init__(is_discrete=False)

    def encode(self,
               x,
               return_info=False,
               **kwargs) -> dict[str, torch.Tensor] | torch.Tensor:
        # First half of dim 1 is the mean, second half the (pre-softplus) scale.
        mean, scale = x.chunk(2, dim=1)
        sampled = vae_sample(mean, scale)
        if return_info:
            return sampled["latents"], {"kl": sampled["kl"]}
        return sampled["latents"]

    def decode(self, x):
        # Latents pass straight through on the decode side.
        return x
410
+
411
+
412
def compute_mean_kernel(x, y):
    """Mean RBF kernel value over all row pairs of x and y."""
    pairwise = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
    return torch.exp(-pairwise).mean()
415
+
416
+
417
class Pretransform(nn.Module):
    """Base class for fixed transforms applied around the autoencoder.

    Subclasses fill in ``encode``/``decode`` (and ``tokenize`` /
    ``decode_tokens`` for discrete transforms).  ``enable_grad`` records
    whether gradients should flow through the transform; subclasses set
    ``encoded_channels`` and ``downsampling_ratio``.
    """

    def __init__(self, enable_grad, io_channels, is_discrete):
        super().__init__()

        self.is_discrete = is_discrete
        self.io_channels = io_channels
        self.enable_grad = enable_grad

        # Filled in by concrete subclasses.
        self.encoded_channels = None
        self.downsampling_ratio = None

    def encode(self, x):
        raise NotImplementedError

    def decode(self, z):
        raise NotImplementedError

    def tokenize(self, x):
        raise NotImplementedError

    def decode_tokens(self, tokens):
        raise NotImplementedError
439
+
440
+
441
class StableVAE(LoadPretrainedBase, AutoEncoderBase):
    """Stable-Audio-style waveform VAE: an encoder/decoder pair with an
    optional stochastic bottleneck, plus checkpoint loading that strips the
    ``autoencoder.`` key prefix used by the pretrained checkpoints.

    Args:
        encoder: maps waveforms to bottleneck inputs.
        decoder: maps latents [B, latent_dim, T'] back to waveforms.
        latent_dim: channel dimension of the latent sequence.
        downsampling_ratio: waveform samples per latent frame.
        sample_rate: audio sample rate the model operates at.
        io_channels: default channel count for both input and output.
        bottleneck: optional bottleneck (e.g. ``VAEBottleneck``).
        pretransform: optional transform applied around the autoencoder.
        in_channels / out_channels: optional asymmetric overrides.
        soft_clip: stored flag (not applied inside this class).
        pretrained_ckpt: checkpoint path to load on construction.
    """
    def __init__(
        self,
        encoder,
        decoder,
        latent_dim,
        downsampling_ratio,
        sample_rate,
        io_channels=2,
        bottleneck: Bottleneck = None,
        pretransform: Pretransform = None,
        in_channels=None,
        out_channels=None,
        soft_clip=False,
        pretrained_ckpt: str | Path = None
    ):
        LoadPretrainedBase.__init__(self)
        AutoEncoderBase.__init__(
            self,
            downsampling_ratio=downsampling_ratio,
            sample_rate=sample_rate,
            latent_shape=(latent_dim, None)
        )

        self.latent_dim = latent_dim
        self.io_channels = io_channels
        self.in_channels = io_channels
        self.out_channels = io_channels
        # Shortest waveform that yields at least one latent frame.
        self.min_length = self.downsampling_ratio

        if in_channels is not None:
            self.in_channels = in_channels

        if out_channels is not None:
            self.out_channels = out_channels

        self.bottleneck = bottleneck
        self.encoder = encoder
        self.decoder = decoder
        self.pretransform = pretransform
        self.soft_clip = soft_clip
        self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete

        # Pretrained checkpoints store every weight under "autoencoder.*".
        self.remove_autoencoder_prefix_fn: Callable = remove_key_prefix_factory(
            "autoencoder."
        )
        if pretrained_ckpt is not None:
            self.load_pretrained(pretrained_ckpt)

    def process_state_dict(self, model_dict, state_dict):
        """Unwrap the Lightning-style ``{"state_dict": ...}`` container and
        strip the ``autoencoder.`` prefix so keys match this module."""
        state_dict = state_dict["state_dict"]
        state_dict = self.remove_autoencoder_prefix_fn(model_dict, state_dict)
        return state_dict

    def encode(
        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor,
        pad_latent_len: int = 500
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode waveforms to latents zero-padded to ``pad_latent_len``.

        Args:
            waveform: input audio batch.
            waveform_lengths: per-sample valid lengths in samples.
            pad_latent_len: fixed latent length to pad up to.

        Returns:
            ``(z, z_mask)``: padded latents [B, C, pad_latent_len] and a
            mask of valid latent frames.
        """
        z = self.encoder(waveform)
        z = self.bottleneck.encode(z)
        z_length = waveform_lengths // self.downsampling_ratio
        z_mask = create_mask_from_length(z_length, max_length=pad_latent_len)

        B, C, L = z.shape
        if L < pad_latent_len:
            pad_size = pad_latent_len - L
            z = torch.cat(
                [z, torch.zeros(B, C, pad_size, device=z.device, dtype=z.dtype)],
                dim=-1
            )
        return z, z_mask

    def decode(self, latents: torch.Tensor, latent_mask: torch.Tensor | None = None) -> torch.Tensor:
        """Decode latents back to waveforms.

        Args:
            latents: [B, C, T_latent]
            latent_mask: optional [B, T_latent]; 1 marks valid frames,
                0 marks padding.
        """
        if latent_mask is None:
            return self.decoder(latents)

        # Decode each sample on its valid frames only, then re-batch.
        # NOTE(review): torch.cat requires all decoded waveforms to have the
        # same length, i.e. equal valid lengths across the batch — confirm
        # callers guarantee this.
        outputs = []
        for b in range(latents.size(0)):
            valid_idx = latent_mask[b].bool()
            valid_latents = latents[b, :, valid_idx]  # [C, T_valid]
            outputs.append(self.decoder(valid_latents.unsqueeze(0)))
        return torch.cat(outputs, dim=0)
526
+
527
+
528
+
529
class StableVAEProjectorWrapper(nn.Module):
    """Uses a frozen ``StableVAE`` as a feature extractor and projects its
    latent channels to ``embed_dim``.

    Only ``self.proj`` is trainable here; the VAE runs under ``no_grad``.
    """
    def __init__(
        self,
        vae_dim: int,
        embed_dim: int,
        model: StableVAE | None = None,
    ):
        super().__init__()
        self.model = model
        self.proj = nn.Linear(vae_dim, embed_dim)

    def forward(
        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
    ) -> dict[str, torch.Tensor]:
        """Encode ``waveform`` with the frozen VAE and project the latents.

        Returns:
            dict with "output" ([B, T_latent, embed_dim]) and "mask"
            ([B, T_latent] validity mask from the VAE encoder).
        """
        self.model.eval()
        with torch.no_grad():
            z, z_mask = self.model.encode(waveform, waveform_lengths, pad_latent_len=500)
        # [B, C, T] -> [B, T, C] so the linear layer projects channels.
        z = self.proj(z.transpose(1, 2))
        return {"output": z, "mask": z_mask}
548
+
549
+
550
if __name__ == '__main__':
    # Smoke test: build the autoencoder from the hydra config, round-trip
    # one audio file through encode/decode, and write both versions to disk.
    import hydra
    from utils.config import generate_config_from_command_line_overrides
    model_config = generate_config_from_command_line_overrides(
        "../../../configs"
    )
    autoencoder: StableVAE = hydra.utils.instantiate(model_config)
    autoencoder.eval()

    waveform, sr = torchaudio.load(
        "/edit/syn_7.wav"
    )
    # Downmix to mono and resample to the model's sample rate.
    waveform = waveform.mean(0, keepdim=True)
    waveform = torchaudio.functional.resample(
        waveform, sr, model_config["sample_rate"]
    )
    import soundfile as sf
    sf.write(
        "./torch_test.wav",
        waveform[0].numpy(),
        samplerate=model_config["sample_rate"]
    )
    print("waveform: ", waveform.shape)
    with torch.no_grad():
        # encode returns (latents, latent_mask); the mask is then reused as
        # the latent_mask argument of decode below.
        latent, latent_length = autoencoder.encode(
            waveform, torch.as_tensor([waveform.shape[-1]])
        )
        print("latent: ", latent.shape)
        print("latent_length: ", latent_length)
        reconstructed = autoencoder.decode(latent, latent_length)
        print("reconstructed: ", reconstructed.shape)

    sf.write(
        "./reconstructed.wav",
        reconstructed[0, 0].numpy(),
        samplerate=model_config["sample_rate"]
    )
models/common.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Sequence
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from utils.torch_utilities import (
8
+ load_pretrained_model, merge_matched_keys, create_mask_from_length,
9
+ loss_with_mask, create_alignment_path
10
+ )
11
+
12
+
13
class LoadPretrainedBase(nn.Module):
    """Mixin that loads pretrained checkpoints with a customizable
    state-dict preprocessing hook."""
    def process_state_dict(
        self, model_dict: dict[str, torch.Tensor],
        state_dict: dict[str, torch.Tensor]
    ):
        """
        Custom processing function of each model that transforms `state_dict`
        loaded from checkpoints into a form usable by `load_state_dict`.
        By default, uses `merge_matched_keys` to update only parameters whose
        names and shapes match the current model.

        Args
            model_dict:
                The state dict of the current model, which is going to load
                pretrained parameters.
            state_dict:
                A dictionary of parameters from a pre-trained model.

        Returns:
            dict[str, torch.Tensor]:
                The updated state dict, where parameters with matched keys
                and shapes are updated with values in `state_dict`.
        """
        state_dict = merge_matched_keys(model_dict, state_dict)
        return state_dict

    def load_pretrained(self, ckpt_path: str | Path):
        """Load parameters from ``ckpt_path``, routed through
        ``process_state_dict`` for key/shape reconciliation."""
        load_pretrained_model(
            self, ckpt_path, state_dict_process_fn=self.process_state_dict
        )
42
+
43
+
44
class CountParamsBase(nn.Module):
    """Mixin that reports parameter counts for a module."""

    def count_params(self):
        """Return ``(total, trainable)`` parameter counts."""
        total = sum(p.numel() for p in self.parameters())
        trainable = sum(
            p.numel() for p in self.parameters() if p.requires_grad
        )
        return total, trainable
53
+
54
+
55
class SaveTrainableParamsBase(nn.Module):
    """Mixin for checkpoints that keep only trainable parameters (plus all
    buffers), with a matching strictness check on load."""

    @property
    def param_names_to_save(self):
        """Names of trainable parameters followed by all buffer names."""
        trainable = [
            name for name, param in self.named_parameters()
            if param.requires_grad
        ]
        buffers = [name for name, _ in self.named_buffers()]
        return trainable + buffers

    def load_state_dict(self, state_dict, strict=True):
        """Load ``state_dict``, verifying every saved-name is present.

        Raises when ``strict`` and any trainable/buffer name is absent;
        otherwise warns and delegates to ``nn.Module.load_state_dict``.
        """
        missing_keys = [
            key for key in self.param_names_to_save if key not in state_dict
        ]

        if missing_keys:
            if strict:
                raise Exception(
                    f"{missing_keys} not found in either pre-trained models (e.g. BERT) or resumed checkpoints (e.g. epoch_40/model.pt)"
                )
            print(f"Warning: missing keys {missing_keys}, skipping them.")

        return super().load_state_dict(state_dict, strict)
models/content_adapter.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Any
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from utils.torch_utilities import concat_non_padding, restore_from_concat, create_mask_from_length
7
+ from models.content_encoder.content_encoder import ContentEncoder
8
+
9
+
10
+ ######################
11
+ # fastspeech modules
12
+ ######################
13
class LayerNorm(nn.LayerNorm):
    """Layer normalization over a configurable dimension.

    :param int nout: size of the normalized dimension
    :param int dim: dimension to normalize (``-1`` for the last)
    """
    def __init__(self, nout, dim=-1):
        """Construct a LayerNorm object (eps fixed at 1e-12)."""
        super(LayerNorm, self).__init__(nout, eps=1e-12)
        self.dim = dim

    def forward(self, x):
        """Apply layer normalization over ``self.dim`` and return the result."""
        if self.dim != -1:
            # Swap the target dim to the end, normalize, swap back.
            return super(LayerNorm,
                         self).forward(x.transpose(1, -1)).transpose(1, -1)
        return super(LayerNorm, self).forward(x)


class DurationPredictor(nn.Module):
    """FastSpeech-style duration predictor: masked Conv1d blocks followed by
    a per-frame linear projection.

    Args:
        in_channels: input feature size.
        filter_channels: hidden channels of every conv block.
        n_layers: number of conv blocks.
        kernel_size: convolution kernel size.
        p_dropout: dropout probability inside each block.
        padding: "SAME" for centered padding, anything else for causal
            (left-only) padding.
    """
    def __init__(
        self,
        in_channels: int,
        filter_channels: int,
        n_layers: int = 2,
        kernel_size: int = 3,
        p_dropout: float = 0.1,
        padding: str = "SAME"
    ):
        super().__init__()
        self.kernel_size = kernel_size
        self.padding = padding
        if padding == 'SAME':
            pad = ((kernel_size - 1) // 2, (kernel_size - 1) // 2)
        else:
            # Causal: all padding on the left so no future frames leak in.
            pad = (kernel_size - 1, 0)
        self.conv = nn.ModuleList()
        for layer_idx in range(n_layers):
            c_in = in_channels if layer_idx == 0 else filter_channels
            self.conv.append(
                nn.Sequential(
                    nn.ConstantPad1d(pad, 0),
                    nn.Conv1d(
                        c_in,
                        filter_channels,
                        kernel_size,
                        stride=1,
                        padding=0
                    ),
                    nn.ReLU(),
                    LayerNorm(filter_channels, dim=1),
                    nn.Dropout(p_dropout),
                )
            )
        self.linear = nn.Linear(filter_channels, 1)

    def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
        """Predict one duration value per frame.

        Args:
            x: [B, T, E] frame features.
            x_mask: [B, T] validity mask (1 = valid).

        Returns:
            [B, T, 1] durations, exactly zero at masked positions.
        """
        h = x.transpose(1, -1)  # [B, E, T] for Conv1d
        chan_mask = x_mask.unsqueeze(1).to(h.device)  # [B, 1, T]
        for block in self.conv:
            h = block(h)
            h = h * chan_mask.float()

        out = self.linear(h.transpose(1, -1)
                          ) * chan_mask.transpose(1, -1).float()  # [B, T, 1]
        return out
+ return x
80
+
81
+
82
+ ######################
83
+ # adapter modules
84
+ ######################
85
+
86
+
87
class ContentAdapterBase(nn.Module):
    """Common base for content adapters; records the output embedding size."""

    def __init__(self, d_out):
        super().__init__()
        # Dimension of the adapted content produced by subclasses.
        self.d_out = d_out
+
92
+
93
class SinusoidalPositionalEmbedding(nn.Module):
    """Additive sinusoidal positional encoding for batch-first inputs.

    Adds the standard Transformer sin/cos position table to ``x`` of shape
    [B, T, D], then applies dropout.

    Args:
        d_model: embedding dimension D.
        dropout: dropout probability applied after the addition.
        max_len: maximum supported sequence length.
    """
    def __init__(self, d_model, dropout, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() *
            (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Store as [1, max_len, d_model] so it broadcasts over the batch dim
        # of batch-first inputs.  (The previous [max_len, 1, d_model] layout
        # only broadcast for sequence-first tensors, while forward indexes
        # with x.size(1) — the batch-first time axis.)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [B, T, D]
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
+ return self.dropout(x)
111
+
112
+
113
class ContentAdapter(ContentAdapterBase):
    """Transformer content adapter with a prepended [CLS] token.

    The [CLS] position yields a global duration prediction; the remaining
    positions yield per-token content embeddings and local durations.
    """
    def __init__(
        self,
        d_model: int,
        d_out: int,
        num_layers: int,
        num_heads: int,
        duration_predictor: DurationPredictor,
        dropout: float = 0.1,
        norm_first: bool = False,
        activation: str = "gelu",
        duration_grad_scale: float = 0.0,
    ):
        super().__init__(d_out)
        self.duration_grad_scale = duration_grad_scale
        self.cls_embed = nn.Parameter(torch.randn(d_model))
        # Nested-tensor fast path is not supported on Ascend NPU builds.
        use_nested = not (hasattr(torch, "npu") and torch.npu.is_available())
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            activation=activation,
            norm_first=norm_first,
            batch_first=True
        )
        self.encoder_layers = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers,
            enable_nested_tensor=use_nested
        )
        self.duration_predictor = duration_predictor
        self.content_proj = nn.Conv1d(d_model, d_out, 1)

    def forward(self, x, x_mask):
        """Return (content, content_mask, global_duration, local_durations)."""
        bsz = x.size(0)
        # Prepend a learned [CLS] token (and a matching always-valid mask).
        cls_tok = self.cls_embed.view(1, 1, -1).expand(bsz, 1, -1).to(x.device)
        x = torch.cat([cls_tok, x], dim=1)
        x_mask = torch.cat(
            [torch.ones(bsz, 1, device=x_mask.device), x_mask], dim=1
        )
        x = self.encoder_layers(x, src_key_padding_mask=~x_mask.bool())
        # Attenuate gradients from the duration head into the encoder.
        scale = self.duration_grad_scale
        x_for_duration = x * scale + x.detach() * (1 - scale)
        duration = self.duration_predictor(x_for_duration, x_mask).squeeze(-1)
        content = self.content_proj(x.transpose(1, 2)).transpose(1, 2)
        # Split off [CLS]: global duration vs per-token outputs.
        return content[:, 1:], x_mask[:, 1:], duration[:, 0], duration[:, 1:]
+ return content[:, 1:], x_mask[:, 1:], duration[:, 0], duration[:, 1:]
164
+
165
+
166
class PrefixAdapter(ContentAdapterBase):
    """Adapter that packs instruction ("prefix") embeddings in front of
    content tokens (plus a [CLS] token), encodes the packed sequence with a
    Transformer, and predicts a global duration (from [CLS]) plus per-token
    local durations.

    Args:
        content_dim: raw content embedding size.
        d_model: Transformer model dimension.
        d_out: output content embedding size.
        prefix_dim: raw instruction embedding size.
        num_layers / num_heads: Transformer depth and heads.
        duration_predictor: per-frame duration module.
        dropout: dropout in the MLPs and Transformer.
        norm_first: pre-norm vs post-norm Transformer layers.
        use_last_norm: apply a final LayerNorm after the encoder.
        activation: Transformer feed-forward activation.
        duration_grad_scale: fraction of duration-loss gradient allowed to
            flow back into the shared features.
    """
    def __init__(
        self,
        content_dim: int,
        d_model: int,
        d_out: int,
        prefix_dim: int,
        num_layers: int,
        num_heads: int,
        duration_predictor: DurationPredictor,
        dropout: float = 0.1,
        norm_first: bool = False,
        use_last_norm: bool = True,
        activation: str = "gelu",
        duration_grad_scale: float = 0.1,
    ):
        super().__init__(d_out)
        self.duration_grad_scale = duration_grad_scale
        # Project instruction (prefix) and content streams into d_model.
        self.prefix_mlp = nn.Sequential(
            nn.Linear(prefix_dim, d_model), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(d_model, d_model)
        )
        self.content_mlp = nn.Sequential(
            nn.Linear(content_dim, d_model), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(d_model, d_model)
        )
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            activation=activation,
            batch_first=True,
            norm_first=norm_first
        )
        # Nested-tensor fast path is unavailable on Ascend NPU builds.
        if hasattr(torch, "npu") and torch.npu.is_available():
            enable_nested_tensor = False
        else:
            enable_nested_tensor = True
        self.cls_embed = nn.Parameter(torch.randn(d_model))
        # self.pos_embed = SinusoidalPositionalEmbedding(d_model, dropout)
        self.layers = nn.TransformerEncoder(
            encoder_layer=layer,
            num_layers=num_layers,
            enable_nested_tensor=enable_nested_tensor
        )
        self.use_last_norm = use_last_norm
        if self.use_last_norm:
            self.last_norm = nn.LayerNorm(d_model)
        self.duration_predictor = duration_predictor
        self.content_proj = nn.Conv1d(d_model, d_out, 1)
        nn.init.normal_(self.cls_embed, 0., 0.02)
        nn.init.xavier_uniform_(self.content_proj.weight)
        nn.init.constant_(self.content_proj.bias, 0.)

    def forward(self, content, content_mask, instruction, instruction_mask):
        """Fuse instruction and content; return
        (content, content_mask, global_duration, local_durations)."""
        batch_size = content.size(0)
        # Prepend a learned [CLS] token to the (projected) content stream.
        cls_embed = self.cls_embed.reshape(1, -1).expand(batch_size, -1)
        cls_embed = cls_embed.to(content.device).unsqueeze(1)
        content = self.content_mlp(content)
        x = torch.cat([cls_embed, content], dim=1)

        cls_mask = torch.ones(batch_size, 1,
                              dtype=bool).to(content_mask.device)
        x_mask = torch.cat([cls_mask, content_mask], dim=1)

        # Pack prefix and content, dropping padding between the two streams;
        # `perm` records how to undo the packing after encoding.
        prefix = self.prefix_mlp(instruction)
        seq, seq_mask, perm = concat_non_padding(
            prefix, instruction_mask, x, x_mask
        )
        # seq = self.pos_embed(seq)
        x = self.layers(seq, src_key_padding_mask=~seq_mask.bool())
        if self.use_last_norm:
            x = self.last_norm(x)
        # Restore the content part (prefix positions are discarded).
        _, x = restore_from_concat(x, instruction_mask, x_mask, perm)

        # Attenuate gradients from the duration head into the encoder.
        x_grad_rescaled = x * self.duration_grad_scale + x.detach(
        ) * (1 - self.duration_grad_scale)
        duration = self.duration_predictor(x_grad_rescaled, x_mask).squeeze(-1)
        content = self.content_proj(x.transpose(1, 2)).transpose(1, 2)
        # Split off [CLS]: global duration vs per-token outputs.
        return content[:, 1:], x_mask[:, 1:], duration[:, 0], duration[:, 1:]
+ return content[:, 1:], x_mask[:, 1:], duration[:, 0], duration[:, 1:]
246
+
247
+
248
class CrossAttentionAdapter(ContentAdapterBase):
    """Adapts content tokens by cross-attending to a prefix sequence, then
    predicts a global duration and per-token local durations from the
    fused features."""
    def __init__(
        self,
        d_out: int,
        content_dim: int,
        prefix_dim: int,
        num_heads: int,
        duration_predictor: DurationPredictor,
        dropout: float = 0.1,
        duration_grad_scale: float = 0.1,
    ):
        super().__init__(d_out)
        self.attn = nn.MultiheadAttention(
            embed_dim=content_dim,
            num_heads=num_heads,
            dropout=dropout,
            kdim=prefix_dim,
            vdim=prefix_dim,
            batch_first=True,
        )
        self.duration_grad_scale = duration_grad_scale
        self.duration_predictor = duration_predictor
        self.global_duration_mlp = nn.Sequential(
            nn.Linear(content_dim, content_dim), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(content_dim, 1)
        )
        self.norm = nn.LayerNorm(content_dim)
        self.content_proj = nn.Conv1d(content_dim, d_out, 1)

    def forward(self, content, content_mask, prefix, prefix_mask):
        """Return (content, content_mask, global_duration, local_durations)."""
        fused, _ = self.attn(
            query=content,
            key=prefix,
            value=prefix,
            key_padding_mask=~prefix_mask.bool()
        )
        valid = content_mask.unsqueeze(-1).float()
        fused = fused * valid
        # Residual connection followed by LayerNorm.
        x = self.norm(fused + content)
        # Attenuate gradients flowing from the duration heads.
        g = self.duration_grad_scale
        x_for_duration = x * g + x.detach() * (1 - g)
        # Masked mean-pool over time for the global duration head.
        pooled = (x_for_duration * valid).sum(dim=1) / \
            content_mask.sum(dim=1, keepdim=True).float()
        global_duration = self.global_duration_mlp(pooled).squeeze(-1)
        local_duration = self.duration_predictor(
            x_for_duration, content_mask
        ).squeeze(-1)
        content = self.content_proj(x.transpose(1, 2)).transpose(1, 2)
        return content, content_mask, global_duration, local_duration
+ return content, content_mask, global_duration, local_duration
297
+
298
+
299
class ExperimentalCrossAttentionAdapter(ContentAdapterBase):
    """Cross-attention adapter with MLP pre-projections on both streams.

    Content tokens attend to prefix (instruction) tokens; the fused
    features drive a global duration head, a per-token duration
    predictor, and a projected content output.

    Args:
        d_out: output content embedding size.
        content_dim: content (query) feature size.
        prefix_dim: prefix (key/value) feature size.
        num_heads: number of attention heads.
        duration_predictor: per-frame duration module.
        dropout: dropout in the MLPs and attention.
        duration_grad_scale: fraction of duration-loss gradient allowed
            back into the shared features.
    """
    def __init__(
        self,
        d_out: int,
        content_dim: int,
        prefix_dim: int,
        num_heads: int,
        duration_predictor: DurationPredictor,
        dropout: float = 0.1,
        duration_grad_scale: float = 0.1,
    ):
        super().__init__(d_out)
        self.content_mlp = nn.Sequential(
            nn.Linear(content_dim, content_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(content_dim, content_dim),
        )
        self.content_norm = nn.LayerNorm(content_dim)
        self.prefix_mlp = nn.Sequential(
            nn.Linear(prefix_dim, prefix_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(prefix_dim, prefix_dim),
        )
        # Fix: the prefix branch outputs prefix_dim features, so its
        # LayerNorm must be over prefix_dim.  (It was LayerNorm(content_dim),
        # which crashes in forward whenever prefix_dim != content_dim.)
        self.prefix_norm = nn.LayerNorm(prefix_dim)
        self.attn = nn.MultiheadAttention(
            embed_dim=content_dim,
            num_heads=num_heads,
            dropout=dropout,
            kdim=prefix_dim,
            vdim=prefix_dim,
            batch_first=True,
        )
        self.duration_grad_scale = duration_grad_scale
        self.duration_predictor = duration_predictor
        self.global_duration_mlp = nn.Sequential(
            nn.Linear(content_dim, content_dim), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(content_dim, 1)
        )
        self.content_proj = nn.Sequential(
            nn.Linear(content_dim, d_out),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_out, d_out),
        )
        self.norm1 = nn.LayerNorm(content_dim)
        self.norm2 = nn.LayerNorm(d_out)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize every Linear layer; zero its bias."""
        def _init_weights(module):
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0.)

        self.apply(_init_weights)

    def forward(self, content, content_mask, prefix, prefix_mask):
        """Return (content, content_mask, global_duration, local_durations)."""
        content = self.content_mlp(content)
        content = self.content_norm(content)
        prefix = self.prefix_mlp(prefix)
        prefix = self.prefix_norm(prefix)
        attn_output, attn_weights = self.attn(
            query=content,
            key=prefix,
            value=prefix,
            key_padding_mask=~prefix_mask.bool(),
        )
        attn_output = attn_output * content_mask.unsqueeze(-1).float()
        # Residual connection + LayerNorm.
        x = attn_output + content
        x = self.norm1(x)
        # Attenuate gradients from the duration heads into the features.
        x_grad_rescaled = x * self.duration_grad_scale + x.detach(
        ) * (1 - self.duration_grad_scale)
        # Masked mean-pool over time for the global duration head.
        x_aggregated = (x_grad_rescaled * content_mask.unsqueeze(-1).float()
                        ).sum(dim=1) / content_mask.sum(dim=1,
                                                        keepdim=True).float()
        global_duration = self.global_duration_mlp(x_aggregated).squeeze(-1)
        local_duration = self.duration_predictor(
            x_grad_rescaled, content_mask
        ).squeeze(-1)
        content = self.content_proj(x)
        content = self.norm2(content)
        return content, content_mask, global_duration, local_duration
+ return content, content_mask, global_duration, local_duration
384
+
385
+
386
class ContentEncoderAdapterMixin:
    """Mixin wiring a ContentEncoder and an optional ContentAdapter into a
    host model.

    NOTE(review): this is a plain mixin (not an ``nn.Module``); the class
    that mixes it in must be a Module for the assigned submodules to be
    registered — confirm against the consuming classes.
    """
    def __init__(
        self,
        content_encoder: ContentEncoder,
        content_adapter: ContentAdapterBase | None = None
    ):
        self.content_encoder = content_encoder
        self.content_adapter = content_adapter

    def encode_content(
        self,
        content: list[Any],
        task: list[str],
        device: str | torch.device,
        instruction: torch.Tensor | None = None,
        instruction_lengths: torch.Tensor | None = None
    ):
        """Encode raw content and optionally fuse it with instruction
        embeddings through the adapter (which also predicts durations).

        Args:
            content: per-sample raw content handled by the content encoder.
            task: per-sample task names.
            device: device for created tensors.
            instruction: optional padded instruction embeddings; when given,
                the adapter is applied and duration predictions are returned.
            instruction_lengths: valid lengths for ``instruction``.

        Returns:
            dict with "content", "content_mask", "length_aligned_content",
            plus "global_duration_pred"/"local_duration_pred" when an
            instruction is supplied.
        """
        content_output: dict[
            str, torch.Tensor] = self.content_encoder.encode_content(
                content, task, device=device
            )
        content, content_mask = content_output["content"], content_output[
            "content_mask"]

        if instruction is not None:
            instruction_mask = create_mask_from_length(instruction_lengths)
            (
                content,
                content_mask,
                global_duration_pred,
                local_duration_pred,
            ) = self.content_adapter(
                content, content_mask, instruction, instruction_mask
            )

        return_dict = {
            "content": content,
            "content_mask": content_mask,
            "length_aligned_content": content_output["length_aligned_content"],
        }
        if instruction is not None:
            return_dict["global_duration_pred"] = global_duration_pred
            return_dict["local_duration_pred"] = local_duration_pred

        return return_dict
+ return return_dict
models/content_encoder/__pycache__/content_encoder.cpython-310.pyc ADDED
Binary file (3.46 kB). View file
 
models/content_encoder/__pycache__/llm_encoder.cpython-310.pyc ADDED
Binary file (6.16 kB). View file
 
models/content_encoder/content_encoder.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
class ContentEncoder(nn.Module):
    """Routes raw task content to per-modality encoders and pads the
    per-sample outputs into batch tensors.

    Only the "audio_editing" path is implemented in ``encode_content``:
    ``text_encoder`` (actually an audio-LLM encoder) consumes both the
    caption and the waveform, while ``audio_encoder`` produces the
    length-aligned content stream.
    """
    def __init__(
        self,
        embed_dim: int,
        text_encoder: nn.Module = None,
        llm_encoder: nn.Module = None,
        video_encoder: nn.Module = None,
        midi_encoder: nn.Module = None,
        phoneme_encoder: nn.Module = None,
        pitch_encoder: nn.Module = None,
        audio_encoder: nn.Module = None
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.text_encoder = text_encoder
        self.midi_encoder = midi_encoder
        self.phoneme_encoder = phoneme_encoder
        self.pitch_encoder = pitch_encoder
        self.audio_encoder = audio_encoder
        self.video_encoder = video_encoder
        # NOTE(review): ``llm_encoder`` is accepted but never stored —
        # confirm whether it should be kept like the other encoders.

    def encode_content(
        self, batch_content: list[Any], batch_task: list[str],
        device: str | torch.device
    ):
        """Encode a batch sample-by-sample and pad the results.

        Args:
            batch_content: per-sample dicts; for "audio_editing" each holds
                an "audio" array and a "caption" string.
            batch_task: task name per sample (only "audio_editing" handled).
            device: device for created tensors.

        Returns:
            dict with padded "content", "content_mask",
            "length_aligned_content", "time_aligned_content_mask".
        """
        batch_content_output = []
        batch_content_mask = []
        batch_la_content_output = []
        batch_la_content_output_mask = []
        # Fallback when an encoder returns no mask (see .get below).
        zero_la_content = torch.zeros(1, 1, self.embed_dim, device=device)

        for i, (content, task) in enumerate(zip(batch_content, batch_task)):
            if task == "audio_editing":
                raw_waveform = torch.as_tensor(content["audio"]).float()
                waveform_with_batch_dim = raw_waveform.unsqueeze(0).to(device)
                waveform_lengths = torch.as_tensor([raw_waveform.shape[0]])

                # Note: text encoder actually is audiollm encoder, encode both waveform and caption
                content_output_dict = self.text_encoder(
                    [content["caption"]], waveform_with_batch_dim
                )
                audio_dict = {
                    "waveform": waveform_with_batch_dim,
                    "waveform_lengths": waveform_lengths
                }
                audio_output_dict = self.audio_encoder(**audio_dict)
                la_content_output_dict = {
                    "output": audio_output_dict["output"],
                    "mask": audio_output_dict["mask"]
                }

            # Strip the batch dim; pad_sequence below re-batches.
            batch_content_output.append(content_output_dict["output"][0])
            batch_content_mask.append(content_output_dict["mask"][0])
            batch_la_content_output.append(la_content_output_dict["output"][0])
            # NOTE(review): the .get default is an *embedding* tensor used as
            # a mask fallback — looks suspicious; confirm intended shape.
            batch_la_content_output_mask.append(
                la_content_output_dict.get("mask", zero_la_content)[0]
            )

        batch_content_output = nn.utils.rnn.pad_sequence(
            batch_content_output, batch_first=True, padding_value=0
        )
        batch_content_mask = nn.utils.rnn.pad_sequence(
            batch_content_mask, batch_first=True, padding_value=False
        )
        batch_la_content_output = nn.utils.rnn.pad_sequence(
            batch_la_content_output, batch_first=True, padding_value=0
        )

        batch_la_content_output_mask = nn.utils.rnn.pad_sequence(
            batch_la_content_output_mask, batch_first=True, padding_value=False
        )
        return {
            "content": batch_content_output,
            "content_mask": batch_content_mask,
            "length_aligned_content": batch_la_content_output,
            "time_aligned_content_mask": batch_la_content_output_mask
        }
+
84
+
85
+
86
class BatchedContentEncoder(ContentEncoder):
    """ContentEncoder variant that sends the whole batch of captions and
    waveforms through the text (audio-LLM) encoder in one call, instead of
    looping sample-by-sample."""
    def encode_content(
        self, batch_content: list[dict], batch_task: list[str],
        device: str | torch.device
    ):
        """Batched encoding; only the "audio_editing" task is supported.

        Returns the same dict layout as ``ContentEncoder.encode_content``.
        """
        assert all(task == "audio_editing" for task in batch_task), \
            "BatchedContentEncoder now are only support audio_editing"

        # NOTE(review): defined but unused in this method.
        zero_la_content = torch.zeros(1, 1, self.embed_dim, device=device)

        captions = []
        waveforms = []
        waveform_lengths = []
        for content in batch_content:
            raw_waveform = torch.as_tensor(content["audio"]).float().to(device)
            captions.append(content["caption"])
            waveforms.append(raw_waveform)
            waveform_lengths.append(raw_waveform.shape[0])

        # One batched call through the audio-LLM text encoder.
        content_output_dict = self.text_encoder(
            captions, waveforms
        )

        # The audio encoder is still applied per sample, then padded.
        batch_la_content_output = []
        batch_la_content_output_mask = []
        for i in range(len(batch_content)):
            audio_dict = {
                "waveform": waveforms[i].unsqueeze(0),
                "waveform_lengths": torch.as_tensor([waveform_lengths[i]], device=device)
            }
            audio_output_dict = self.audio_encoder(**audio_dict)
            batch_la_content_output.append(audio_output_dict["output"][0])
            batch_la_content_output_mask.append(audio_output_dict["mask"][0])

        # Pad the per-sample audio-encoder outputs into batch tensors.
        batch_la_content_output = nn.utils.rnn.pad_sequence(
            batch_la_content_output, batch_first=True, padding_value=0
        )
        batch_la_content_output_mask = nn.utils.rnn.pad_sequence(
            batch_la_content_output_mask, batch_first=True, padding_value=False
        )

        return {
            "content": content_output_dict["output"],
            "content_mask": content_output_dict["mask"],
            "length_aligned_content": batch_la_content_output,
            "time_aligned_content_mask": batch_la_content_output_mask
        }
+ }
models/content_encoder/llm_encoder.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import librosa
4
+ import numpy as np
5
+ from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
6
+ import os
7
# NOTE(review): not used by this module yet; presumably the
# prompt-enhancement prefix intended for Qwen2-Audio generation — confirm
# against callers before relying on it.
QWEN_AUDIO_PREFIX = '''Given a user prompt and an audio clip, generate an "Enhanced prompt" that provides detailed descriptions suitable for audio generation. Evaluate the audio and user prompt:
- If the prompt is simple, focus on adding specifics about tones, instruments, rhythms, tempos, and audio characteristics to create vivid and concrete audio descriptions.
- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.\n
Here are examples of how to transform or refine prompts:
- User Prompt: Piano music -> Enhanced: A gentle, melancholic piano piece with delicate arpeggios in a minor key, featuring subtle reverb that creates a sense of space and intimacy.
- User Prompt: City sounds -> Enhanced: A bustling urban soundscape with distant traffic noise, occasional car horns, footsteps on concrete sidewalks, and the murmur of crowd conversations, with subtle pigeons cooing in the background.\n
Please generate only the enhanced description for the audio and prompt below and avoid including any additional commentary or evaluations:
User Prompt:'''
+ User Prompt:'''
16
+
17
class Qwen2AudioEmbedder(nn.Module):
    """Embeds (text, audio) pairs with a frozen Qwen2-Audio backbone.

    Chat-formatted text plus audio are run through the frozen LM; the last
    hidden states are padded/truncated to ``max_length`` frames and
    projected to ``embed_dim`` by a trainable linear layer.
    """
    def __init__(self, model_path, embed_dim=256, max_length=320, dtype=torch.float, device="cuda"):
        super().__init__()
        self.max_length = max_length
        self.device = device
        self.embed_dim = embed_dim

        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=dtype,
            # Pin the backbone to this process's local-rank GPU.
            device_map={"": int(os.environ.get("LOCAL_RANK", 0))}
        )
        # Freeze the backbone: no gradients flow through Qwen2-Audio.
        self.model.requires_grad_(False)
        self.model.eval()
        self.processor = AutoProcessor.from_pretrained(model_path)

        # Projection from the LM hidden size (4096) down to embed_dim.
        # NOTE(review): this layer stays trainable even though the backbone
        # is frozen — confirm it is registered with the optimizer.
        self.proj = nn.Linear(4096, embed_dim, device=device, dtype=dtype)
        self.prefix = QWEN_AUDIO_PREFIX

    def forward(self, text, audio_data):
        """
        Args:
            text: list of text descriptions.
            audio_data: list of audio clips (numpy arrays or tensors).
        Returns:
            dict with "output": embedding tensor, "mask": boolean mask.
        """
        output, mask = self.encode(text, audio_data)
        output = self.projection(output)
        return {"output": output, "mask": mask}

    def encode(self, text, audio_data):
        """Encode text and audio into the embedding space.

        Returns ``(embs, masks)`` where both are truncated or zero/False
        padded to exactly ``self.max_length`` along the sequence dimension.
        """
        batch_size = len(text)

        # Resample every clip to the 16 kHz the processor expects.
        # NOTE(review): assumes all inputs are sampled at 24 kHz — confirm.
        processed_audios = []
        for audio in audio_data:
            if isinstance(audio, torch.Tensor):
                audio = audio.cpu().numpy()
            audio = librosa.resample(audio, orig_sr=24000, target_sr=16000)
            processed_audios.append(audio)

        # Build chat-formatted conversation strings for the whole batch.
        conversations = []
        for txt in text:
            conversation = [
                {"role": "user", "content": [
                    # Audio slot is a placeholder; the processor injects the
                    # real audio features in the batched call below.
                    {"type": "audio", "audio": None},
                    {"type": "text", "text": txt}
                ]}
            ]
            formatted_text = self.processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
            conversations.append(formatted_text)

        with torch.no_grad():
            # One batched call; the processor pads text and audio jointly.
            inputs = self.processor(
                text=conversations,
                audio=processed_audios,
                return_tensors="pt",
                sampling_rate=16000,
                padding=True,
                truncation=True  # never exceed the model's max length
            )

            # Move all processor outputs to the target device.
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            outputs = self.model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                input_features=inputs["input_features"],
                feature_attention_mask=inputs["feature_attention_mask"],
                output_hidden_states=True,
            )

            # Last-layer hidden states for every position.
            hidden_states_full = outputs.hidden_states[-1]

            # Force the output length to exactly self.max_length:
            # 1. truncate or zero-pad the hidden states ...
            current_len = hidden_states_full.shape[1]
            if current_len > self.max_length:
                embs = hidden_states_full[:, :self.max_length, :]
            else:
                pad_width = self.max_length - current_len
                # (batch, pad_width, hidden) zeros appended on the right.
                padding = torch.zeros(
                    hidden_states_full.shape[0],
                    pad_width,
                    hidden_states_full.shape[2],
                    device=self.device,
                    dtype=hidden_states_full.dtype
                )
                embs = torch.cat([hidden_states_full, padding], dim=1)

            # 2. ... and truncate or False-pad the attention mask to match.
            attention_mask = inputs["attention_mask"]
            if current_len > self.max_length:
                masks = attention_mask[:, :self.max_length].bool()
            else:
                pad_width = self.max_length - current_len
                # (batch, pad_width) of False for the padded tail.
                mask_padding = torch.zeros(
                    attention_mask.shape[0],
                    pad_width,
                    device=self.device,
                    dtype=torch.bool
                )
                masks = torch.cat([attention_mask.bool(), mask_padding], dim=1)

        return embs, masks

    def projection(self, x):
        """Project embeddings to the configured output dimension."""
        return self.proj(x)
+ return self.proj(x)
151
+
152
+
153
+
154
+
155
+ if __name__ == "__main__":
156
+ import argparse
157
+
158
+ parser = argparse.ArgumentParser(description="Test Qwen Audio Encoder")
159
+ parser.add_argument("--model_path", type=str, default="/mnt/petrelfs/taoye/workspace/model/qwen25audio",
160
+ help="Path to Qwen Audio model")
161
+ parser.add_argument("--embed_dim", type=int, default=4096,
162
+ help="Target embedding dimension after projection")
163
+ args = parser.parse_args()
164
+
165
+ print(f"Loading model from {args.model_path}...")
166
+
167
+ # 初始化编码器
168
+ device = "cuda" if torch.cuda.is_available() else "cpu"
169
+ embedder = Qwen2AudioEmbedder(
170
+ model_path=args.model_path,
171
+ embed_dim=args.embed_dim,
172
+ max_length=640,
173
+ dtype=torch.float,
174
+ device=device
175
+ )
176
+
177
+ # 准备测试批次
178
+ captions = [
179
+ "Describe this audio",
180
+ "What musical instruments are being played in this recording?"
181
+ ]
182
+
183
+ # 直接加载音频数据
184
+ audio_path = "/mnt/petrelfs/taoye/workspace/editing/data/add/add_fore_audio_caps_begin_1/audio/edit/syn_5.wav"
185
+ audio_data = []
186
+ for _ in range(len(captions)):
187
+ waveform, sr = librosa.load(audio_path,sr=24000)
188
+ # print(sr)
189
+ audio_data.append(waveform)
190
+
191
+ # 获取嵌入
192
+ with torch.no_grad():
193
+ output = embedder(captions, audio_data)
194
+
195
+ # 打印结果
196
+ print("模型输出的字典:")
197
+ print(f"包含keys: {list(output.keys())}")
198
+
199
+ print("\n输出张量的形状:")
200
+ print(output['output'].shape)
201
+
202
+ print("\n掩码张量的形状:")
203
+ print(output['mask'].shape)
204
+
205
+ # 验证嵌入维度是否符合预期
206
+ assert output['output'].shape[-1] == args.embed_dim, f"输出维度 {output['output'].shape[-1]} 不等于预期维度 {args.embed_dim}"
207
+ print(f"\n成功验证:输出维度 = {args.embed_dim}")
208
+
209
+ # 显示样本嵌入值
210
+ print(f"样本嵌入值:\n{output['output'][0, :5, :5]}")
211
+ print(f"非零掩码位置数量: {output['mask'][0,:]}")
212
+ # 显示第一个样本中非零掩码位置的数量
213
+ print(f"第一个样本的非零掩码位置数量: {output['mask'][0].sum().item()}")
214
+
215
+
models/content_encoder/text_encoder.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel
4
+ from transformers.modeling_outputs import BaseModelOutput
5
+
6
+
7
+ DEVICE_TYPE = "cuda"
8
+
9
+
10
class TransformersTextEncoderBase(nn.Module):
    """Text encoder built on a HuggingFace ``AutoModel``.

    Tokenizes a batch of strings, runs the backbone, and projects the
    last hidden state to ``embed_dim``.  ``forward`` returns a dict with
    ``output`` (projected features) and ``mask`` (boolean attention mask).
    """

    def __init__(self, model_name: str, embed_dim: int):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Map the backbone's hidden size to the requested embedding width.
        self.proj = nn.Linear(self.model.config.hidden_size, embed_dim)

    def forward(
        self,
        text: list[str],
    ):
        hidden, mask = self.encode(text)
        return {"output": self.projection(hidden), "mask": mask}

    def encode(self, text: list[str]):
        """Tokenize and encode; returns (last_hidden_state, bool mask)."""
        device = self.model.device
        tokenized = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = tokenized.input_ids.to(device)
        attention_mask = tokenized.attention_mask.to(device)
        encoded: BaseModelOutput = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        )
        hidden = encoded.last_hidden_state
        # Boolean mask: True where the token is real (not padding).
        mask = (attention_mask == 1).to(device)
        return hidden, mask

    def projection(self, x):
        """Project backbone features to the target embedding dimension."""
        return self.proj(x)
45
+
46
+
47
class T5TextEncoder(TransformersTextEncoderBase):
    """Frozen FLAN-T5 encoder variant of :class:`TransformersTextEncoderBase`.

    The T5 backbone is frozen (no gradients) and always encodes under
    ``no_grad`` with autocast disabled, since T5 is numerically fragile
    in reduced precision.
    """

    def __init__(
        self, embed_dim: int, model_name: str = "google/flan-t5-large"
    ):
        # Deliberately skip the base __init__ to swap in T5-specific
        # tokenizer/model classes; only nn.Module's setup is needed.
        nn.Module.__init__(self)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5EncoderModel.from_pretrained(model_name)
        self.model.eval()
        for param in self.model.parameters():
            param.requires_grad = False
        self.proj = nn.Linear(self.model.config.hidden_size, embed_dim)

    def encode(
        self,
        text: list[str],
    ):
        # Force full precision and no autograd for the frozen backbone.
        no_amp = torch.amp.autocast(device_type=DEVICE_TYPE, enabled=False)
        with torch.no_grad(), no_amp:
            return super().encode(text)
67
+
68
+
69
if __name__ == "__main__":
    # Quick manual check: embed two captions and inspect the output shapes.
    encoder = T5TextEncoder(embed_dim=512)
    captions = ["a man is speaking", "a woman is singing while a dog is barking"]

    result = encoder(captions)
    print(result)
    print(result['output'].shape)
    print(result['mask'].shape)
models/diffusion.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Sequence
2
+ import random
3
+ from typing import Any
4
+
5
+ from tqdm import tqdm
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import diffusers.schedulers as noise_schedulers
10
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
11
+ from diffusers.utils.torch_utils import randn_tensor
12
+
13
+ from models.autoencoder.autoencoder_base import AutoEncoderBase
14
+ from models.content_encoder.content_encoder import ContentEncoder
15
+ from models.content_adapter import ContentAdapterBase, ContentEncoderAdapterMixin
16
+ import soundfile as sf
17
+ from models.common import (
18
+ LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
19
+ )
20
+ from utils.torch_utilities import (
21
+ create_alignment_path, create_mask_from_length, loss_with_mask,
22
+ trim_or_pad_length
23
+ )
24
+
25
+
26
class DiffusionMixin:
    """Shared DDPM training utilities.

    Provides scheduler construction, SNR computation for min-SNR loss
    weighting, timestep sampling, noising, and classifier-free-guidance
    rescaling.  Intended to be mixed into an ``nn.Module`` subclass;
    ``__init__`` only assigns plain attributes, so ``nn.Module.__init__``
    must have been called first by the concrete class.
    """
    def __init__(
        self,
        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
        snr_gamma: float = None,
        cfg_drop_ratio: float = 0.2
    ) -> None:
        # snr_gamma: clamp for min-SNR loss weighting
        # (https://arxiv.org/abs/2303.09556); None disables SNR weighting.
        # cfg_drop_ratio: probability of dropping the condition during
        # training; any value > 0 enables classifier-free guidance.
        self.noise_scheduler_name = noise_scheduler_name
        self.snr_gamma = snr_gamma
        self.classifier_free_guidance = cfg_drop_ratio > 0.0
        self.cfg_drop_ratio = cfg_drop_ratio
        # DDPM scheduler loaded from the HF repo's "scheduler" subfolder.
        self.noise_scheduler = noise_schedulers.DDPMScheduler.from_pretrained(
            self.noise_scheduler_name, subfolder="scheduler"
        )

    def compute_snr(self, timesteps) -> torch.Tensor:
        """
        Compute the signal-to-noise ratio (alpha/sigma)^2 at each timestep.

        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
        """
        alphas_cumprod = self.noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = alphas_cumprod**0.5
        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5

        # Expand the tensors.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(
            device=timesteps.device
        )[timesteps].float()
        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)

        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
            device=timesteps.device
        )[timesteps].float()
        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[
                ..., None]
        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)

        # Compute SNR.
        snr = (alpha / sigma)**2
        return snr

    def get_timesteps(
        self,
        batch_size: int,
        device: torch.device,
        training: bool = True
    ) -> torch.Tensor:
        """Sample one diffusion timestep per batch element.

        Training draws uniformly over all train timesteps; validation uses
        the fixed midpoint so the metric is comparable across runs.
        """
        if training:
            timesteps = torch.randint(
                0,
                self.noise_scheduler.config.num_train_timesteps,
                (batch_size, ),
                device=device
            )
        else:
            # validation on half of the total timesteps
            timesteps = (self.noise_scheduler.config.num_train_timesteps //
                         2) * torch.ones((batch_size, ),
                                         dtype=torch.int64,
                                         device=device)

        timesteps = timesteps.long()
        return timesteps

    def get_input_target_and_timesteps(
        self,
        latent: torch.Tensor,
        training: bool,
    ):
        """Noise `latent` at sampled timesteps.

        Returns (noisy_latent, regression_target, timesteps); the target
        depends on the scheduler's prediction type (see `get_target`).
        """
        batch_size = latent.shape[0]
        device = latent.device
        num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
        self.noise_scheduler.set_timesteps(num_train_timesteps, device=device)
        timesteps = self.get_timesteps(batch_size, device, training=training)
        noise = torch.randn_like(latent)
        noisy_latent = self.noise_scheduler.add_noise(latent, noise, timesteps)
        target = self.get_target(latent, noise, timesteps)
        return noisy_latent, target, timesteps

    def get_target(
        self, latent: torch.Tensor, noise: torch.Tensor,
        timesteps: torch.Tensor
    ) -> torch.Tensor:
        """
        Get the target for loss depending on the prediction type
        """
        if self.noise_scheduler.config.prediction_type == "epsilon":
            target = noise
        elif self.noise_scheduler.config.prediction_type == "v_prediction":
            target = self.noise_scheduler.get_velocity(
                latent, noise, timesteps
            )
        else:
            raise ValueError(
                f"Unknown prediction type {self.noise_scheduler.config.prediction_type}"
            )
        return target

    def loss_with_snr(
        self,
        pred: torch.Tensor,
        target: torch.Tensor,
        timesteps: torch.Tensor,
        mask: torch.Tensor,
        reduce: bool = True
    ) -> torch.Tensor:
        """Masked MSE loss, optionally weighted by min(SNR, snr_gamma)/SNR."""
        if self.snr_gamma is None:
            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
            loss = loss_with_mask(loss, mask, reduce=reduce)
        else:
            # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
            # Adapted from https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py#L1006
            snr = self.compute_snr(timesteps)
            mse_loss_weights = torch.stack(
                [
                    snr,
                    self.snr_gamma * torch.ones_like(timesteps),
                ],
                dim=1,
            ).min(dim=1)[0]
            # division by (snr + 1) does not work well, not clear about the reason
            mse_loss_weights = mse_loss_weights / snr
            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
            loss = loss_with_mask(loss, mask, reduce=False) * mse_loss_weights
            if reduce:
                loss = loss.mean()
        return loss

    def rescale_cfg(
        self, pred_cond: torch.Tensor, pred_cfg: torch.Tensor,
        guidance_rescale: float
    ):
        """
        Rescale `pred_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
        Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
        """
        std_cond = pred_cond.std(
            dim=list(range(1, pred_cond.ndim)), keepdim=True
        )
        std_cfg = pred_cfg.std(dim=list(range(1, pred_cfg.ndim)), keepdim=True)

        # Match the guided prediction's per-sample std to the conditional one,
        # then blend with the unrescaled prediction by `guidance_rescale`.
        pred_rescaled = pred_cfg * (std_cond / std_cfg)
        pred_cfg = guidance_rescale * pred_rescaled + (
            1 - guidance_rescale
        ) * pred_cfg
        return pred_cfg
175
+
176
class SingleTaskCrossAttentionAudioDiffusion(
    LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
    DiffusionMixin, ContentEncoderAdapterMixin
):
    """Latent audio diffusion model conditioned via cross-attention.

    A frozen autoencoder maps waveforms to latents; a content encoder
    produces both a cross-attention context and a time-aligned condition;
    the trainable `backbone` predicts noise (or velocity) in latent space.
    """
    def __init__(
        self,
        autoencoder: AutoEncoderBase,
        content_encoder: ContentEncoder,
        backbone: nn.Module,
        content_dim: int,
        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
        snr_gamma: float = None,
        cfg_drop_ratio: float = 0.2,
    ):
        # NOTE(review): `content_dim` is accepted but never used in this
        # class — presumably kept for config compatibility; confirm.
        nn.Module.__init__(self)
        DiffusionMixin.__init__(
            self, noise_scheduler_name, snr_gamma, cfg_drop_ratio
        )
        ContentEncoderAdapterMixin.__init__(
            self, content_encoder=content_encoder
        )
        # The autoencoder is frozen; only the backbone (and adapters) train.
        self.autoencoder = autoencoder
        for param in self.autoencoder.parameters():
            param.requires_grad = False

        # Let an audio-conditioned content encoder reuse the same
        # (frozen) autoencoder instead of loading its own copy.
        if hasattr(self.content_encoder, "audio_encoder"):
            self.content_encoder.audio_encoder.model = self.autoencoder

        self.backbone = backbone
        # Zero-size parameter used only to discover the module's device.
        self.dummy_param = nn.Parameter(torch.empty(0))

    def forward(
        self, content: list[Any], task: list[str],
        waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
    ):
        """Compute the diffusion training/validation loss for one batch."""
        device = self.dummy_param.device

        self.autoencoder.eval()
        self.content_encoder.eval()
        with torch.no_grad():
            # Encode waveforms to latents, padded to a fixed latent length.
            latent, latent_mask = self.autoencoder.encode(
                waveform.unsqueeze(1), waveform_lengths, pad_latent_len=500
            )

        with torch.no_grad():
            content_dict = self.content_encoder.encode_content(content, task, device)
            context, context_mask = content_dict["content"], content_dict[
                "content_mask"]
            time_aligned_content = content_dict["length_aligned_content"]
            time_aligned_content_mask = content_dict[
                "time_aligned_content_mask"
            ]
            # NOTE(review): the mask returned by the autoencoder is
            # discarded and replaced by the time-aligned content mask —
            # this assumes both are at the same latent frame rate; confirm.
            latent_mask = time_aligned_content_mask.to(device)

        # Classifier-free guidance: randomly zero the cross-attention
        # context for a fraction of samples (the time-aligned condition
        # is deliberately kept — see comment below).
        if self.training and self.classifier_free_guidance:
            mask_indices = [
                k for k in range(len(waveform))
                if random.random() < self.cfg_drop_ratio
            ]
            if len(mask_indices) > 0:
                context[mask_indices] = 0
                # dont mask!
                # time_aligned_content[mask_indices] = 0

        noisy_latent, target, timesteps = self.get_input_target_and_timesteps(
            latent, self.training
        )

        pred: torch.Tensor = self.backbone(
            x=noisy_latent,
            timesteps=timesteps,
            time_aligned_context=time_aligned_content,
            context=context,
            x_mask=latent_mask,
            context_mask=context_mask
        )

        # Move the time axis to dim 1 so the mask broadcasts over it.
        pred = pred.transpose(1, self.autoencoder.time_dim)
        target = target.transpose(1, self.autoencoder.time_dim)
        loss = self.loss_with_snr(pred, target, timesteps, latent_mask)

        return loss

    def prepare_latent(
        self, batch_size: int, scheduler: SchedulerMixin,
        latent_shape: Sequence[int], dtype: torch.dtype, device: str
    ):
        """Draw the initial Gaussian latent for sampling."""
        shape = (batch_size, *latent_shape)
        latent = randn_tensor(
            shape, generator=None, device=device, dtype=dtype
        )
        # scale the initial noise by the standard deviation required by the scheduler
        latent = latent * scheduler.init_noise_sigma
        return latent

    def iterative_denoise(
        self,
        latent: torch.Tensor,
        scheduler: SchedulerMixin,
        verbose: bool,
        cfg: bool,
        cfg_scale: float,
        cfg_rescale: float,
        backbone_input: dict,
    ):
        """Run the scheduler's reverse loop, optionally with CFG.

        When `cfg` is True, `backbone_input` tensors must already hold the
        unconditional batch concatenated before the conditional batch.
        """
        timesteps = scheduler.timesteps
        num_steps = len(timesteps)
        num_warmup_steps = len(timesteps) - num_steps * scheduler.order
        progress_bar = tqdm(range(num_steps), disable=not verbose)

        for i, timestep in enumerate(timesteps):
            # expand the latent if we are doing classifier free guidance
            if cfg:
                latent_input = torch.cat([latent, latent])
            else:
                latent_input = latent
            latent_input = scheduler.scale_model_input(latent_input, timestep)
            noise_pred = self.backbone(
                x=latent_input, timesteps=timestep, **backbone_input
            )

            # perform guidance
            if cfg:
                noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + cfg_scale * (
                    noise_pred_content - noise_pred_uncond
                )
                if cfg_rescale != 0.0:
                    noise_pred = self.rescale_cfg(
                        noise_pred_content, noise_pred, cfg_rescale
                    )

            # compute the previous noisy sample x_t -> x_t-1
            latent = scheduler.step(noise_pred, timestep, latent).prev_sample

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
                                           (i + 1) % scheduler.order == 0):
                progress_bar.update(1)

        progress_bar.close()

        return latent

    @torch.no_grad()
    def inference(
        self,
        content: list[Any],
        task: list[str],
        scheduler: SchedulerMixin,
        num_steps: int = 50,
        guidance_scale: float = 3.0,
        guidance_rescale: float = 0.0,
        disable_progress: bool = True,
        mask_time_aligned_content: bool = True,  # zero the time-aligned condition in the uncond branch
        **kwargs
    ):
        """Generate waveforms from content conditions.

        CFG is active when `guidance_scale` > 1; the latent length/width
        are taken from the time-aligned content (T frames, C channels).
        """
        device = self.dummy_param.device
        classifier_free_guidance = guidance_scale > 1.0
        batch_size = len(content)

        content_dict = self.content_encoder.encode_content(content, task, device)

        context, context_mask = content_dict["content"], content_dict[
            "content_mask"]
        time_aligned_content = content_dict["length_aligned_content"]
        time_aligned_content_mask = content_dict[
            "time_aligned_content_mask"
        ]

        B, T, C = time_aligned_content.shape
        latent_shape = (C, T)  # e.g. (128, 500)
        latent_mask = time_aligned_content_mask.to(device)

        if classifier_free_guidance:
            # Build the unconditional branch: zeroed (or copied) conditions
            # concatenated *before* the conditional batch.
            if mask_time_aligned_content:
                uncond_time_aligned_content = torch.zeros_like(time_aligned_content)
            else:
                uncond_time_aligned_content = time_aligned_content.detach().clone()

            uncond_context = torch.zeros_like(context)
            uncond_context_mask = context_mask.detach().clone()
            time_aligned_content = torch.cat([
                uncond_time_aligned_content, time_aligned_content
            ])
            context = torch.cat([uncond_context, context])
            context_mask = torch.cat([uncond_context_mask, context_mask])
            latent_mask = torch.cat([
                latent_mask, latent_mask.detach().clone()
            ])

        scheduler.set_timesteps(num_steps, device=device)

        latent = self.prepare_latent(
            batch_size, scheduler, latent_shape, context.dtype, device
        )

        latent = self.iterative_denoise(
            latent=latent,
            scheduler=scheduler,
            verbose=not disable_progress,
            cfg=classifier_free_guidance,
            cfg_scale=guidance_scale,
            cfg_rescale=guidance_rescale,
            backbone_input={
                "x_mask": latent_mask,
                "context": context,
                "context_mask": context_mask,
                "time_aligned_context": time_aligned_content,
            }
        )
        # NOTE(review): under CFG, `latent_mask` has been doubled to
        # 2*batch_size while `latent` stays at batch_size — verify
        # `autoencoder.decode` tolerates (or broadcasts over) this.
        waveform = self.autoencoder.decode(latent, latent_mask)

        return waveform
400
+
401
+
models/dit/__init__.py ADDED
File without changes
models/dit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
models/dit/__pycache__/mmdit_back.cpython-310.pyc ADDED
Binary file (8.63 kB). View file
 
models/dit/__pycache__/mmdit_layers.cpython-310.pyc ADDED
Binary file (11.5 kB). View file
 
models/dit/__pycache__/modules.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
models/dit/attention.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import torch.utils.checkpoint
5
+ import einops
6
+ from einops import rearrange, repeat
7
+ from inspect import isfunction
8
+ from .rotary import RotaryEmbedding
9
+ from .modules import RMSNorm
10
+
11
# Pick the attention implementation once at import time: use PyTorch's
# fused scaled_dot_product_attention when available, otherwise fall back
# to the explicit matmul/softmax path.
ATTENTION_MODE = (
    'flash'
    if hasattr(nn.functional, 'scaled_dot_product_attention') else 'math'
)
print(f'attention mode is {ATTENTION_MODE}')
16
+
17
+
18
def add_mask(sim, mask):
    """Fill masked-out attention scores with the most negative finite value.

    Args:
        sim: attention scores of shape (B, H, N, M).
        mask: boolean mask, True = keep; shape (B, N, M) or (N, M).
    Returns:
        `sim` with positions where mask is False set to -finfo.max, so
        they vanish after softmax.
    """
    b, ndim = sim.shape[0], mask.ndim
    # Use native tensor ops instead of einops for these trivial reshapes.
    if ndim == 3:
        # (B, N, M) -> (B, 1, N, M): broadcast over heads.
        mask = mask.unsqueeze(1)
    if ndim == 2:
        # (N, M) -> (B, 1, N, M): broadcast over batch and heads.
        mask = mask.unsqueeze(0).unsqueeze(0).expand(b, 1, -1, -1)
    max_neg_value = -torch.finfo(sim.dtype).max
    sim = sim.masked_fill(~mask, max_neg_value)
    return sim
27
+
28
+
29
def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
    """Build a boolean (B, 1, I, J) attention mask from query/key masks.

    Args:
        q_shape: shape of the query tensor; uses dims 0 (batch) and -2 (I).
        k_shape: shape of the key tensor; uses dim -2 (J).
        device: device to place/move the masks on.
        q_mask: optional (B, I) bool mask; defaults to all-True.
        k_mask: optional (B, J) bool mask; defaults to all-True.
    Returns:
        Bool tensor of shape (B, 1, I, J), True where both query i and
        key j are valid.
    """
    b, i, j = q_shape[0], q_shape[-2], k_shape[-2]
    # Default missing masks to all-valid (the original routed this through
    # a `default` helper with a dead isfunction branch).
    if q_mask is None:
        q_mask = torch.ones((b, i), device=device, dtype=torch.bool)
    if k_mask is None:
        k_mask = torch.ones((b, j), device=device, dtype=torch.bool)
    q_mask = q_mask.to(device)
    k_mask = k_mask.to(device)
    # Outer product of the two masks: (B,1,I,1) & (B,1,1,J) -> (B,1,I,J).
    attn_mask = q_mask[:, None, :, None] * k_mask[:, None, None, :]
    return attn_mask
44
+
45
+
46
class Attention(nn.Module):
    """Multi-head self-/cross-attention with optional RoPE and QK-norm.

    Self-attention when `context_dim` is None, cross-attention otherwise.
    `rope_mode` selects rotary-embedding behavior ('none' | 'shared' |
    'x_only' | 'dual'); cross-attention requires 'none'.  `extras` in
    `forward` is the number of leading prefix tokens treated specially
    by the 'x_only'/'dual' RoPE modes.
    """
    def __init__(
        self,
        dim,
        context_dim=None,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        qk_norm=None,
        attn_drop=0.,
        proj_drop=0.,
        rope_mode='none'
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Used only by the 'math' fallback; sdpa applies its own scaling.
        self.scale = qk_scale or head_dim**-0.5

        if context_dim is None:
            self.cross_attn = False
        else:
            self.cross_attn = True

        context_dim = dim if context_dim is None else context_dim

        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)

        if qk_norm is None:
            self.norm_q = nn.Identity()
            self.norm_k = nn.Identity()
        elif qk_norm == 'layernorm':
            self.norm_q = nn.LayerNorm(head_dim)
            self.norm_k = nn.LayerNorm(head_dim)
        elif qk_norm == 'rmsnorm':
            self.norm_q = RMSNorm(head_dim)
            self.norm_k = RMSNorm(head_dim)
        else:
            raise NotImplementedError

        self.attn_drop_p = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # RoPE is positional and only meaningful for self-attention.
        if self.cross_attn:
            assert rope_mode == 'none'
        self.rope_mode = rope_mode
        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
            self.rotary = RotaryEmbedding(dim=head_dim)
        elif self.rope_mode == 'dual':
            self.rotary_x = RotaryEmbedding(dim=head_dim)
            self.rotary_c = RotaryEmbedding(dim=head_dim)

    def _rotary(self, q, k, extras):
        """Apply rotary embeddings according to `rope_mode`.

        `extras` leading tokens are left unrotated ('x_only') or rotated
        with a separate module ('dual').
        """
        if self.rope_mode == 'shared':
            q, k = self.rotary(q=q, k=k)
        elif self.rope_mode == 'x_only':
            q_x, k_x = self.rotary(
                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
            )
            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
            q = torch.cat((q_c, q_x), dim=2)
            k = torch.cat((k_c, k_x), dim=2)
        elif self.rope_mode == 'dual':
            q_x, k_x = self.rotary_x(
                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
            )
            q_c, k_c = self.rotary_c(
                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
            )
            q = torch.cat((q_c, q_x), dim=2)
            k = torch.cat((k_c, k_x), dim=2)
        elif self.rope_mode == 'none':
            pass
        else:
            raise NotImplementedError
        return q, k

    def _attn(self, q, k, v, mask_binary):
        """Scaled-dot-product attention; returns (B, L, H*D)."""
        if ATTENTION_MODE == 'flash':
            x = F.scaled_dot_product_attention(
                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
            )
            x = einops.rearrange(x, 'B H L D -> B L (H D)')
        elif ATTENTION_MODE == 'math':
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = add_mask(
                attn, mask_binary
            ) if mask_binary is not None else attn
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            # BUGFIX: the original did `(attn @ v).transpose(1, 2)` AND
            # rearranged with 'B H L D', swapping H and L twice and
            # producing a (B, H, L*D) tensor that breaks `self.proj`.
            # `attn @ v` is already (B, H, L, D); rearrange alone suffices.
            x = attn @ v
            x = einops.rearrange(x, 'B H L D -> B L (H D)')
        else:
            raise NotImplementedError
        return x

    def forward(self, x, context=None, context_mask=None, extras=0):
        """Attend `x` to itself (or to `context` when given).

        Args:
            x: (B, L, C) query sequence.
            context: optional (B, Lc, Cc) key/value sequence.
            context_mask: optional (B, Lc) bool mask over keys.
            extras: number of leading prefix tokens for RoPE handling.
        """
        B, L, C = x.shape
        if context is None:
            context = x

        q = self.to_q(x)
        k = self.to_k(context)
        v = self.to_v(context)

        if context_mask is not None:
            mask_binary = create_mask(
                x.shape, context.shape, x.device, None, context_mask
            )
        else:
            mask_binary = None

        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads)
        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads)
        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads)

        q = self.norm_q(q)
        k = self.norm_k(k)

        q, k = self._rotary(q, k, extras)

        x = self._attn(q, k, v, mask_binary)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
+
176
+
177
class JointAttention(nn.Module):
    """MMDiT-style joint attention over concatenated [context, x] tokens.

    Both streams get their own QKV projections and QK-norms; their
    queries/keys/values are concatenated (context first), attended
    jointly, then split back and projected separately.
    """
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        qk_norm=None,
        attn_drop=0.,
        proj_drop=0.,
        rope_mode='none'
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Used only by the 'math' fallback; sdpa applies its own scaling.
        self.scale = qk_scale or head_dim**-0.5

        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(
            dim, qkv_bias
        )
        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(
            dim, qkv_bias
        )

        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)

        self.attn_drop_p = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)

        self.proj_x = nn.Linear(dim, dim)
        self.proj_drop_x = nn.Dropout(proj_drop)

        self.proj_c = nn.Linear(dim, dim)
        self.proj_drop_c = nn.Dropout(proj_drop)

        self.rope_mode = rope_mode
        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
            self.rotary = RotaryEmbedding(dim=head_dim)
        elif self.rope_mode == 'dual':
            self.rotary_x = RotaryEmbedding(dim=head_dim)
            self.rotary_c = RotaryEmbedding(dim=head_dim)

    def _make_qkv_layers(self, dim, qkv_bias):
        """Return (to_q, to_k, to_v) linear projections for one stream."""
        return (
            nn.Linear(dim, dim,
                      bias=qkv_bias), nn.Linear(dim, dim, bias=qkv_bias),
            nn.Linear(dim, dim, bias=qkv_bias)
        )

    def _make_norm_layers(self, qk_norm, head_dim):
        """Return (norm_q, norm_k) per-head normalization layers."""
        if qk_norm is None:
            norm_q = nn.Identity()
            norm_k = nn.Identity()
        elif qk_norm == 'layernorm':
            norm_q = nn.LayerNorm(head_dim)
            norm_k = nn.LayerNorm(head_dim)
        elif qk_norm == 'rmsnorm':
            norm_q = RMSNorm(head_dim)
            norm_k = RMSNorm(head_dim)
        else:
            raise NotImplementedError
        return norm_q, norm_k

    def _rotary(self, q, k, extras):
        """Apply rotary embeddings according to `rope_mode`.

        `extras` leading (context) tokens are left unrotated ('x_only')
        or rotated with a separate module ('dual').
        """
        if self.rope_mode == 'shared':
            q, k = self.rotary(q=q, k=k)
        elif self.rope_mode == 'x_only':
            q_x, k_x = self.rotary(
                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
            )
            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
            q = torch.cat((q_c, q_x), dim=2)
            k = torch.cat((k_c, k_x), dim=2)
        elif self.rope_mode == 'dual':
            q_x, k_x = self.rotary_x(
                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
            )
            q_c, k_c = self.rotary_c(
                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
            )
            q = torch.cat((q_c, q_x), dim=2)
            k = torch.cat((k_c, k_x), dim=2)
        elif self.rope_mode == 'none':
            pass
        else:
            raise NotImplementedError
        return q, k

    def _attn(self, q, k, v, mask_binary):
        """Scaled-dot-product attention; returns (B, L, H*D)."""
        if ATTENTION_MODE == 'flash':
            x = F.scaled_dot_product_attention(
                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
            )
            x = einops.rearrange(x, 'B H L D -> B L (H D)')
        elif ATTENTION_MODE == 'math':
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = add_mask(
                attn, mask_binary
            ) if mask_binary is not None else attn
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            # BUGFIX: the original did `(attn @ v).transpose(1, 2)` AND
            # rearranged with 'B H L D', swapping H and L twice and
            # producing a (B, H, L*D) tensor that breaks the projections.
            # `attn @ v` is already (B, H, L, D); rearrange alone suffices.
            x = attn @ v
            x = einops.rearrange(x, 'B H L D -> B L (H D)')
        else:
            raise NotImplementedError
        return x

    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
        """Concatenate key masks in [context, x] order, defaulting to all-True."""
        B = x.shape[0]
        if x_mask is None:
            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
        if context_mask is None:
            context_mask = torch.ones(
                B, context.shape[-2], device=context.device
            ).bool()
        mask = torch.cat([context_mask, x_mask], dim=1)
        return mask

    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
        """Jointly attend over [context, x]; returns (x_out, context_out).

        Args:
            x: (B, Lx, C) main stream.
            context: (B, Lc, C) conditioning stream.
            x_mask / context_mask: optional (B, L*) bool masks.
            extras: number of leading prefix tokens for RoPE handling.
        """
        B, Lx, C = x.shape
        _, Lc, _ = context.shape
        if x_mask is not None or context_mask is not None:
            mask = self._cat_mask(
                x, context, x_mask=x_mask, context_mask=context_mask
            )
            shape = [B, Lx + Lc, C]
            mask_binary = create_mask(
                q_shape=shape,
                k_shape=shape,
                device=x.device,
                q_mask=None,
                k_mask=mask
            )
        else:
            mask_binary = None

        qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
        qc, kc, vc = self.to_qc(context), self.to_kc(context
                                                     ), self.to_vc(context)

        qx, kx, vx = map(
            lambda t: einops.
            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
            [qx, kx, vx]
        )
        qc, kc, vc = map(
            lambda t: einops.
            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
            [qc, kc, vc]
        )

        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
        qc, kc = self.norm_qc(qc), self.norm_kc(kc)

        # Joint sequence: context tokens first, then x tokens.
        q, k, v = (
            torch.cat([qc, qx],
                      dim=2), torch.cat([kc, kx],
                                        dim=2), torch.cat([vc, vx], dim=2)
        )

        q, k = self._rotary(q, k, extras)

        x = self._attn(q, k, v, mask_binary)

        # Split the joint output back into the two streams.
        context, x = x[:, :Lc, :], x[:, Lc:, :]

        x = self.proj_x(x)
        x = self.proj_drop_x(x)

        context = self.proj_c(context)
        context = self.proj_drop_c(context)

        return x, context
models/dit/mmdit_back.py ADDED
@@ -0,0 +1,346 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ # 假设这些是你原来的导入
10
+ from .mmdit_layers import compute_rope_rotations
11
+ from .mmdit_layers import TimestepEmbedder
12
+ from .mmdit_layers import MLP, ChannelLastConv1d, ConvMLP
13
+ from .mmdit_layers import (FinalBlock, MMDitSingleBlock, JointBlock_AT)
14
+
15
+ log = logging.getLogger()
16
+
17
+
18
@dataclass
class PreprocessedConditions:
    """Pre-computed text conditioning produced by MMAudio.preprocess_conditions.

    text_f: per-token text features after text_input_proj, (B, N_text, hidden_dim).
    text_f_c: sequence-pooled global text feature, (B, hidden_dim).
    """
    text_f: torch.Tensor
    text_f_c: torch.Tensor
22
+
23
+
24
class MMAudio(nn.Module):
    """
    A modified MMAudio whose interface is kept as close as possible to
    LayerFusionAudioDiT.
    """
    def __init__(self,
                 *,
                 latent_dim: int,
                 text_dim: int,
                 hidden_dim: int,
                 depth: int,
                 fused_depth: int,
                 num_heads: int,
                 mlp_ratio: float = 4.0,
                 latent_seq_len: int,
                 text_seq_len: int = 640,
                 # --- added parameters, aligned with LayerFusionAudioDiT ---
                 ta_context_dim: int,
                 ta_context_fusion: str = 'add',  # 'add' or 'concat'
                 ta_context_norm: bool = False,
                 # --- other pre-existing parameters ---
                 empty_string_feat: Optional[torch.Tensor] = None,
                 v2: bool = False) -> None:
        super().__init__()

        self.v2 = v2
        self.latent_dim = latent_dim
        self._latent_seq_len = latent_seq_len
        self._text_seq_len = text_seq_len
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # --- 1. Projection for time_aligned_context ---
        # A single projection defined here instead of one per block; this is
        # more efficient and matches the original intent ("currently each
        # layer projects; change to a single mapping"): project once, then
        # pass the result to every layer.
        self.ta_context_fusion = ta_context_fusion
        self.ta_context_norm_flag = ta_context_norm

        if self.ta_context_fusion == "add":
            # Additive fusion: project ta_context to the latent width (hidden_dim).
            self.ta_context_projection = nn.Linear(ta_context_dim, hidden_dim, bias=False)
            self.ta_context_norm = nn.LayerNorm(ta_context_dim) if self.ta_context_norm_flag else nn.Identity()
        elif self.ta_context_fusion == "concat":
            # Concat fusion is handled inside the blocks, so no main projection
            # here. (The original code also projected after concat; that could
            # be implemented in-block. For simplicity the main fusion logic is
            # assumed to live in the blocks.)
            self.ta_context_projection = nn.Identity()
            self.ta_context_norm = nn.Identity()
        else:
            raise ValueError(f"Unknown ta_context_fusion type: {ta_context_fusion}")


        # --- Original input projections (mostly unchanged) ---
        # The input is now an editing pair, so the conv takes latent_dim * 2.
        self.audio_input_proj = nn.Sequential(
            ChannelLastConv1d(latent_dim*2, hidden_dim, kernel_size=7, padding=3),
            nn.SELU(),
            ConvMLP(hidden_dim, hidden_dim * 4, kernel_size=7, padding=3),
        )
        self.text_input_proj = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            MLP(hidden_dim, hidden_dim * 4),
        )

        self.text_cond_proj = nn.Linear(hidden_dim, hidden_dim)
        self.global_cond_mlp = MLP(hidden_dim, hidden_dim * 4)

        # Timestep embedding (sinusoidal features + MLP).
        self.t_embed = TimestepEmbedder(hidden_dim, frequency_embedding_size=256, max_period=10000)

        # --- Transformer blocks (mostly unchanged) ---
        # IMPORTANT: JointBlock_AT / MMDitSingleBlock forward definitions would
        # need changes to receive `time_aligned_context` directly.
        self.joint_blocks = nn.ModuleList([
            JointBlock_AT(hidden_dim, num_heads, mlp_ratio=mlp_ratio, pre_only=(i == depth - fused_depth - 1))
            for i in range(depth - fused_depth)
        ])
        self.fused_blocks = nn.ModuleList([
            MMDitSingleBlock(hidden_dim, num_heads, mlp_ratio=mlp_ratio, kernel_size=3, padding=1)
            for i in range(fused_depth)
        ])

        # --- Output head (unchanged) ---
        self.final_layer = FinalBlock(hidden_dim, latent_dim)

        # Frozen placeholder text features used for the empty string.
        if empty_string_feat is None:
            empty_string_feat = torch.zeros((text_seq_len, text_dim))

        self.empty_string_feat = nn.Parameter(empty_string_feat, requires_grad=False)

        self.initialize_weights()
        self.initialize_rotations()

    def initialize_rotations(self):
        """(Re)build the RoPE rotation table for the current latent length."""
        base_freq = 1.0

        # The only place the sequence length is actually needed.
        latent_rot = compute_rope_rotations(self._latent_seq_len,
                                            self.hidden_dim // self.num_heads,
                                            10000,
                                            freq_scaling=base_freq,
                                            device="cuda" if torch.cuda.is_available() else "cpu")

        # add to model buffers
        self.register_buffer('latent_rot', latent_rot, persistent=False)
        # self.clip_rot = nn.Buffer(clip_rot, persistent=False)

    def update_seq_lengths(self, latent_seq_len: int, clip_seq_len: int, sync_seq_len: int) -> None:
        # NOTE(review): clip/sync lengths are stored but only the latent length
        # feeds initialize_rotations() here — confirm they are used elsewhere.
        self._latent_seq_len = latent_seq_len
        self._clip_seq_len = clip_seq_len
        self._sync_seq_len = sync_seq_len
        self.initialize_rotations()

    def initialize_weights(self):
        """Xavier init for linears; zero-init AdaLN modulations and output head."""

        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embed.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embed.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks (compatibility guard):
        for block in self.joint_blocks:
            nn.init.constant_(block.latent_block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.latent_block.adaLN_modulation[-1].bias, 0)
            nn.init.constant_(block.text_block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.text_block.adaLN_modulation[-1].bias, 0)
        for block in self.fused_blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.conv.weight, 0)
        nn.init.constant_(self.final_layer.conv.bias, 0)

    def preprocess_conditions(self, text_f: torch.Tensor) -> PreprocessedConditions:
        """Project text features and pool a global text condition vector."""
        # Preprocess the text condition.
        # assert text_f.shape[1] == self._text_seq_len, f'{text_f.shape=} {self._text_seq_len=}'
        bs = text_f.shape[0]

        # The external LLM embedding is kept fixed; only this projection learns.
        text_f = self.text_input_proj(text_f)
        # Global (sequence-pooled) condition.
        text_f_c = self.text_cond_proj(text_f.mean(dim=1))
        return PreprocessedConditions(text_f=text_f, text_f_c=text_f_c)

    def predict_flow(self, x: torch.Tensor, timesteps: torch.Tensor,
                     conditions: PreprocessedConditions,
                     time_aligned_context: torch.Tensor) -> torch.Tensor:
        """
        Core prediction pass, now including time_aligned_context.

        ``x`` is channel-first (B, latent_dim, N); the returned flow is
        channel-last (B, N, latent_dim) — the caller transposes it back.
        """
        assert x.shape[2] == self._latent_seq_len, f'{x.shape=} {self._latent_seq_len=}'

        # 1. Unpack the preprocessed inputs.
        text_f = conditions.text_f
        text_f_c = conditions.text_f_c

        timesteps = timesteps.to(x.dtype)  # keep the same dtype as the input tensor

        global_c = self.global_cond_mlp(text_f_c)  # (B, D)

        # 2. Fuse the timestep embedding into the global condition.
        global_c = self.t_embed(timesteps).unsqueeze(1) + global_c.unsqueeze(1)  # (B, 1, D)
        extended_c = global_c  # used as the AdaLN condition
        """
        This determines x's shape — needs a debug check.
        """
        # 3. Handle time_aligned_context. First variant: fuse with the latent
        # directly along the feature axis, then project (128 -> 256).
        x = torch.cat([x.transpose(1, 2), time_aligned_context], dim=-1)
        latent = self.audio_input_proj(x)  # (B, N, D)

        # 4. Run the transformer blocks.
        for block in self.joint_blocks:
            # (JointBlock_AT.forward would need changes to consume ta context.)
            latent, text_f = block(latent, text_f, global_c, extended_c,
                                   self.latent_rot)

        for block in self.fused_blocks:
            # (MMDitSingleBlock.forward would need changes to consume ta context.)
            latent = block(latent, extended_c, self.latent_rot)

        # 5. Output head.
        flow = self.final_layer(latent, global_c)
        return flow

    def forward(self,
                x: torch.Tensor,
                timesteps: torch.Tensor,
                context: torch.Tensor,
                time_aligned_context: torch.Tensor,
                x_mask=None,
                context_mask=None,
                ) -> torch.Tensor:
        """
        Main entry point; interface aligned with LayerFusionAudioDiT.
        - x: noisy latent, shape (B, latent_dim, N_latent)
          (channel-first: predict_flow asserts x.shape[2] == latent_seq_len)
        - timesteps: diffusion timesteps, shape (B,) or scalar
        - context: text condition, shape (B, N_text, text_dim)
        - time_aligned_context: time-aligned condition, shape (B, N_latent, ta_context_dim)
        Returns the predicted flow, channel-first (B, latent_dim, N_latent).
        """

        # Broadcast a scalar timestep to the whole batch.
        if timesteps.dim() == 0:
            timesteps = timesteps.expand(x.shape[0]).to(x.device, dtype=torch.long)

        text_conditions = self.preprocess_conditions(context)

        # Core flow prediction (channel-last internally).
        flow = self.predict_flow(x, timesteps, text_conditions, time_aligned_context)

        # Back to channel-first to match the input layout.
        flow = flow.transpose(1, 2)

        return flow

    @property
    def latent_seq_len(self) -> int:
        # Current latent sequence length the RoPE table was built for.
        return self._latent_seq_len
259
+
260
+
261
+ # latent(b,500,128)
262
+
263
def small_16k(**kwargs) -> MMAudio:
    """Factory for the 'small' 16 kHz MMAudio configuration.

    Extra keyword arguments (e.g. ta_context_dim, ta_context_fusion) are
    forwarded to the MMAudio constructor.
    """
    heads = 16
    base = dict(latent_dim=128,
                text_dim=1024,
                hidden_dim=64 * heads,
                depth=12,
                fused_depth=8,
                num_heads=heads,
                latent_seq_len=500)
    # Duplicate keys in kwargs raise TypeError, same as explicit keywords would.
    return MMAudio(**base, **kwargs)
273
+
274
+
275
+
276
+
277
if __name__ == '__main__':
    # Smoke test: build the small 16 kHz model and run a single forward pass
    # on dummy inputs, checking that the output shape is as expected.
    batch_size = 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    config = {
        "ta_context_dim": 128,
        "ta_context_fusion": "concat",
        "ta_context_norm": False
    }

    try:
        model = small_16k(**config).to(device)
        model.eval()  # evaluation mode
        print("Model instantiated successfully!")
    except Exception as e:
        print(f"Error during model instantiation: {e}")
        exit()

    num_params = sum(p.numel() for p in model.parameters()) / 1e6
    print(f'Number of parameters: {num_params:.2f}M')

    # Dummy input dimensions matching the small_16k configuration.
    latent_dim = 128
    latent_seq_len = 500
    text_dim = 1024
    text_seq_len = 640
    ta_context_dim = config["ta_context_dim"]

    dummy_x = torch.randn(batch_size,latent_dim, latent_seq_len, device=device)
    dummy_timesteps = torch.randint(0, 1000, (batch_size,), device=device)
    dummy_context = torch.randn(batch_size, text_seq_len, text_dim, device=device)

    # time_aligned_context must match x's sequence length so the two can be
    # concatenated along the feature dimension inside predict_flow.
    dummy_ta_context = torch.randn(batch_size, latent_seq_len, ta_context_dim, device=device)

    print("\n--- Input Shapes ---")
    print(f"x (latent): {dummy_x.shape}")
    print(f"timesteps: {dummy_timesteps.shape}")
    print(f"context (text): {dummy_context.shape}")
    print(f"time_aligned_context: {dummy_ta_context.shape}")
    print("--------------------\n")

    # 4. Run the forward pass.
    try:
        with torch.no_grad():  # no gradients needed for this check
            output = model(
                x=dummy_x,
                timesteps=dummy_timesteps,
                context=dummy_context,
                time_aligned_context=dummy_ta_context
            )
        print("✅ Forward pass successful!")
        print(f"Output shape: {output.shape}")

        # 5. Validate the output shape.
        # NOTE(review): model.forward transposes back to (B, latent_dim, N);
        # this expected shape looks inconsistent with that — verify which
        # layout is intended (the assert failure is caught and printed below).
        expected_shape = (batch_size, latent_seq_len, latent_dim)
        assert output.shape == expected_shape, \
            f"Output shape mismatch! Expected {expected_shape}, but got {output.shape}"
        print("✅ Output shape is correct!")

    except Exception as e:
        print(f"❌ Error during forward pass: {e}")
        import traceback
        traceback.print_exc()
models/dit/mmdit_layers.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ from typing import Union
3
+
4
+ import torch
5
+ from einops import rearrange
6
+ from torch import Tensor
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+ from einops.layers.torch import Rearrange
11
+
12
+
13
+ import torch
14
+ from torch import nn
15
+ from torch.nn import functional as F
16
+
17
+ from .modules import RMSNorm
18
+
19
+ # https://github.com/facebookresearch/DiT
20
+ # Ref: https://github.com/black-forest-labs/flux/blob/main/src/flux/math.py
21
+ # Ref: https://github.com/lucidrains/rotary-embedding-torch
22
+
23
+
24
def compute_rope_rotations(length: int,
                           dim: int,
                           theta: int,
                           *,
                           freq_scaling: float = 1.0,
                           device: Union[torch.device, str] = 'cpu') -> Tensor:
    """Precompute RoPE rotation matrices.

    Returns a tensor of shape (1, length, dim // 2, 2, 2) holding, for each
    position and frequency pair, the rotation [[cos, -sin], [sin, cos]].
    Computed in float32 with autocast disabled for numerical stability.
    """
    assert dim % 2 == 0

    with torch.amp.autocast(device_type='cuda', enabled=False):
        positions = torch.arange(length, dtype=torch.float32, device=device)
        inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
        inv_freq = inv_freq * freq_scaling

        # Outer product: angle per (position, frequency) pair.
        angles = torch.outer(positions, inv_freq)
        rot = torch.stack(
            [torch.cos(angles), -torch.sin(angles),
             torch.sin(angles), torch.cos(angles)],
            dim=-1)
        # (length, dim//2, 4) -> (1, length, dim//2, 2, 2); same memory order
        # as rearrange('n d (i j) -> 1 n d i j', i=2, j=2).
        return rot.reshape(1, length, dim // 2, 2, 2)
41
+
42
+
43
def apply_rope(x: Tensor, rot: Tensor) -> Tensor:
    """Apply precomputed RoPE rotations to ``x``.

    The last dimension of ``x`` is interpreted as consecutive pairs; each pair
    is multiplied by the corresponding 2x2 rotation from ``rot`` (as produced
    by compute_rope_rotations). Math runs in float32 with autocast disabled,
    then the result is cast back to x's dtype.

    Fix: the original return annotation claimed ``tuple[Tensor, Tensor]`` but
    the function returns a single tensor of x's shape.
    """
    with torch.amp.autocast(device_type='cuda', enabled=False):
        _x = x.float()
        _x = _x.view(*_x.shape[:-1], -1, 1, 2)
        # Row-wise 2x2 matmul: out_i = rot[..., i, 0]*a + rot[..., i, 1]*b.
        x_out = rot[..., 0] * _x[..., 0] + rot[..., 1] * _x[..., 1]
        return x_out.reshape(*x.shape).to(dtype=x.dtype)
49
+
50
+
51
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    Sinusoidal features (with a precomputed frequency table scaled by
    10000 / max_period) are passed through a two-layer SiLU MLP.
    """

    def __init__(self, dim, frequency_embedding_size, max_period):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, dim),
            nn.SiLU(),
            nn.Linear(dim, dim),
        )
        self.dim = dim
        self.max_period = max_period
        assert dim % 2 == 0, 'dim must be even.'

        with torch.autocast('cuda', enabled=False):
            # 1. Compute the final frequency table first.
            initial_freqs = 1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
                                           frequency_embedding_size))
            freq_scale = 10000 / max_period
            freqs_tensor = freq_scale * initial_freqs

            # 2. Register the final tensor as a non-persistent buffer.
            self.register_buffer('freqs', freqs_tensor, persistent=False)

    def timestep_embedding(self, t):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :return: an (N, frequency_embedding_size) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py

        args = t[:, None].float() * self.freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        return embedding

    def forward(self, t):
        # Cast features to t's dtype before the MLP (mixed-precision safety).
        t_freq = self.timestep_embedding(t).to(t.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb
96
+
97
class ChannelLastConv1d(nn.Conv1d):
    """Conv1d accepting channel-last input: (B, T, C_in) -> (B, T, C_out)."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # nn.Conv1d expects (B, C, T): transpose in, convolve, transpose out.
        y = super().forward(x.permute(0, 2, 1))
        return y.permute(0, 2, 1)
104
+
105
+
106
+ # https://github.com/Stability-AI/sd3-ref
107
class MLP(nn.Module):
    """SwiGLU feed-forward: w2(silu(w1(x)) * w3(x)).

    The hidden width is 2/3 of ``hidden_dim``, rounded up to the next
    multiple of ``multiple_of`` (LLaMA-style sizing). All projections are
    bias-free linears.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int = 256,
    ):
        super().__init__()
        width = int(2 * hidden_dim / 3)
        # Round up to a multiple of `multiple_of` for hardware-friendly shapes.
        width = multiple_of * ((width + multiple_of - 1) // multiple_of)

        self.w1 = nn.Linear(dim, width, bias=False)
        self.w2 = nn.Linear(width, dim, bias=False)
        self.w3 = nn.Linear(dim, width, bias=False)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
139
+
140
+
141
class ConvMLP(nn.Module):
    """SwiGLU feed-forward built from channel-last 1-D convolutions.

    Computes w2(silu(w1(x)) * w3(x)) on (B, T, C) input, mixing locally along
    the time axis. The hidden width is 2/3 of ``hidden_dim`` rounded up to a
    multiple of ``multiple_of``; all convs are bias-free.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        multiple_of: int = 256,
        kernel_size: int = 3,
        padding: int = 1,
    ):
        super().__init__()
        width = int(2 * hidden_dim / 3)
        # Round up to a multiple of `multiple_of` for hardware-friendly shapes.
        width = multiple_of * ((width + multiple_of - 1) // multiple_of)

        def conv(c_in, c_out):
            # All three branches share identical conv hyper-parameters.
            return ChannelLastConv1d(c_in, c_out, bias=False,
                                     kernel_size=kernel_size,
                                     padding=padding)

        self.w1 = conv(dim, width)
        self.w2 = conv(width, dim)
        self.w3 = conv(dim, width)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        return self.w2(gate * self.w3(x))
187
+
188
+
189
+
190
def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor):
    """AdaLN modulation: scale x around identity, then shift."""
    return shift + x * (scale + 1)
192
+
193
+
194
+
195
+
196
def attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
    """Scaled-dot-product attention over (B, H, N, D), returning (B, N, H*D).

    The .contiguous() calls work around a cuDNN limitation — training crashes
    without them; believed related to
    https://github.com/pytorch/pytorch/issues/133974 (unresolved at the time
    of writing).
    """
    out = F.scaled_dot_product_attention(
        q.contiguous(), k.contiguous(), v.contiguous())
    # Merge heads back into the channel axis: (B, H, N, D) -> (B, N, H*D).
    b, h, n, d = out.shape
    return out.permute(0, 2, 1, 3).reshape(b, n, h * d).contiguous()
206
+
207
+
208
class SelfAttention(nn.Module):
    """Multi-head self-attention with per-head RMSNorm on q/k and optional RoPE.

    pre_attention() returns per-head (q, k, v) of shape (B, H, N, D) so
    callers can run joint attention across concatenated token streams;
    forward() is the plain self-contained path.
    """

    def __init__(self, dim: int, nheads: int):
        super().__init__()
        self.dim = dim
        self.nheads = nheads

        # Fused q/k/v projection; split into heads with the q/k/v axis last.
        self.qkv = nn.Linear(dim, dim * 3, bias=True)
        self.q_norm = RMSNorm(dim // nheads)
        self.k_norm = RMSNorm(dim // nheads)

        self.split_into_heads = Rearrange('b n (h d j) -> b h n d j',
                                          h=nheads,
                                          d=dim // nheads,
                                          j=3)

    def pre_attention(
            self, x: torch.Tensor,
            rot: Optional[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Project x to per-head q/k/v, RMS-normalize q/k, apply RoPE if given.

        x: batch_size * n_tokens * n_channels.
        """
        qkv = self.qkv(x)
        q, k, v = self.split_into_heads(qkv).chunk(3, dim=-1)
        q = q.squeeze(-1)
        k = k.squeeze(-1)
        v = v.squeeze(-1)
        q = self.q_norm(q)
        k = self.k_norm(k)

        if rot is not None:
            q = apply_rope(q, rot)
            k = apply_rope(k, rot)

        return q, k, v

    def forward(
        self,
        x: torch.Tensor,  # batch_size * n_tokens * n_channels
    ) -> torch.Tensor:
        # Fix: pre_attention requires the `rot` argument — the original call
        # self.pre_attention(x) raised TypeError. Pass None (no RoPE).
        q, k, v = self.pre_attention(x, None)
        out = attention(q, k, v)
        return out
249
+
250
+
251
class MMDitSingleBlock(nn.Module):
    """DiT block with AdaLN-Zero modulation.

    ``pre_only=True`` builds only the modulation needed to produce q/k/v (no
    residual or FFN path) — used for the text branch of the last joint block.
    ``kernel_size == 1`` selects token-wise Linear/MLP layers; larger kernels
    select channel-last convolutions that also mix locally along time.
    """

    def __init__(self,
                 dim: int,
                 nhead: int,
                 mlp_ratio: float = 4.0,
                 pre_only: bool = False,
                 kernel_size: int = 7,
                 padding: int = 3):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False)
        self.attn = SelfAttention(dim, nhead)

        self.pre_only = pre_only
        if pre_only:
            # Only shift/scale for the attention input are needed.
            self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim, 2 * dim, bias=True))
        else:
            if kernel_size == 1:
                self.linear1 = nn.Linear(dim, dim)
            else:
                self.linear1 = ChannelLastConv1d(dim, dim, kernel_size=kernel_size, padding=padding)
            self.norm2 = nn.LayerNorm(dim, elementwise_affine=False)

            if kernel_size == 1:
                self.ffn = MLP(dim, int(dim * mlp_ratio))
            else:
                self.ffn = ConvMLP(dim,
                                   int(dim * mlp_ratio),
                                   kernel_size=kernel_size,
                                   padding=padding)

            # Full AdaLN-Zero: shift/scale/gate for both attention and FFN.
            self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim, 6 * dim, bias=True))

    def pre_attention(self, x: torch.Tensor, c: torch.Tensor, rot: Optional[torch.Tensor]):
        """Modulate the input and produce q/k/v plus the FFN modulation params.

        x: BS * N * D; c (cond): BS * D.
        """
        modulation = self.adaLN_modulation(c)
        if self.pre_only:
            (shift_msa, scale_msa) = modulation.chunk(2, dim=-1)
            gate_msa = shift_mlp = scale_mlp = gate_mlp = None
        else:
            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
             gate_mlp) = modulation.chunk(6, dim=-1)

        x = modulate(self.norm1(x), shift_msa, scale_msa)
        q, k, v = self.attn.pre_attention(x, rot)
        return (q, k, v), (gate_msa, shift_mlp, scale_mlp, gate_mlp)

    def post_attention(self, x: torch.Tensor, attn_out: torch.Tensor, c: tuple[torch.Tensor]):
        """Gated residual add of the attention output, then the gated FFN."""
        if self.pre_only:
            # No residual/FFN path exists in pre-only mode.
            return x

        (gate_msa, shift_mlp, scale_mlp, gate_mlp) = c
        x = x + self.linear1(attn_out) * gate_msa
        r = modulate(self.norm2(x), shift_mlp, scale_mlp)
        x = x + self.ffn(r) * gate_mlp

        return x

    # NOTE: this forward does not appear to be used — callers drive the block
    # through pre_attention/post_attention for joint attention.
    def forward(self, x: torch.Tensor, cond: torch.Tensor,
                rot: Optional[torch.Tensor]) -> torch.Tensor:
        # x: BS * N * D
        # cond: BS * D
        x_qkv, x_conditions = self.pre_attention(x, cond, rot)
        attn_out = attention(*x_qkv)
        x = self.post_attention(x, attn_out, x_conditions)

        return x
319
+
320
+
321
+
322
+
323
class JointBlock_AT(nn.Module):
    """
    Audio + Text only JointBlock (clip branch removed).
    Returns (latent, text_f).
    """
    def __init__(self, dim: int, nhead: int, mlp_ratio: float = 4.0, pre_only: bool = False):
        super().__init__()
        self.pre_only = pre_only
        self.latent_block = MMDitSingleBlock(dim,
                                             nhead,
                                             mlp_ratio,
                                             pre_only=False,
                                             kernel_size=3,
                                             padding=1)
        # text_block keeps the pre_only flag (possibly a pre-only AdaLN).
        self.text_block = MMDitSingleBlock(dim, nhead, mlp_ratio, pre_only=pre_only, kernel_size=1)

    def forward(self, latent: torch.Tensor, text_f: torch.Tensor,
                global_c: torch.Tensor, extended_c: torch.Tensor, latent_rot: Optional[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
        # latent: (B, N_latent, D)
        # text_f: (B, N_text, D)
        # global_c: (B, 1, D) or (B, D)
        # extended_c: (B, N_latent, D) or (B, 1, D)
        x_qkv, x_mod = self.latent_block.pre_attention(latent, extended_c, latent_rot)
        # Text gets no RoPE here — slightly odd; the audio-LLM embeddings may
        # already carry positional information (TODO confirm).
        t_qkv, t_mod = self.text_block.pre_attention(text_f, global_c, rot=None)

        latent_len = latent.shape[1]
        text_len = text_f.shape[1]

        # Joint attention over latent + text tokens only.
        joint_qkv = [torch.cat([x_qkv[i], t_qkv[i]], dim=2) for i in range(3)]  # dim=2 = token dim

        attn_out = attention(*joint_qkv)  # (B, latent_len + text_len, D)
        x_attn_out = attn_out[:, :latent_len]  # (B, latent_len, D)
        t_attn_out = attn_out[:, latent_len:]  # (B, text_len, D)

        latent = self.latent_block.post_attention(latent, x_attn_out, x_mod)
        if not self.pre_only:
            # In pre-only mode the text branch has no post path; text_f passes through.
            text_f = self.text_block.post_attention(text_f, t_attn_out, t_mod)

        return latent, text_f
366
+
367
+
368
+ # 改一下mask的逻辑
369
+ # def forward(self, latent, text_f, global_c, extended_c, latent_rot,
370
+ # latent_mask: torch.Tensor, text_mask: torch.Tensor):
371
+ # # latent_mask: (B, N_latent) {0,1}
372
+ # # text_mask: (B, N_text) {0,1}
373
+
374
+ # x_qkv, x_mod = self.latent_block.pre_attention(latent, extended_c, latent_rot)
375
+ # t_qkv, t_mod = self.text_block.pre_attention(text_f, global_c, rot=None)
376
+
377
+ # latent_len = latent.shape[1]
378
+ # text_len = text_f.shape[1]
379
+
380
+ # # 1) 拼 qkv
381
+ # joint_qkv = [torch.cat([x_qkv[i], t_qkv[i]], dim=2) for i in range(3)] # 这里假设 token 维=2
382
+
383
+ # # 2) 构造 key mask(拼接后的)
384
+ # key_mask = torch.cat([latent_mask, text_mask], dim=1).bool() # (B, N_total)
385
+
386
+ # # 3) 调用注意力(要求 attention 支持 key_mask)
387
+ # # 若你的 attention 不支持,需要自己在里面对 logits 做 -inf 掩码;示例见后
388
+ # attn_out = attention(*joint_qkv, key_mask=key_mask) # (B, N_total, D)
389
+
390
+ # # 4) 切回两段
391
+ # x_attn_out = attn_out[:, :latent_len, :]
392
+ # t_attn_out = attn_out[:, latent_len:, :]
393
+
394
+ # # 5) 对 query 端输出做屏蔽(避免 padding query 写回)
395
+ # x_attn_out = x_attn_out * latent_mask.unsqueeze(-1) # (B, N_latent, D)
396
+ # t_attn_out = t_attn_out * text_mask.unsqueeze(-1) # (B, N_text, D)
397
+
398
+ # # 6) post_attention 内部**还要**用 query mask 把残差和 FFN 的更新再屏蔽一次(见下一节)
399
+ # latent = self.latent_block.post_attention(latent, x_attn_out, x_mod,
400
+ # query_mask=latent_mask)
401
+ # if not self.text_block.pre_only:
402
+ # text_f = self.text_block.post_attention(text_f, t_attn_out, t_mod,
403
+ # query_mask=text_mask)
404
+
405
+ # return latent, text_f
406
+
407
+
408
+
409
class FinalBlock(nn.Module):
    """Output head: AdaLN-modulated LayerNorm followed by a channel-last conv
    that projects hidden features down to the output (latent) dimension."""

    def __init__(self, dim, out_dim):
        super().__init__()
        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim, 2 * dim, bias=True))
        self.norm = nn.LayerNorm(dim, elementwise_affine=False)
        self.conv = ChannelLastConv1d(dim, out_dim, kernel_size=7, padding=3)

    def forward(self, latent, c):
        # c: conditioning vector; produces shift/scale for the final norm.
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        latent = modulate(self.norm(latent), shift, scale)
        latent = self.conv(latent)
        return latent
models/dit/modules.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch.cuda.amp import autocast
7
+ import math
8
+ import einops
9
+ from einops import rearrange, repeat
10
+ from inspect import isfunction
11
+
12
+
13
def trunc_normal_(tensor, mean, std, a, b):
    """In-place fill of ``tensor`` from a normal distribution truncated to [a, b].

    Cut & paste from the PyTorch official master (inverse-CDF method); see
    https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    Returns ``tensor`` for chaining.
    """
    def _cdf(v):
        # Standard normal cumulative distribution function.
        return (1. + math.erf(v / math.sqrt(2.))) / 2.

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2
        )

    with torch.no_grad():
        # Sample uniformly in the CDF image of [a, b], mapped to [2l-1, 2u-1]
        # so erfinv recovers truncated standard-normal values.
        lower = _cdf((a - mean) / std)
        upper = _cdf((b - mean) / std)
        tensor.uniform_(2 * lower - 1, 2 * upper - 1)

        # Invert the standard-normal CDF.
        tensor.erfinv_()

        # Rescale to the requested mean / std.
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Guard against numerical spill outside [a, b].
        tensor.clamp_(min=a, max=b)
    return tensor
49
+
50
+
51
+ # disable in checkpoint mode
52
+ # @torch.jit.script
53
def film_modulate(x, shift, scale):
    """FiLM-style conditioning: scale x around identity, then shift."""
    return shift + x * (scale + 1)
55
+
56
+
57
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    # Geometric frequency ladder from 1 down to 1/max_period.
    exponents = -math.log(max_period) * torch.arange(
        start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponents).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
    if dim % 2:
        # Odd dims get a zero pad so the output is exactly ``dim`` wide.
        pad = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, pad], dim=-1)
    return embedding
79
+
80
+
81
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations: sinusoidal features
    followed by a two-layer SiLU MLP projecting to ``out_size``.
    """
    def __init__(
        self, hidden_size, frequency_embedding_size=256, out_size=None
    ):
        super().__init__()
        out_size = hidden_size if out_size is None else out_size
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, out_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    def forward(self, t):
        # Cast the sinusoidal features to the MLP's parameter dtype
        # (matters under mixed precision).
        feats = timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(feats.type(self.mlp[0].weight.dtype))
104
+
105
+
106
def patchify(imgs, patch_size, input_type='2d'):
    """Split images (2d) or sequences (1d) into flattened patch tokens.

    2d: (B, C, H, W) -> (B, (H/p)*(W/p), p*p*C)
    1d: (B, C, L)    -> (B, L/p, p*C)

    NOTE(review): any other ``input_type`` falls through and raises
    UnboundLocalError on return — verify callers only pass '2d'/'1d'.
    """
    if input_type == '2d':
        x = einops.rearrange(
            imgs,
            'B C (h p1) (w p2) -> B (h w) (p1 p2 C)',
            p1=patch_size,
            p2=patch_size
        )
    elif input_type == '1d':
        x = einops.rearrange(imgs, 'B C (h p1) -> B h (p1 C)', p1=patch_size)
    return x
117
+
118
+
119
def unpatchify(x, channels=3, input_type='2d', img_size=None):
    """Inverse of patchify: reassemble flattened patch tokens.

    2d: (B, num_patches, p*p*C) -> (B, C, H, W); ``img_size`` = (H, W) is
        required to recover the patch grid.
    1d: (B, T, p*C) -> (B, C, T*p).
    """
    if input_type == '2d':
        # Patch size is inferred from the token width: p = sqrt(tokens // C).
        patch_size = int((x.shape[2] // channels)**0.5)
        # h = w = int(x.shape[1] ** .5)
        h, w = img_size[0] // patch_size, img_size[1] // patch_size
        assert h * w == x.shape[1] and patch_size**2 * channels == x.shape[2]
        x = einops.rearrange(
            x,
            'B (h w) (p1 p2 C) -> B C (h p1) (w p2)',
            h=h,
            p1=patch_size,
            p2=patch_size
        )
    elif input_type == '1d':
        # Patch size inferred directly from the token width.
        patch_size = int((x.shape[2] // channels))
        h = x.shape[1]
        assert patch_size * channels == x.shape[2]
        x = einops.rearrange(x, 'B h (p1 C) -> B C (h p1)', h=h, p1=patch_size)
    return x
138
+
139
+
140
class PatchEmbed(nn.Module):
    """Image (or 1-D sequence) to patch embedding via a strided convolution."""
    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
        super().__init__()
        self.patch_size = patch_size
        self.input_type = input_type
        # Kernel == stride: non-overlapping patches, one embedding per patch.
        if input_type == '2d':
            self.proj = nn.Conv2d(
                in_chans, embed_dim,
                kernel_size=patch_size, stride=patch_size, bias=True
            )
        elif input_type == '1d':
            self.proj = nn.Conv1d(
                in_chans, embed_dim,
                kernel_size=patch_size, stride=patch_size, bias=True
            )

    def forward(self, x):
        # Spatial/temporal extents must divide evenly into patches.
        if self.input_type == '2d':
            _, _, height, width = x.shape
            assert height % self.patch_size == 0
            assert width % self.patch_size == 0
        elif self.input_type == '1d':
            _, _, length = x.shape
            assert length % self.patch_size == 0

        # [B, D, ...spatial...] -> [B, num_patches, D]
        return self.proj(x).flatten(2).transpose(1, 2)
175
+
176
+
177
class PositionalConvEmbedding(nn.Module):
    """Convolutional positional embedding used in F5-TTS.

    Two grouped Conv1d + Mish layers; length-preserving thanks to symmetric
    padding with an odd kernel.
    """
    def __init__(self, dim=768, kernel_size=31, groups=16):
        super().__init__()
        # An odd kernel keeps the sequence length unchanged with this padding.
        assert kernel_size % 2 != 0
        pad = kernel_size // 2
        self.conv1d = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=pad),
            nn.Mish(),
            nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=pad),
            nn.Mish(),
        )

    def forward(self, x):
        # x: [B, T, C] -> convolve over T -> back to [B, T, C]
        return self.conv1d(x.transpose(1, 2)).transpose(1, 2)
200
+
201
+
202
class SinusoidalPositionalEncoding(nn.Module):
    """Fixed sine/cosine positional encoding added to the input.

    The table is precomputed for ``length`` positions and registered as a
    (non-trainable) buffer; the forward pass adds the first ``T`` rows.
    """
    def __init__(self, dim, length):
        super(SinusoidalPositionalEncoding, self).__init__()
        self.length = length
        self.dim = dim
        self.register_buffer(
            'pe', self._generate_positional_encoding(length, dim)
        )

    def _generate_positional_encoding(self, length, dim):
        positions = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(
            torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim)
        )
        angles = positions * inv_freq

        table = torch.zeros(length, dim)
        table[:, 0::2] = torch.sin(angles)   # even channels: sine
        table[:, 1::2] = torch.cos(angles)   # odd channels: cosine

        # Leading batch dim so it broadcasts over [B, T, C] inputs.
        return table.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
227
+
228
+
229
class PE_wrapper(nn.Module):
    """Unified positional-encoding front-end, selected by ``method``.

    method:
        'abs'  - learnable absolute embeddings (UViT-style truncated-normal
                 init), requires a fixed maximum ``length``
        'conv' - convolutional PE (PositionalConvEmbedding), added residually
        'sinu' - fixed sinusoidal encoding (SinusoidalPositionalEncoding)
        'none' - identity; no positional information added
    """
    def __init__(self, dim=768, method='abs', length=None, **kwargs):
        super().__init__()
        self.method = method
        if method == 'abs':
            # init absolute pe like UViT
            self.length = length
            self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
            trunc_normal_(self.abs_pe, mean=0.0, std=.02, a=-.04, b=.04)
        elif method == 'conv':
            # extra kwargs (kernel_size, groups) are forwarded to the conv PE
            self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
        elif method == 'sinu':
            self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
        elif method == 'none':
            # skip pe
            self.id = nn.Identity()
        else:
            raise NotImplementedError

    def forward(self, x):
        # x: (B, L, dim); for 'abs' the sequence must fit within `length`.
        if self.method == 'abs':
            _, L, _ = x.shape
            assert L <= self.length
            x = x + self.abs_pe[:, :L, :]
        elif self.method == 'conv':
            # residual addition of the convolutional PE
            x = x + self.conv_pe(x)
        elif self.method == 'sinu':
            # sinu_pe already performs the addition internally
            x = self.sinu_pe(x)
        elif self.method == 'none':
            x = self.id(x)
        else:
            raise NotImplementedError
        return x
262
+
263
+
264
class RMSNorm(torch.nn.Module):
    """Root-mean-square layer normalization (LLaMA-style, no mean centering).

    Normalizes the last dimension by its RMS (computed in float32 for
    stability) and applies a learnable per-channel scale.

    Args:
        dim: size of the normalized (last) dimension.
        eps: small constant added inside the square root for stability.
    """
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Divide ``x`` by its RMS over the last dimension."""
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        # Compute in float32, then cast back to the input dtype.
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight
308
+
309
+
310
class GELU(nn.Module):
    """Linear projection followed by a GELU activation.

    Args:
        dim_in: input feature size.
        dim_out: output feature size.
        approximate: passed to ``F.gelu`` ("none" or "tanh").
        bias: whether the projection has a bias term.
    """
    def __init__(
        self,
        dim_in: int,
        dim_out: int,
        approximate: str = "none",
        bias: bool = True
    ):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
        self.approximate = approximate

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        # mps: gelu is not implemented for float16, so round-trip via float32
        if gate.device.type == "mps":
            return F.gelu(
                gate.to(dtype=torch.float32), approximate=self.approximate
            ).to(dtype=gate.dtype)
        return F.gelu(gate, approximate=self.approximate)

    def forward(self, hidden_states):
        return self.gelu(self.proj(hidden_states))
334
+
335
+
336
class GEGLU(nn.Module):
    """Gated GELU: project to twice the width, then gate one half with the
    GELU of the other half."""
    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        # mps: gelu is not implemented for float16, so round-trip via float32
        if gate.device.type == "mps":
            return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
        return F.gelu(gate)

    def forward(self, hidden_states):
        value, gate = self.proj(hidden_states).chunk(2, dim=-1)
        return value * self.gelu(gate)
351
+
352
+
353
class ApproximateGELU(nn.Module):
    """Linear projection followed by the sigmoid approximation of GELU:
    ``x * sigmoid(1.702 * x)``."""
    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out, bias=bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.proj(x)
        return projected * torch.sigmoid(1.702 * projected)
361
+
362
+
363
# disable in checkpoint mode
# @torch.jit.script
def snake_beta(x, alpha, beta):
    """Snake activation: ``x + beta * sin(alpha * x)^2`` (alpha sets the
    frequency, beta the magnitude of the periodic term)."""
    return x + beta * torch.square(torch.sin(alpha * x))
367
+
368
+
369
class Snake(nn.Module):
    """Linear projection followed by the snake-beta activation with
    learnable (optionally frozen) per-channel alpha/beta."""
    def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
        # One flag controls both activation parameters.
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

    def forward(self, x):
        return snake_beta(self.proj(x), self.alpha, self.beta)
382
+
383
+
384
class GESnake(nn.Module):
    """Gated snake: project to twice the width, then gate one half with the
    snake-beta activation of the other half."""
    def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
        # One flag controls both activation parameters.
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

    def forward(self, x):
        value, gate = self.proj(x).chunk(2, dim=-1)
        return value * snake_beta(gate, self.alpha, self.beta)
397
+
398
+
399
class FeedForward(nn.Module):
    """Transformer feed-forward block with a configurable activation.

    Structure: activation-projection (possibly gated) -> dropout -> linear
    projection back to ``dim_out`` -> optional final dropout.

    Args:
        dim: input feature size.
        dim_out: output size (defaults to ``dim``).
        mult: hidden-width multiplier when ``inner_dim`` is not given.
        dropout: dropout probability after the activation.
        activation_fn: one of "gelu", "gelu-approximate", "geglu",
            "geglu-approximate", "snake", "gesnake".
        final_dropout: append a second dropout after the output projection.
        inner_dim: explicit hidden width (overrides ``mult``).
        bias: bias for all linear layers.
    """
    def __init__(
        self,
        dim,
        dim_out=None,
        mult=4,
        dropout=0.0,
        activation_fn="geglu",
        final_dropout=False,
        inner_dim=None,
        bias=True,
    ):
        super().__init__()
        inner_dim = int(dim * mult) if inner_dim is None else inner_dim
        dim_out = dim if dim_out is None else dim_out

        # Lazy builders so only the requested activation is instantiated.
        builders = {
            "gelu": lambda: GELU(dim, inner_dim, bias=bias),
            "gelu-approximate":
                lambda: GELU(dim, inner_dim, approximate="tanh", bias=bias),
            "geglu": lambda: GEGLU(dim, inner_dim, bias=bias),
            "geglu-approximate":
                lambda: ApproximateGELU(dim, inner_dim, bias=bias),
            "snake": lambda: Snake(dim, inner_dim, bias=bias),
            "gesnake": lambda: GESnake(dim, inner_dim, bias=bias),
        }
        if activation_fn not in builders:
            raise NotImplementedError
        act_fn = builders[activation_fn]()

        layers = [
            act_fn,                                   # project in + activate
            nn.Dropout(dropout),                      # hidden dropout
            nn.Linear(inner_dim, dim_out, bias=bias), # project out
        ]
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            layers.append(nn.Dropout(dropout))
        self.net = nn.ModuleList(layers)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        for layer in self.net:
            hidden_states = layer(hidden_states)
        return hidden_states
models/dit/rotary.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ "this rope is faster than llama rope with jit script"
3
+
4
+
5
def rotate_half(x):
    """Rotate the last dimension by half: ``(x1, x2) -> (-x2, x1)``."""
    first, second = x.chunk(2, dim=-1)
    return torch.cat((-second, first), dim=-1)


# disable in checkpoint mode
# @torch.jit.script
def apply_rotary_pos_emb(x, cos, sin):
    # NOTE: This could probably be moved to Triton
    # Trim the tables to x's sequence length (q and k lengths may differ).
    seq_len = x.shape[-2]
    cos = cos[:, :, :seq_len, :]
    sin = sin[:, :, :seq_len, :]
    return x * cos + rotate_half(x) * sin
18
+
19
+
20
class RotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox


    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
    (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
    """
    def __init__(self, dim: int):
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000**(torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        # Lazily-built cos/sin tables, rebuilt when length/device/dtype change.
        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=-2):
        # expect input: B, H, L, D
        seq_len = x.shape[seq_dimension]

        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        # also make sure dtype wont change
        # NOTE: on the very first call _cos_cached is None; the length check
        # (seq_len != None) short-circuits the `or`, so the attribute
        # accesses below are never reached on an empty cache.
        if (
            seq_len != self._seq_len_cached or
            self._cos_cached.device != x.device or
            self._cos_cached.dtype != x.dtype
        ):
            self._seq_len_cached = seq_len
            t = torch.arange(
                x.shape[seq_dimension], device=x.device, dtype=torch.float32
            )
            # Outer product position x inverse-frequency -> rotation angles.
            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
            # Duplicate so the table covers the full head dimension.
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            # Shape [1, 1, L, D] so tables broadcast over batch and heads.
            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)

        return self._cos_cached, self._sin_cached

    def forward(self, q, k):
        # Tables are computed from q (in float32 for precision); k may be
        # shorter — apply_rotary_pos_emb trims the tables per input.
        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
            q.float(), seq_dimension=-2
        )
        if k is not None:
            return (
                apply_rotary_pos_emb(
                    q.float(), self._cos_cached, self._sin_cached
                ).type_as(q),
                apply_rotary_pos_emb(
                    k.float(), self._cos_cached, self._sin_cached
                ).type_as(k),
            )
        else:
            # k is None: only rotate q and pass None through.
            return (
                apply_rotary_pos_emb(
                    q.float(), self._cos_cached, self._sin_cached
                ).type_as(q), None
            )
models/dit/span_mask.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from typing import Optional, Tuple
4
+
5
+
6
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
) -> torch.Tensor:
    """
    Computes random mask spans for a given shape.

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True).
            May also be a sequence with one probability per batch element.
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        min_masks: minimum number of masked spans
        no_overlap: if True, uses an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans

    Returns:
        Boolean torch.Tensor of shape ``(batch, timesteps)``; True = masked.
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    # Accept a scalar or a per-batch-element sequence of probabilities.
    mask_prob = np.array(mask_prob)

    # Expected number of spans per element, with probabilistic rounding.
    all_num_mask = np.floor(
        mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)
    ).astype(int)

    # Enforce the minimum number of spans.
    all_num_mask = np.maximum(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            # Only the unpadded prefix is eligible for masking.
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length) + np.random.rand()
            )
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask[i]

        # Sample the length of each span.
        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            lengths = np.random.randint(
                mask_other, mask_length*2 + 1, size=num_mask
            )
        elif mask_type == "normal":
            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = np.random.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            # Guarantee at least one non-empty span.
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # Place one span inside [s, e) and return the remaining
                # sub-intervals still large enough to host another span.
                span_start = np.random.randint(s, e - length)
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - keep_length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                lens = np.fromiter(
                    (
                        e - s if e - s >= length + min_space else 0
                        for s, e in parts
                    ),
                    int,  # fix: `np.int` alias was removed in NumPy >= 1.24
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    # No interval can host this span; stop placing.
                    break
                # Choose an interval with probability proportional to size.
                probs = lens / np.sum(lens)
                c = np.random.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            mask_idc = np.asarray(mask_idc)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                # Shrink so every sampled start can expand into a span.
                min_len = sz - num_mask - 1

            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)

            # Expand each start index into its full span.
            mask_idc = np.asarray([
                mask_idc[j] + offset for j in range(len(mask_idc))
                for offset in range(lengths[j])
            ])

        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
    # min_len = min([len(m) for m in mask_idcs])
    for i, mask_idc in enumerate(mask_idcs):
        # if len(mask_idc) > min_len:
        #     mask_idc = np.random.choice(mask_idc, min_len, replace=False)
        mask[i, mask_idc] = True

    return torch.tensor(mask)
134
+
135
+
136
if __name__ == '__main__':
    # Smoke test: per-example mask probabilities over a 4 x 500 grid.
    demo_mask = compute_mask_indices(
        shape=[4, 500],
        padding_mask=None,
        mask_prob=[0.65, 0.5, 0.65, 0.65],
        mask_length=10,
        mask_type="static",
        mask_other=0.0,
        min_masks=1,
        no_overlap=False,
        min_space=0,
    )
    print(demo_mask)
    print(demo_mask.sum(dim=1))
models/flow_matching.py ADDED
@@ -0,0 +1,1082 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional, Union, List, Sequence
2
+
3
+ import inspect
4
+ import random
5
+
6
+ from tqdm import tqdm
7
+ import numpy as np
8
+ import copy
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from diffusers.utils.torch_utils import randn_tensor
14
+ from diffusers import FlowMatchEulerDiscreteScheduler
15
+ from diffusers.training_utils import compute_density_for_timestep_sampling
16
+
17
+ from models.autoencoder.autoencoder_base import AutoEncoderBase
18
+ from models.content_encoder.content_encoder import ContentEncoder
19
+ from models.content_adapter import ContentAdapterBase
20
+ from models.common import LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase
21
+ from utils.torch_utilities import (
22
+ create_alignment_path, create_mask_from_length, loss_with_mask,
23
+ trim_or_pad_length
24
+ )
25
+ from constants import SAME_LENGTH_TASKS
26
+
27
+
28
class FlowMatchingMixin:
    """Mixin providing flow-matching training targets and inference timestep
    utilities on top of diffusers' FlowMatchEulerDiscreteScheduler."""
    def __init__(
        self,
        cfg_drop_ratio: float = 0.2,
        sample_strategy: str = 'normal',
        num_train_steps: int = 1000
    ) -> None:
        r"""
        Args:
            cfg_drop_ratio (float): Probability of dropping the condition
                during training (enables classifier-free guidance when > 0).
            sample_strategy (str): Sampling strategy for timesteps during training
                ('normal' = logit-normal density, 'uniform').
            num_train_steps (int): Number of training steps for the noise scheduler.
        """
        self.sample_strategy = sample_strategy
        self.infer_noise_scheduler = FlowMatchEulerDiscreteScheduler(
            num_train_timesteps=num_train_steps
        )
        # Separate copy for training so inference-time set_timesteps calls
        # don't mutate the training schedule.
        self.train_noise_scheduler = copy.deepcopy(self.infer_noise_scheduler)

        self.classifier_free_guidance = cfg_drop_ratio > 0.0
        self.cfg_drop_ratio = cfg_drop_ratio

    def get_input_target_and_timesteps(
        self,
        latent: torch.Tensor,
        training: bool,
    ):
        """Build the noisy input, regression target and timesteps for one step.

        Returns:
            (noisy_latent, target, timesteps) where target = noise - latent
            (the flow-matching velocity) and noisy_latent interpolates
            latent and noise according to sigma(t).
        """
        batch_size = latent.shape[0]
        noise = torch.randn_like(latent)

        if training:
            if self.sample_strategy == 'normal':
                # Logit-normal density over u in (0, 1) (SD3-style sampling).
                u = compute_density_for_timestep_sampling(
                    weighting_scheme="logit_normal",
                    batch_size=batch_size,
                    logit_mean=0,
                    logit_std=1,
                    mode_scale=None,
                )
            elif self.sample_strategy == 'uniform':
                u = torch.rand(batch_size, )
            else:
                # NOTE(review): "samlping" typo in this runtime message.
                raise NotImplementedError(
                    f"{self.sample_strategy} samlping for timesteps is not supported now"
                )

            indices = (
                u * self.train_noise_scheduler.config.num_train_timesteps
            ).long()
        else:
            # Evaluation: deterministic mid-schedule timestep for all samples.
            indices = (
                self.train_noise_scheduler.config.num_train_timesteps // 2
            ) * torch.ones((batch_size, )).long()

        # train_noise_scheduler.timesteps: a list from 1 ~ num_trainsteps with 1 as interval
        timesteps = self.train_noise_scheduler.timesteps[indices].to(
            device=latent.device
        )
        sigmas = self.get_sigmas(
            timesteps, n_dim=latent.ndim, dtype=latent.dtype
        )

        # Linear interpolation between data (sigma=0) and noise (sigma=1).
        noisy_latent = (1.0 - sigmas) * latent + sigmas * noise

        target = noise - latent

        return noisy_latent, target, timesteps

    def get_sigmas(self, timesteps, n_dim=3, dtype=torch.float32):
        """Look up sigma values for the given timesteps, reshaped so they
        broadcast against an ``n_dim``-dimensional latent."""
        device = timesteps.device

        # a list from 1 declining to 1/num_train_steps
        sigmas = self.train_noise_scheduler.sigmas.to(
            device=device, dtype=dtype
        )

        schedule_timesteps = self.train_noise_scheduler.timesteps.to(device)
        timesteps = timesteps.to(device)
        # Map each timestep to its position in the schedule.
        step_indices = [(schedule_timesteps == t).nonzero().item()
                        for t in timesteps]

        sigma = sigmas[step_indices].flatten()
        # Append singleton dims until sigma broadcasts against the latent.
        while len(sigma.shape) < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def retrieve_timesteps(
        self,
        num_inference_steps: Optional[int] = None,
        device: Optional[Union[str, torch.device]] = None,
        timesteps: Optional[List[int]] = None,
        sigmas: Optional[List[float]] = None,
        **kwargs,
    ):
        """Configure the inference scheduler and return its timesteps.

        Exactly one of ``timesteps`` / ``sigmas`` may be given to impose a
        custom schedule; otherwise ``num_inference_steps`` is used.

        Returns:
            (timesteps, num_inference_steps)

        Raises:
            ValueError: if both ``timesteps`` and ``sigmas`` are given, or the
                scheduler does not support the requested custom schedule.
        """
        # used in inference, retrieve new timesteps on given inference timesteps
        scheduler = self.infer_noise_scheduler

        if timesteps is not None and sigmas is not None:
            raise ValueError(
                "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
            )
        if timesteps is not None:
            # Custom timesteps require scheduler support (signature check).
            accepts_timesteps = "timesteps" in set(
                inspect.signature(scheduler.set_timesteps).parameters.keys()
            )
            if not accepts_timesteps:
                raise ValueError(
                    f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                    f" timestep schedules. Please check whether you are using the correct scheduler."
                )
            scheduler.set_timesteps(
                timesteps=timesteps, device=device, **kwargs
            )
            timesteps = scheduler.timesteps
            num_inference_steps = len(timesteps)
        elif sigmas is not None:
            # Custom sigmas likewise require scheduler support.
            accept_sigmas = "sigmas" in set(
                inspect.signature(scheduler.set_timesteps).parameters.keys()
            )
            if not accept_sigmas:
                raise ValueError(
                    f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                    f" sigmas schedules. Please check whether you are using the correct scheduler."
                )
            scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
            timesteps = scheduler.timesteps
            num_inference_steps = len(timesteps)
        else:
            # Default: evenly derived schedule from the step count.
            scheduler.set_timesteps(
                num_inference_steps, device=device, **kwargs
            )
            timesteps = scheduler.timesteps
        return timesteps, num_inference_steps
161
+
162
+
163
class ContentEncoderAdapterMixin:
    """Bundles a content encoder with an optional instruction-conditioned
    content adapter and exposes a single ``encode_content`` entry point."""
    def __init__(
        self,
        content_encoder: ContentEncoder,
        content_adapter: ContentAdapterBase | None = None
    ):
        self.content_encoder = content_encoder
        self.content_adapter = content_adapter

    def encode_content(
        self,
        content: list[Any],
        task: list[str],
        device: str | torch.device,
        instruction: torch.Tensor | None = None,
        instruction_lengths: torch.Tensor | None = None
    ):
        """Encode raw content; when an instruction is given, also run the
        adapter and include its duration predictions in the result."""
        encoded: dict[str, torch.Tensor] = self.content_encoder.encode_content(
            content, task, device=device
        )
        content_emb = encoded["content"]
        content_mask = encoded["content_mask"]

        use_adapter = instruction is not None
        if use_adapter:
            instruction_mask = create_mask_from_length(instruction_lengths)
            (
                content_emb,
                content_mask,
                global_duration_pred,
                local_duration_pred,
            ) = self.content_adapter(
                content_emb, content_mask, instruction, instruction_mask
            )

        result = {
            "content": content_emb,
            "content_mask": content_mask,
            "length_aligned_content": encoded["length_aligned_content"],
        }
        if use_adapter:
            result["global_duration_pred"] = global_duration_pred
            result["local_duration_pred"] = local_duration_pred

        return result
208
+
209
+
210
+ class SingleTaskCrossAttentionAudioFlowMatching(
211
+ LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
212
+ FlowMatchingMixin, ContentEncoderAdapterMixin
213
+ ):
214
    def __init__(
        self,
        autoencoder: nn.Module,
        content_encoder: ContentEncoder,
        backbone: nn.Module,
        cfg_drop_ratio: float = 0.2,
        sample_strategy: str = 'normal',
        num_train_steps: int = 1000,
    ):
        """
        Args:
            autoencoder: waveform autoencoder; frozen here and used only for
                latent encoding/decoding.
            content_encoder: encodes raw content (text/audio/...) to embeddings.
            backbone: the denoising network (DiT-style) predicting velocity.
            cfg_drop_ratio: condition-drop probability for classifier-free
                guidance training.
            sample_strategy: timestep sampling strategy ('normal' / 'uniform').
            num_train_steps: scheduler's number of training timesteps.
        """
        nn.Module.__init__(self)
        FlowMatchingMixin.__init__(
            self, cfg_drop_ratio, sample_strategy, num_train_steps
        )
        ContentEncoderAdapterMixin.__init__(
            self, content_encoder=content_encoder
        )

        # The autoencoder is frozen: only the backbone (and content modules)
        # are trained.
        self.autoencoder = autoencoder
        for param in self.autoencoder.parameters():
            param.requires_grad = False

        # Share the frozen autoencoder with the content encoder's audio
        # branch (if present) so audio content is encoded in the same space.
        if hasattr(
            self.content_encoder, "audio_encoder"
        ) and self.content_encoder.audio_encoder is not None:
            self.content_encoder.audio_encoder.model = self.autoencoder

        self.backbone = backbone
        # Zero-size parameter used only to discover the module's device.
        self.dummy_param = nn.Parameter(torch.empty(0))
242
+
243
    def forward(
        self, content: list[Any], condition: list[Any], task: list[str],
        waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
    ):
        """Compute the masked flow-matching training loss for one batch.

        Args:
            content: raw per-sample content passed to the content encoder.
            condition: unused in this single-task variant (kept for a uniform
                interface across model variants).
            task: per-sample task identifiers for the content encoder.
            waveform: [B, T] audio, encoded to latents by the frozen autoencoder.
            waveform_lengths: valid lengths of each waveform.

        Returns:
            Scalar masked MSE loss between predicted and target velocity.
        """
        device = self.dummy_param.device

        # Frozen autoencoder: always eval mode, no gradients.
        self.autoencoder.eval()
        with torch.no_grad():
            latent, latent_mask = self.autoencoder.encode(
                waveform.unsqueeze(1), waveform_lengths
            )

        content_dict = self.encode_content(content, task, device)
        content, content_mask = content_dict["content"], content_dict[
            "content_mask"]

        # Classifier-free guidance training: zero out the condition for a
        # random subset of samples.
        if self.training and self.classifier_free_guidance:
            mask_indices = [
                k for k in range(len(waveform))
                if random.random() < self.cfg_drop_ratio
            ]
            if len(mask_indices) > 0:
                content[mask_indices] = 0

        noisy_latent, target, timesteps = self.get_input_target_and_timesteps(
            latent, training=self.training
        )

        # Velocity prediction conditioned on content via cross-attention.
        pred: torch.Tensor = self.backbone(
            x=noisy_latent,
            timesteps=timesteps,
            context=content,
            x_mask=latent_mask,
            context_mask=content_mask
        )

        # MSE restricted to valid (unpadded) latent positions.
        loss = F.mse_loss(pred.float(), target.float(), reduction="none")
        loss = loss_with_mask(loss, latent_mask)

        return loss
283
+
284
    def iterative_denoise(
        self, latent: torch.Tensor, timesteps: list[int], num_steps: int,
        verbose: bool, cfg: bool, cfg_scale: float, backbone_input: dict
    ):
        """Run the Euler sampling loop over ``timesteps``.

        Args:
            latent: initial Gaussian latent [B, ...].
            timesteps: schedule produced by ``retrieve_timesteps``.
            num_steps: number of steps (progress bar length only).
            verbose: show a progress bar.
            cfg: apply classifier-free guidance; ``backbone_input`` is then
                assumed to contain a doubled (uncond + cond) context, with the
                unconditional half FIRST — TODO confirm against callers.
            cfg_scale: guidance strength.
            backbone_input: extra kwargs forwarded to the backbone.

        Returns:
            The denoised latent.
        """
        progress_bar = tqdm(range(num_steps), disable=not verbose)

        for i, timestep in enumerate(timesteps):
            # expand the latent if we are doing classifier free guidance
            if cfg:
                latent_input = torch.cat([latent, latent])
            else:
                latent_input = latent

            noise_pred: torch.Tensor = self.backbone(
                x=latent_input, timesteps=timestep, **backbone_input
            )

            # perform guidance
            if cfg:
                noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + cfg_scale * (
                    noise_pred_content - noise_pred_uncond
                )

            # One Euler step of the flow-matching ODE.
            latent = self.infer_noise_scheduler.step(
                noise_pred, timestep, latent
            ).prev_sample

            progress_bar.update(1)

        progress_bar.close()

        return latent
317
+
318
+ @torch.no_grad()
319
+ def inference(
320
+ self,
321
+ content: list[Any],
322
+ condition: list[Any],
323
+ task: list[str],
324
+ latent_shape: Sequence[int],
325
+ num_steps: int = 50,
326
+ sway_sampling_coef: float | None = -1.0,
327
+ guidance_scale: float = 3.0,
328
+ num_samples_per_content: int = 1,
329
+ disable_progress: bool = True,
330
+ **kwargs
331
+ ):
332
+ device = self.dummy_param.device
333
+ classifier_free_guidance = guidance_scale > 1.0
334
+ batch_size = len(content) * num_samples_per_content
335
+
336
+ if classifier_free_guidance:
337
+ content, content_mask = self.encode_content_classifier_free(
338
+ content, task, num_samples_per_content
339
+ )
340
+ else:
341
+ content_output: dict[
342
+ str, torch.Tensor] = self.content_encoder.encode_content(
343
+ content, task
344
+ )
345
+ content, content_mask = content_output["content"], content_output[
346
+ "content_mask"]
347
+ content = content.repeat_interleave(num_samples_per_content, 0)
348
+ content_mask = content_mask.repeat_interleave(
349
+ num_samples_per_content, 0
350
+ )
351
+
352
+ latent = self.prepare_latent(
353
+ batch_size, latent_shape, content.dtype, device
354
+ )
355
+
356
+ if not sway_sampling_coef:
357
+ sigmas = np.linspace(1.0, 1 / num_steps, num_steps)
358
+ else:
359
+ t = torch.linspace(0, 1, num_steps + 1)
360
+ t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
361
+ sigmas = 1 - t
362
+ timesteps, num_steps = self.retrieve_timesteps(
363
+ num_steps, device, timesteps=None, sigmas=sigmas
364
+ )
365
+
366
+ latent = self.iterative_denoise(
367
+ latent=latent,
368
+ timesteps=timesteps,
369
+ num_steps=num_steps,
370
+ verbose=not disable_progress,
371
+ cfg=classifier_free_guidance,
372
+ cfg_scale=guidance_scale,
373
+ backbone_input={
374
+ "context": content,
375
+ "context_mask": content_mask,
376
+ },
377
+ )
378
+
379
+ waveform = self.autoencoder.decode(latent)
380
+
381
+ return waveform
382
+
383
+ def prepare_latent(
384
+ self, batch_size: int, latent_shape: Sequence[int], dtype: torch.dtype,
385
+ device: str
386
+ ):
387
+ shape = (batch_size, *latent_shape)
388
+ latent = randn_tensor(
389
+ shape, generator=None, device=device, dtype=dtype
390
+ )
391
+ return latent
392
+
393
+ def encode_content_classifier_free(
394
+ self,
395
+ content: list[Any],
396
+ task: list[str],
397
+ device,
398
+ num_samples_per_content: int = 1
399
+ ):
400
+ content_dict = self.content_encoder.encode_content(
401
+ content, task, device=device
402
+ )
403
+ content, content_mask = content_dict["content"], content_dict[
404
+ "content_mask"]
405
+
406
+ content = content.repeat_interleave(num_samples_per_content, 0)
407
+ content_mask = content_mask.repeat_interleave(
408
+ num_samples_per_content, 0
409
+ )
410
+
411
+ # get unconditional embeddings for classifier free guidance
412
+ uncond_content = torch.zeros_like(content)
413
+ uncond_content_mask = content_mask.detach().clone()
414
+
415
+ uncond_content = uncond_content.repeat_interleave(
416
+ num_samples_per_content, 0
417
+ )
418
+ uncond_content_mask = uncond_content_mask.repeat_interleave(
419
+ num_samples_per_content, 0
420
+ )
421
+
422
+ # For classifier free guidance, we need to do two forward passes.
423
+ # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
424
+ content = torch.cat([uncond_content, content])
425
+ content_mask = torch.cat([uncond_content_mask, content_mask])
426
+
427
+ return content, content_mask
428
+
429
+
430
class DurationAdapterMixin:
    """Mixin that adds duration prediction utilities on top of a latent
    generation model: losses for global (total length) and local (per-token)
    duration, conversion of log-domain predictions back to seconds, and
    duration-based expansion of token sequences to frame sequences.
    """
    def __init__(
        self,
        latent_token_rate: int,
        offset: float = 1.0,
        frame_resolution: float | None = None
    ):
        # latent_token_rate: latent frames per second of audio
        # offset: additive offset used inside log/exp to keep log(0) away
        # frame_resolution: seconds per duration frame (None when the model
        # does not use local durations)
        self.latent_token_rate = latent_token_rate
        self.offset = offset
        self.frame_resolution = frame_resolution

    def get_global_duration_loss(
        self,
        pred: torch.Tensor,
        latent_mask: torch.Tensor,
        reduce: bool = True,
    ):
        """MSE between predicted and true log total duration (seconds).

        The target is log(number_of_valid_latent_frames / token_rate + offset),
        i.e. durations are regressed in the log domain.
        """
        target = torch.log(
            latent_mask.sum(1) / self.latent_token_rate + self.offset
        )
        loss = F.mse_loss(target, pred, reduction="mean" if reduce else "none")
        return loss

    def get_local_duration_loss(
        self, ground_truth: torch.Tensor, pred: torch.Tensor,
        mask: torch.Tensor, is_time_aligned: Sequence[bool], reduce: bool
    ):
        """Masked MSE on per-token log frame counts, restricted to
        time-aligned samples (non-time-aligned rows are zeroed out).

        ground_truth is in seconds; it is converted to frame counts using
        frame_resolution before the log transform.
        """
        n_frames = torch.round(ground_truth / self.frame_resolution)
        target = torch.log(n_frames + self.offset)
        loss = loss_with_mask(
            (target - pred)**2,
            mask,
            reduce=False,
        )
        # zero the loss for samples that carry no time-aligned duration info
        loss *= is_time_aligned
        if reduce:
            if is_time_aligned.sum().item() == 0:
                # no time-aligned sample in the batch: keep the graph alive
                # but make the loss exactly zero
                loss *= 0.0
                loss = loss.mean()
            else:
                # average only over time-aligned samples
                loss = loss.sum() / is_time_aligned.sum()

        return loss

    def prepare_local_duration(self, pred: torch.Tensor, mask: torch.Tensor):
        """Convert log-domain per-token predictions to seconds.

        exp() undoes the log target, ceil/offset recover an integer frame
        count, and frame_resolution converts frames to seconds.
        """
        pred = torch.exp(pred) * mask
        pred = torch.ceil(pred) - self.offset
        pred *= self.frame_resolution
        return pred

    def prepare_global_duration(
        self,
        global_pred: torch.Tensor,
        local_pred: torch.Tensor,
        is_time_aligned: Sequence[bool],
        use_local: bool = True,
    ):
        """
        global_pred: predicted duration value, processed by logarithmic and offset
        local_pred: predicted latent length

        For time-aligned samples the summed local durations replace the
        global estimate (when use_local), since they are more precise.
        """
        global_pred = torch.exp(global_pred) - self.offset
        result = global_pred
        # avoid error accumulation for each frame
        if use_local:
            pred_from_local = torch.round(local_pred * self.latent_token_rate)
            pred_from_local = pred_from_local.sum(1) / self.latent_token_rate
            result[is_time_aligned] = pred_from_local[is_time_aligned]

        return result

    def expand_by_duration(
        self,
        x: torch.Tensor,
        content_mask: torch.Tensor,
        local_duration: torch.Tensor,
        global_duration: torch.Tensor | None = None,
    ):
        """Expand token features x (B, T_tok, D) to frame rate by repeating
        each token according to its duration (in seconds).

        Returns (expanded_x, latent_mask) where expanded_x is (B, T_frame, D).
        When global_duration is given it fixes the total frame count;
        otherwise the per-token frame counts are summed.
        """
        n_latents = torch.round(local_duration * self.latent_token_rate)
        if global_duration is not None:
            latent_length = torch.round(
                global_duration * self.latent_token_rate
            )
        else:
            latent_length = n_latents.sum(1)
        latent_mask = create_mask_from_length(latent_length).to(
            content_mask.device
        )
        # attn_mask: (B, T_tok, T_frame) validity grid between tokens/frames
        attn_mask = content_mask.unsqueeze(-1) * latent_mask.unsqueeze(1)
        # monotonic hard alignment path from token durations
        align_path = create_alignment_path(n_latents, attn_mask)
        expanded_x = torch.matmul(align_path.transpose(1, 2).to(x.dtype), x)
        return expanded_x, latent_mask
522
+
523
+
524
class CrossAttentionAudioFlowMatching(
    SingleTaskCrossAttentionAudioFlowMatching, DurationAdapterMixin
):
    """Flow-matching audio generator conditioned via cross attention on
    instruction-adapted content, with an auxiliary global duration head that
    predicts the total output length at inference time.
    """
    def __init__(
        self,
        autoencoder: AutoEncoderBase,
        content_encoder: ContentEncoder,
        content_adapter: ContentAdapterBase,
        backbone: nn.Module,
        content_dim: int,
        frame_resolution: float,
        duration_offset: float = 1.0,
        cfg_drop_ratio: float = 0.2,
        sample_strategy: str = 'normal',
        num_train_steps: int = 1000
    ):
        super().__init__(
            autoencoder=autoencoder,
            content_encoder=content_encoder,
            backbone=backbone,
            cfg_drop_ratio=cfg_drop_ratio,
            sample_strategy=sample_strategy,
            num_train_steps=num_train_steps,
        )
        # NOTE(review): ContentEncoderAdapterMixin is not in this class's
        # base list; its __init__ is invoked explicitly to attach the
        # content encoder/adapter — presumably it only sets attributes.
        # Confirm this is intentional.
        ContentEncoderAdapterMixin.__init__(
            self,
            content_encoder=content_encoder,
            content_adapter=content_adapter
        )
        # frame_resolution is intentionally not forwarded here, so
        # DurationAdapterMixin.frame_resolution stays None for this class
        # (only the global duration head is used).
        DurationAdapterMixin.__init__(
            self,
            latent_token_rate=autoencoder.latent_token_rate,
            offset=duration_offset
        )

    def encode_content_with_instruction(
        self, content: list[Any], task: list[str], device,
        instruction: torch.Tensor, instruction_lengths: torch.Tensor
    ):
        """Encode content with instruction conditioning and unpack the
        adapter outputs into a fixed-order tuple:
        (content, content_mask, global_duration_pred, local_duration_pred,
        length_aligned_content).
        """
        content_dict = self.encode_content(
            content, task, device, instruction, instruction_lengths
        )
        return (
            content_dict["content"],
            content_dict["content_mask"],
            content_dict["global_duration_pred"],
            content_dict["local_duration_pred"],
            content_dict["length_aligned_content"],
        )

    def forward(
        self,
        content: list[Any],
        task: list[str],
        waveform: torch.Tensor,
        waveform_lengths: torch.Tensor,
        instruction: torch.Tensor,
        instruction_lengths: torch.Tensor,
        loss_reduce: bool = True,
        **kwargs
    ):
        """Training/validation step.

        Returns a dict with "diff_loss" (masked flow-matching MSE) and
        "global_duration_loss" (log-duration MSE).
        """
        device = self.dummy_param.device
        # equivalent to `self.training or loss_reduce`: always reduce during
        # training, honor the flag during evaluation
        loss_reduce = self.training or (loss_reduce and not self.training)

        # the autoencoder is frozen: eval mode + no_grad for the encode pass
        self.autoencoder.eval()
        with torch.no_grad():
            latent, latent_mask = self.autoencoder.encode(
                waveform.unsqueeze(1), waveform_lengths
            )

        content, content_mask, global_duration_pred, _, _ = \
            self.encode_content_with_instruction(
                content, task, device, instruction, instruction_lengths
            )

        global_duration_loss = self.get_global_duration_loss(
            global_duration_pred, latent_mask, reduce=loss_reduce
        )

        # classifier-free guidance training: randomly zero the content
        # condition for a fraction of the batch
        if self.training and self.classifier_free_guidance:
            mask_indices = [
                k for k in range(len(waveform))
                if random.random() < self.cfg_drop_ratio
            ]
            if len(mask_indices) > 0:
                content[mask_indices] = 0

        noisy_latent, target, timesteps = self.get_input_target_and_timesteps(
            latent, training=self.training
        )

        pred: torch.Tensor = self.backbone(
            x=noisy_latent,
            timesteps=timesteps,
            context=content,
            x_mask=latent_mask,
            context_mask=content_mask,
        )
        # move the time axis to dim 1 so loss_with_mask can apply the
        # (B, T) latent mask
        pred = pred.transpose(1, self.autoencoder.time_dim)
        target = target.transpose(1, self.autoencoder.time_dim)
        diff_loss = F.mse_loss(pred.float(), target.float(), reduction="none")
        diff_loss = loss_with_mask(diff_loss, latent_mask, reduce=loss_reduce)

        return {
            "diff_loss": diff_loss,
            "global_duration_loss": global_duration_loss,
        }

    @torch.no_grad()
    def inference(
        self,
        content: list[Any],
        condition: list[Any],
        task: list[str],
        is_time_aligned: Sequence[bool],
        instruction: torch.Tensor,
        instruction_lengths: torch.Tensor,
        num_steps: int = 20,
        sway_sampling_coef: float | None = -1.0,
        guidance_scale: float = 3.0,
        disable_progress=True,
        use_gt_duration: bool = False,
        **kwargs
    ):
        """Generate waveforms; output length comes from the global duration
        head, except for same-length tasks whose latent length is copied
        from the content length.
        """
        device = self.dummy_param.device
        classifier_free_guidance = guidance_scale > 1.0

        (
            content,
            content_mask,
            global_duration_pred,
            local_duration_pred,
            _,
        ) = self.encode_content_with_instruction(
            content, task, device, instruction, instruction_lengths
        )
        batch_size = content.size(0)

        if use_gt_duration:
            raise NotImplementedError(
                "Using ground truth global duration only is not implemented yet"
            )

        # prepare global duration (use_local=False: rely on the global head
        # only, since frame_resolution is unset for this class)
        global_duration = self.prepare_global_duration(
            global_duration_pred,
            local_duration_pred,
            is_time_aligned,
            use_local=False
        )
        # TODO: manually set duration for SE and AudioSR
        latent_length = torch.round(global_duration * self.latent_token_rate)
        # same-length tasks (e.g. enhancement) keep the input's length
        task_mask = torch.as_tensor([t in SAME_LENGTH_TASKS for t in task])
        latent_length[task_mask] = content[task_mask].size(1)
        latent_mask = create_mask_from_length(latent_length).to(device)
        max_latent_length = latent_mask.sum(1).max().item()

        # prepare latent and noise; uncond half first to match chunk(2)
        if classifier_free_guidance:
            uncond_context = torch.zeros_like(content)
            uncond_content_mask = content_mask.detach().clone()
            context = torch.cat([uncond_context, content])
            context_mask = torch.cat([uncond_content_mask, content_mask])
        else:
            context = content
            context_mask = content_mask

        # None entries in latent_shape mark the (variable) time axis
        latent_shape = tuple(
            max_latent_length if dim is None else dim
            for dim in self.autoencoder.latent_shape
        )
        shape = (batch_size, *latent_shape)
        latent = randn_tensor(
            shape, generator=None, device=device, dtype=content.dtype
        )
        # linear sigma schedule when sway sampling disabled, warped otherwise
        if not sway_sampling_coef:
            sigmas = np.linspace(1.0, 1 / num_steps, num_steps)
        else:
            t = torch.linspace(0, 1, num_steps + 1)
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
            sigmas = 1 - t
        timesteps, num_steps = self.retrieve_timesteps(
            num_steps, device, timesteps=None, sigmas=sigmas
        )
        latent = self.iterative_denoise(
            latent=latent,
            timesteps=timesteps,
            num_steps=num_steps,
            verbose=not disable_progress,
            cfg=classifier_free_guidance,
            cfg_scale=guidance_scale,
            backbone_input={
                "x_mask": latent_mask,
                "context": context,
                "context_mask": context_mask,
            }
        )

        waveform = self.autoencoder.decode(latent)
        return waveform
724
+
725
+
726
class DummyContentAudioFlowMatching(CrossAttentionAudioFlowMatching):
    """Flow matching with a duration adapter and two content streams:

    - a cross-attention ``context`` used for non-time-aligned samples (for
      time-aligned samples it is replaced by a learned dummy embedding), and
    - a frame-rate ``time_aligned_context`` built by expanding token content
      with predicted/ground-truth durations (non-time-aligned samples get a
      learned dummy embedding there instead).
    """
    def __init__(
        self,
        autoencoder: AutoEncoderBase,
        content_encoder: ContentEncoder,
        content_adapter: ContentAdapterBase,
        backbone: nn.Module,
        content_dim: int,
        frame_resolution: float,
        duration_offset: float = 1.0,
        cfg_drop_ratio: float = 0.2,
        sample_strategy: str = 'normal',
        num_train_steps: int = 1000
    ):

        super().__init__(
            autoencoder=autoencoder,
            content_encoder=content_encoder,
            content_adapter=content_adapter,
            backbone=backbone,
            content_dim=content_dim,
            frame_resolution=frame_resolution,
            duration_offset=duration_offset,
            cfg_drop_ratio=cfg_drop_ratio,
            sample_strategy=sample_strategy,
            num_train_steps=num_train_steps
        )
        # re-run the mixin init: the parent left frame_resolution unset,
        # while this class needs it for local (per-token) durations
        DurationAdapterMixin.__init__(
            self,
            latent_token_rate=autoencoder.latent_token_rate,
            offset=duration_offset,
            frame_resolution=frame_resolution
        )
        # learned placeholders: nta for the cross-attention context of
        # time-aligned samples, ta for the time-aligned stream of
        # non-time-aligned samples
        self.dummy_nta_embed = nn.Parameter(torch.zeros(content_dim))
        self.dummy_ta_embed = nn.Parameter(torch.zeros(content_dim))

    def get_backbone_input(
        self, target_length: int, content: torch.Tensor,
        content_mask: torch.Tensor, time_aligned_content: torch.Tensor,
        length_aligned_content: torch.Tensor, is_time_aligned: torch.Tensor
    ):
        """Assemble (context, context_mask, time_aligned_context) for the
        backbone. NOTE: mutates ``content`` in place via the ``context``
        alias and writes into ``time_aligned_content``.
        """
        # TODO compatility for 2D spectrogram VAE
        time_aligned_content = trim_or_pad_length(
            time_aligned_content, target_length, 1
        )
        length_aligned_content = trim_or_pad_length(
            length_aligned_content, target_length, 1
        )
        # time_aligned_content: from monotonic aligned input, without frame expansion (phoneme)
        # length_aligned_content: from aligned input (f0/energy)
        time_aligned_content = time_aligned_content + length_aligned_content
        # non-time-aligned samples carry no real frame-level content: use
        # the learned dummy embedding (broadcast over frames)
        time_aligned_content[~is_time_aligned] = self.dummy_ta_embed.to(
            time_aligned_content.dtype
        )

        context = content
        context[is_time_aligned] = self.dummy_nta_embed.to(context.dtype)
        # only use the first dummy non time aligned embedding
        context_mask = content_mask.detach().clone()
        context_mask[is_time_aligned, 1:] = False

        # truncate dummy non time aligned context
        if is_time_aligned.sum().item() < content.size(0):
            trunc_nta_length = content_mask[~is_time_aligned].sum(1).max()
        else:
            trunc_nta_length = content.size(1)
        context = context[:, :trunc_nta_length]
        context_mask = context_mask[:, :trunc_nta_length]

        return context, context_mask, time_aligned_content

    def forward(
        self,
        content: list[Any],
        duration: Sequence[float],
        task: list[str],
        is_time_aligned: Sequence[bool],
        waveform: torch.Tensor,
        waveform_lengths: torch.Tensor,
        instruction: torch.Tensor,
        instruction_lengths: torch.Tensor,
        loss_reduce: bool = True,
        **kwargs
    ):
        """Training/validation step.

        Returns a dict with "diff_loss", "local_duration_loss" and
        "global_duration_loss".

        NOTE(review): ``duration`` is annotated Sequence[float] but is used
        as a 2-D tensor (``.size(1)``, ``F.pad``) — presumably the collator
        supplies a (B, T_tok) tensor of seconds; confirm upstream.
        """
        device = self.dummy_param.device
        # always reduce during training, honor the flag during evaluation
        loss_reduce = self.training or (loss_reduce and not self.training)

        # frozen autoencoder: encode target waveform to latent without grads
        self.autoencoder.eval()
        with torch.no_grad():
            latent, latent_mask = self.autoencoder.encode(
                waveform.unsqueeze(1), waveform_lengths
            )

        (
            content, content_mask, global_duration_pred, local_duration_pred,
            length_aligned_content
        ) = self.encode_content_with_instruction(
            content, task, device, instruction, instruction_lengths
        )

        # truncate unused non time aligned duration prediction
        if is_time_aligned.sum() > 0:
            trunc_ta_length = content_mask[is_time_aligned].sum(1).max()
        else:
            trunc_ta_length = content.size(1)

        # duration loss
        local_duration_pred = local_duration_pred[:, :trunc_ta_length]
        ta_content_mask = content_mask[:, :trunc_ta_length]
        local_duration_loss = self.get_local_duration_loss(
            duration,
            local_duration_pred,
            ta_content_mask,
            is_time_aligned,
            reduce=loss_reduce
        )

        global_duration_loss = self.get_global_duration_loss(
            global_duration_pred, latent_mask, reduce=loss_reduce
        )

        # --------------------------------------------------------------------
        # prepare latent and noise
        # --------------------------------------------------------------------
        noisy_latent, target, timesteps = self.get_input_target_and_timesteps(
            latent, training=self.training
        )

        # --------------------------------------------------------------------
        # duration adapter (uses ground-truth durations during training)
        # --------------------------------------------------------------------
        if is_time_aligned.sum() == 0 and \
                duration.size(1) < content_mask.size(1):
            duration = F.pad(
                duration, (0, content_mask.size(1) - duration.size(1))
            )
        time_aligned_content, _ = self.expand_by_duration(
            x=content[:, :trunc_ta_length],
            content_mask=ta_content_mask,
            local_duration=duration,
        )

        # --------------------------------------------------------------------
        # prepare input to the backbone
        # --------------------------------------------------------------------
        # TODO compatility for 2D spectrogram VAE
        latent_length = noisy_latent.size(self.autoencoder.time_dim)
        context, context_mask, time_aligned_content = self.get_backbone_input(
            latent_length, content, content_mask, time_aligned_content,
            length_aligned_content, is_time_aligned
        )

        # --------------------------------------------------------------------
        # classifier free guidance: randomly drop both content streams
        # --------------------------------------------------------------------
        if self.training and self.classifier_free_guidance:
            mask_indices = [
                k for k in range(len(waveform))
                if random.random() < self.cfg_drop_ratio
            ]
            if len(mask_indices) > 0:
                context[mask_indices] = 0
                time_aligned_content[mask_indices] = 0

        pred: torch.Tensor = self.backbone(
            x=noisy_latent,
            x_mask=latent_mask,
            timesteps=timesteps,
            context=context,
            context_mask=context_mask,
            time_aligned_context=time_aligned_content,
        )
        # time axis to dim 1 so the (B, T) mask applies
        pred = pred.transpose(1, self.autoencoder.time_dim)
        target = target.transpose(1, self.autoencoder.time_dim)
        diff_loss = F.mse_loss(pred, target, reduction="none")
        diff_loss = loss_with_mask(diff_loss, latent_mask, reduce=loss_reduce)
        return {
            "diff_loss": diff_loss,
            "local_duration_loss": local_duration_loss,
            "global_duration_loss": global_duration_loss,
        }

    def inference(
        self,
        content: list[Any],
        task: list[str],
        is_time_aligned: Sequence[bool],
        instruction: torch.Tensor,
        instruction_lengths: Sequence[int],
        num_steps: int = 20,
        sway_sampling_coef: float | None = -1.0,
        guidance_scale: float = 3.0,
        disable_progress: bool = True,
        use_gt_duration: bool = False,
        **kwargs
    ):
        """Generate waveforms; output length is derived from predicted
        (or, optionally, ground-truth) durations.
        """
        device = self.dummy_param.device
        classifier_free_guidance = guidance_scale > 1.0

        (
            content, content_mask, global_duration_pred, local_duration_pred,
            length_aligned_content
        ) = self.encode_content_with_instruction(
            content, task, device, instruction, instruction_lengths
        )
        batch_size = content.size(0)

        # truncate dummy time aligned duration prediction
        is_time_aligned = torch.as_tensor(is_time_aligned)
        if is_time_aligned.sum() > 0:
            trunc_ta_length = content_mask[is_time_aligned].sum(1).max()
        else:
            trunc_ta_length = content.size(1)

        # prepare local duration (log domain -> seconds)
        local_duration = self.prepare_local_duration(
            local_duration_pred, content_mask
        )
        local_duration = local_duration[:, :trunc_ta_length]
        # use ground truth duration
        if use_gt_duration and "duration" in kwargs:
            local_duration = torch.as_tensor(kwargs["duration"]).to(device)

        # prepare global duration (time-aligned rows use summed local
        # durations, others the global head)
        global_duration = self.prepare_global_duration(
            global_duration_pred, local_duration, is_time_aligned
        )

        # --------------------------------------------------------------------
        # duration adapter: expand token content to frame rate
        # --------------------------------------------------------------------
        time_aligned_content, latent_mask = self.expand_by_duration(
            x=content[:, :trunc_ta_length],
            content_mask=content_mask[:, :trunc_ta_length],
            local_duration=local_duration,
            global_duration=global_duration,
        )

        context, context_mask, time_aligned_content = self.get_backbone_input(
            target_length=time_aligned_content.size(1),
            content=content,
            content_mask=content_mask,
            time_aligned_content=time_aligned_content,
            length_aligned_content=length_aligned_content,
            is_time_aligned=is_time_aligned
        )

        # --------------------------------------------------------------------
        # prepare unconditional input (uncond half first, matching chunk(2))
        # --------------------------------------------------------------------
        if classifier_free_guidance:
            uncond_time_aligned_content = torch.zeros_like(
                time_aligned_content
            )
            uncond_context = torch.zeros_like(context)
            uncond_context_mask = context_mask.detach().clone()
            time_aligned_content = torch.cat([
                uncond_time_aligned_content, time_aligned_content
            ])
            context = torch.cat([uncond_context, context])
            context_mask = torch.cat([uncond_context_mask, context_mask])
            latent_mask = torch.cat([
                latent_mask, latent_mask.detach().clone()
            ])

        # --------------------------------------------------------------------
        # prepare input to the backbone
        # --------------------------------------------------------------------
        latent_length = latent_mask.sum(1).max().item()
        # None entries in latent_shape mark the variable time axis
        latent_shape = tuple(
            latent_length if dim is None else dim
            for dim in self.autoencoder.latent_shape
        )
        shape = (batch_size, *latent_shape)
        latent = randn_tensor(
            shape, generator=None, device=device, dtype=content.dtype
        )

        # linear sigma schedule when sway sampling disabled, warped otherwise
        if not sway_sampling_coef:
            sigmas = np.linspace(1.0, 1 / num_steps, num_steps)
        else:
            t = torch.linspace(0, 1, num_steps + 1)
            t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
            sigmas = 1 - t
        timesteps, num_steps = self.retrieve_timesteps(
            num_steps, device, timesteps=None, sigmas=sigmas
        )
        latent = self.iterative_denoise(
            latent=latent,
            timesteps=timesteps,
            num_steps=num_steps,
            verbose=not disable_progress,
            cfg=classifier_free_guidance,
            cfg_scale=guidance_scale,
            backbone_input={
                "x_mask": latent_mask,
                "context": context,
                "context_mask": context_mask,
                "time_aligned_context": time_aligned_content,
            }
        )

        waveform = self.autoencoder.decode(latent)
        return waveform
1031
+
1032
+
1033
class DoubleContentAudioFlowMatching(DummyContentAudioFlowMatching):
    """Variant that keeps the full content as cross-attention context for
    every sample and, for non-time-aligned samples, also copies the content
    into the time-aligned stream (instead of a learned dummy embedding).
    """
    def get_backbone_input(
        self, target_length: int, content: torch.Tensor,
        content_mask: torch.Tensor, time_aligned_content: torch.Tensor,
        length_aligned_content: torch.Tensor, is_time_aligned: torch.Tensor
    ):
        # TODO compatility for 2D spectrogram VAE
        time_aligned_content = trim_or_pad_length(
            time_aligned_content, target_length, 1
        )
        # for non-time-aligned samples, fill the time-aligned stream with
        # the raw content up to the shorter of the two sequence lengths
        # (in-place write into the trimmed/padded tensor)
        context_length = min(content.size(1), time_aligned_content.size(1))
        time_aligned_content[~is_time_aligned, :context_length] = content[
            ~is_time_aligned, :context_length]
        length_aligned_content = trim_or_pad_length(
            length_aligned_content, target_length, 1
        )
        # time_aligned_content: from monotonic aligned input, without frame expansion (phoneme)
        # length_aligned_content: from aligned input (f0/energy)
        time_aligned_content = time_aligned_content + length_aligned_content

        # NOTE(review): `context` aliases `content` (no copy); unlike the
        # parent class there is no dummy replacement or truncation here
        context = content
        context_mask = content_mask.detach().clone()

        return context, context_mask, time_aligned_content
1057
+
1058
+
1059
class HybridContentAudioFlowMatching(DummyContentAudioFlowMatching):
    """Variant that keeps the full content as cross-attention context for
    every sample, while non-time-aligned samples still receive the learned
    dummy embedding in the time-aligned stream (as in the parent class).
    """
    def get_backbone_input(
        self, target_length: int, content: torch.Tensor,
        content_mask: torch.Tensor, time_aligned_content: torch.Tensor,
        length_aligned_content: torch.Tensor, is_time_aligned: torch.Tensor
    ):
        # TODO compatility for 2D spectrogram VAE
        time_aligned_content = trim_or_pad_length(
            time_aligned_content, target_length, 1
        )
        length_aligned_content = trim_or_pad_length(
            length_aligned_content, target_length, 1
        )
        # time_aligned_content: from monotonic aligned input, without frame expansion (phoneme)
        # length_aligned_content: from aligned input (f0/energy)
        time_aligned_content = time_aligned_content + length_aligned_content
        # non-time-aligned samples carry no frame-level content: substitute
        # the learned dummy embedding (broadcast over frames)
        time_aligned_content[~is_time_aligned] = self.dummy_ta_embed.to(
            time_aligned_content.dtype
        )

        # NOTE(review): `context` aliases `content`; no dummy replacement or
        # truncation of the cross-attention context in this variant
        context = content
        context_mask = content_mask.detach().clone()

        return context, context_mask, time_aligned_content
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.26.0
2
+ # --- Core Framework (Pinned Versions) ---
3
+ torch==2.5.1
4
+ torchvision==0.20.1
5
+ torchaudio==2.5.1
6
+
7
+ # --- Deep Learning & Utilities ---
8
+ diffusers
9
+ transformers
10
+ accelerate
11
+ einops
12
+ alias_free_torch
13
+ tqdm
14
+ torchdata
15
+
16
+ # --- Config & Data ---
17
+ hydra-core
18
+ omegaconf
19
+ h5py
20
+
21
+ # --- Audio ---
22
+ librosa
23
+ soundfile
24
+
25
+ # --- Logging ---
26
+ wandb
27
+ tensorboard
28
+ swanlab
stabilityai/stable-diffusion-2-1/scheduler/scheduler_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.8.0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "num_train_timesteps": 1000,
9
+ "prediction_type": "v_prediction",
10
+ "set_alpha_to_one": false,
11
+ "skip_prk_steps": true,
12
+ "steps_offset": 1,
13
+ "trained_betas": null
14
+ }
utils/__pycache__/config.cpython-310.pyc ADDED
Binary file (1.7 kB). View file
 
utils/__pycache__/torch_utilities.cpython-310.pyc ADDED
Binary file (8.33 kB). View file
 
utils/accelerate_utilities.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from accelerate import Accelerator
2
+
3
+
4
class AcceleratorSaveTrainableParams(Accelerator):
    """Accelerator whose checkpoints contain only the parameters listed in
    the model's ``param_names_to_save`` attribute (when present); models
    without that attribute are saved in full.
    """
    def get_state_dict(self, model, unwrap=True):
        full_state = super().get_state_dict(model, unwrap)
        if not hasattr(model, "param_names_to_save"):
            return full_state
        keep = model.param_names_to_save
        return {name: tensor for name, tensor in full_state.items() if name in keep}
utils/audio.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torchaudio
4
+
5
+
6
class PadCrop(nn.Module):
    """Return a fixed-length (channels, n_samples) view of a signal:
    shorter inputs are zero-padded at the end, longer ones are cropped
    (from a random start when ``randomize`` is True, else from 0).
    """
    def __init__(self, n_samples, randomize=True):
        super().__init__()
        self.n_samples = n_samples
        self.randomize = randomize

    def __call__(self, signal):
        n_channels, length = signal.shape
        if self.randomize:
            # random crop start, clamped so the window stays in bounds
            max_start = max(0, length - self.n_samples)
            start = torch.randint(0, max_start + 1, []).item()
        else:
            start = 0
        segment = signal[:, start:start + self.n_samples]
        output = signal.new_zeros([n_channels, self.n_samples])
        # copy whatever fits; the remainder stays zero (padding)
        output[:, :min(length, self.n_samples)] = segment
        return output
+
23
+
24
+ def set_audio_channels(audio, target_channels):
25
+ if target_channels == 1:
26
+ # Convert to mono
27
+ audio = audio.mean(1, keepdim=True)
28
+ elif target_channels == 2:
29
+ # Convert to stereo
30
+ if audio.shape[1] == 1:
31
+ audio = audio.repeat(1, 2, 1)
32
+ elif audio.shape[1] > 2:
33
+ audio = audio[:, :2, :]
34
+ return audio
35
+
36
+
37
+ def prepare_audio(
38
+ audio, in_sr, target_sr, target_length, target_channels, device
39
+ ):
40
+
41
+ audio = audio.to(device)
42
+
43
+ if in_sr != target_sr:
44
+ resample_tf = torchaudio.transforms.Resample(in_sr,
45
+ target_sr).to(device)
46
+ audio = resample_tf(audio)
47
+
48
+ audio = PadCrop(target_length, randomize=False)(audio)
49
+
50
+ # Add batch dimension
51
+ if audio.dim() == 1:
52
+ audio = audio.unsqueeze(0).unsqueeze(0)
53
+ elif audio.dim() == 2:
54
+ audio = audio.unsqueeze(0)
55
+
56
+ audio = set_audio_channels(audio, target_channels)
57
+
58
+ return audio
utils/config.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import sys
3
+ import os
4
+
5
+ import hydra
6
+ import omegaconf
7
+ from omegaconf import OmegaConf
8
+
9
+
10
def multiply(*args):
    """Return the product of all arguments (1 when called with none)."""
    product = 1
    for factor in args:
        product *= factor
    return product
15
+
16
+
17
def get_pitch_downsample_ratio(
    autoencoder_config: dict, pitch_frame_resolution: float
):
    """Return how many pitch frames correspond to one latent frame.

    The latent frame resolution (seconds per latent frame) is the
    autoencoder's downsampling ratio divided by its sample rate.
    """
    seconds_per_latent_frame = (
        autoencoder_config["downsampling_ratio"] /
        autoencoder_config["sample_rate"]
    )
    return round(seconds_per_latent_frame / pitch_frame_resolution)
23
+
24
+
25
def register_omegaconf_resolvers() -> None:
    """
    Register custom resolver for hydra configs, which can be used in YAML
    files for dynamically setting values
    """
    OmegaConf.clear_resolvers()
    for resolver_name, fn in (
        ("len", len),
        ("multiply", multiply),
        ("get_pitch_downsample_ratio", get_pitch_downsample_ratio),
    ):
        OmegaConf.register_new_resolver(resolver_name, fn, replace=True)
36
+
37
+
38
def generate_config_from_command_line_overrides(
    config_file: str | Path
) -> omegaconf.DictConfig:
    """Compose a hydra config from ``config_file``, applying any override
    strings found in ``sys.argv[1:]``, and return it fully resolved.
    """
    register_omegaconf_resolvers()

    resolved_file = Path(config_file).resolve()
    config_name = str(resolved_file.name)
    # hydra.initialize wants the config dir relative to this module
    config_dir = os.path.relpath(
        str(resolved_file.parent), Path(__file__).resolve().parent
    )

    overrides = sys.argv[1:]
    with hydra.initialize(version_base=None, config_path=config_dir):
        config = hydra.compose(config_name=config_name, overrides=overrides)
        omegaconf.OmegaConf.resolve(config)

    return config
utils/diffsinger_utilities.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import six
2
+ from pathlib import Path
3
+ import re
4
+ import json
5
+ from collections import OrderedDict
6
+ from typing import Union
7
+
8
+ import numpy as np
9
+ import librosa
10
+ import torch
11
+
12
# Special vocabulary tokens shared by the text encoders below.
PAD = "<pad>"
EOS = "<EOS>"
UNK = "<UNK>"
SEG = "|"  # segment separator; encoders fall back to EOS when absent
RESERVED_TOKENS = [PAD, EOS, UNK]
NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)
PAD_ID = RESERVED_TOKENS.index(PAD)  # Normally 0
EOS_ID = RESERVED_TOKENS.index(EOS)  # Normally 1
UNK_ID = RESERVED_TOKENS.index(UNK)  # Normally 2

# F0 (fundamental frequency) quantization settings: f0 values in Hz are
# converted to the mel scale and bucketed into F0_BIN coarse bins
# (see `f0_to_coarse`).
F0_BIN = 256
F0_MAX = 1100.0  # Hz
F0_MIN = 50.0  # Hz
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)  # mel value of F0_MIN
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)  # mel value of F0_MAX
28
+
29
def f0_to_coarse(f0):
    """
    Quantize f0 (Hz) into coarse integer bins in [1, F0_BIN - 1].

    f0 is mapped to the mel scale, positive mel values are linearly
    rescaled into (1, F0_BIN - 1], and the result is rounded. Unvoiced
    frames (f0 == 0) end up in bin 1. Accepts either a `torch.Tensor` or
    an `np.ndarray` and returns the matching kind.
    """
    is_torch = isinstance(f0, torch.Tensor)
    # mel conversion: 1127 * ln(1 + f / 700)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch \
        else 1127 * np.log(1 + f0 / 700)
    # Rescale voiced frames so that [F0_MEL_MIN, F0_MEL_MAX] maps onto
    # roughly [1, F0_BIN - 1]; unvoiced frames (mel == 0) are untouched.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - F0_MEL_MIN) \
        * (F0_BIN - 2) / (F0_MEL_MAX - F0_MEL_MIN) + 1

    # Clamp into the valid bin range before rounding.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > F0_BIN - 1] = F0_BIN - 1
    # NOTE(review): the torch branch rounds half-up (+0.5 then truncate)
    # while the numpy branch uses banker's rounding (np.rint) — confirm if
    # exact parity between the two backends matters.
    f0_coarse = (f0_mel + 0.5).long() if is_torch \
        else np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
        f0_coarse.max(), f0_coarse.min()
    )
    return f0_coarse
45
+
46
+
47
def norm_f0(
    f0: Union[np.ndarray, torch.Tensor],
    uv: Union[None, np.ndarray],
    f0_mean: float,
    f0_std: float,
    pitch_norm: str = "log",
    use_uv: bool = True
):
    """
    Normalize f0 with either z-score ('standard') or base-2 log ('log')
    normalization, then zero out unvoiced frames when a `uv` mask is
    provided and `use_uv` is set.
    """
    if pitch_norm == 'standard':
        f0 = (f0 - f0_mean) / f0_std
    if pitch_norm == 'log':
        log2_fn = torch.log2 if isinstance(f0, torch.Tensor) else np.log2
        f0 = log2_fn(f0)
    if use_uv and uv is not None:
        f0[uv > 0] = 0
    return f0
63
+
64
+
65
def norm_interp_f0(
    f0: Union[np.ndarray, torch.Tensor],
    f0_mean: float,
    f0_std: float,
    pitch_norm: str = "log",
    use_uv: bool = True
):
    """
    Normalize f0 and fill unvoiced gaps by linear interpolation.

    Returns:
        f0: normalized f0 with unvoiced frames interpolated from the
            surrounding voiced values (all zeros when fully unvoiced);
            returned on the input's device when given a tensor.
        uv: float array/tensor marking unvoiced frames (1.0 = unvoiced).
    """
    is_torch = isinstance(f0, torch.Tensor)
    if is_torch:
        device = f0.device
        # interpolation below is done in numpy
        f0 = f0.data.cpu().numpy()
    uv = f0 == 0  # unvoiced mask: frames with exactly 0 Hz
    f0 = norm_f0(f0, uv, f0_mean, f0_std, pitch_norm, use_uv)
    if sum(uv) == len(f0):
        # fully unvoiced: nothing to interpolate from
        f0[uv] = 0
    elif sum(uv) > 0:
        # fill unvoiced positions from the neighboring voiced values
        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
    uv = torch.as_tensor(uv).float()
    f0 = torch.as_tensor(f0).float()
    if is_torch:
        f0 = f0.to(device)
    return f0, uv
87
+
88
+
89
def denorm_f0(
    f0,
    uv,
    pitch_norm="log",
    f0_mean=None,
    f0_std=None,
    pitch_padding=None,
    min=None,
    max=None,
    use_uv=True
):
    """
    Invert `norm_f0`: map normalized f0 back to Hz.

    Args:
        f0: normalized f0 tensor.
        uv: optional unvoiced mask; frames with uv > 0 are zeroed when
            `use_uv` is set.
        pitch_norm: 'standard' (z-score) or 'log' (base-2 log), matching
            the scheme used during normalization.
        f0_mean, f0_std: statistics used by the 'standard' scheme.
        pitch_padding: optional boolean mask of padded frames to zero out.
        min, max: optional clamp bounds in Hz (parameter names shadow the
            builtins but are kept for caller compatibility).
    """
    if pitch_norm == 'standard':
        f0 = f0 * f0_std + f0_mean
    if pitch_norm == 'log':
        f0 = 2**f0
    if min is not None:
        f0 = f0.clamp(min=min)
    if max is not None:
        f0 = f0.clamp(max=max)
    if uv is not None and use_uv:
        f0[uv > 0] = 0
    if pitch_padding is not None:
        f0[pitch_padding] = 0
    return f0
113
+
114
+
115
def librosa_pad_lr(x, fshift, pad_sides=1):
    """
    Compute padding so that len(x) becomes the next multiple of `fshift`
    (always at least one extra hop).

    Returns (left, right) padding: right-only when `pad_sides == 1`,
    split across both sides (extra sample on the right) when 2.
    """
    assert pad_sides in (1, 2)
    total = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
    if pad_sides == 1:
        return 0, total
    left = total // 2
    return left, total - left
125
+
126
+
127
def get_pitch(
    wav_file: Union[str, Path], sample_rate: int, frame_shift: float
):
    """
    Extract frame-level f0 from an audio file with Praat (parselmouth).

    Args:
        wav_file: path to the audio file.
        sample_rate: sample rate to load the audio at.
        frame_shift: hop between analysis frames, in seconds.

    Returns:
        f0: np.ndarray of f0 values in Hz (0 for unvoiced frames), padded
            to `len(wav) // hop_size` frames.
        pitch_coarse: the coarse quantized bins from `f0_to_coarse`.
    """
    # local import: parselmouth is only needed for this helper
    import parselmouth

    hop_size = int(frame_shift * sample_rate)
    wav, _ = librosa.core.load(wav_file, sr=sample_rate)

    expected_frames = wav.shape[0] // hop_size

    f0 = parselmouth.Sound(wav, sample_rate).to_pitch_ac(
        time_step=frame_shift,
        voicing_threshold=0.6,
        pitch_floor=80,
        pitch_ceiling=750,
    ).selected_array['frequency']
    # Praat may emit slightly fewer frames than expected; pad by repeating
    # the last value so downstream frame counts line up.
    shortfall = expected_frames - len(f0)
    if shortfall > 0:
        f0 = np.concatenate([f0, [f0[-1]] * shortfall], 0)
    pitch_coarse = f0_to_coarse(f0)
    return f0, pitch_coarse
152
+
153
+
154
def remove_empty_lines(text):
    """
    Strip every line and drop all empty lines.

    Args:
        text: non-empty list of strings.

    Returns:
        list[str]: stripped lines with every empty entry removed.
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # Fix: the previous implementation used `list.remove("")`, which drops
    # only the FIRST empty line; filter them all as the docstring promises.
    return [stripped for stripped in (t.strip() for t in text) if stripped]
162
+
163
+
164
def is_sil_phoneme(p):
    """A phoneme counts as silence when its first character is not alphabetic."""
    first_char = p[0]
    return not first_char.isalpha()
166
+
167
+
168
def strip_ids(ids, ids_to_strip):
    """Return a copy of `ids` with trailing elements found in `ids_to_strip` removed."""
    trimmed = list(ids)
    while trimmed and trimmed[-1] in ids_to_strip:
        trimmed.pop()
    return trimmed
174
+
175
+
176
class TextEncoder(object):
    """Base class for converting from ints to/from human readable strings."""
    def __init__(self, num_reserved_ids=NUM_RESERVED_TOKENS):
        self._num_reserved_ids = num_reserved_ids

    @property
    def num_reserved_ids(self):
        """Number of ids reserved for special tokens (PAD/EOS/UNK)."""
        return self._num_reserved_ids

    def encode(self, s):
        """Transform a human-readable string into a sequence of int ids.

        The ids should be in the range [num_reserved_ids, vocab_size); ids
        in [0, num_reserved_ids) are reserved. EOS is not appended.

        Args:
            s: human-readable string to be converted.

        Returns:
            ids: list of integers
        """
        return [int(word) + self._num_reserved_ids for word in s.split()]

    def decode(self, ids, strip_extraneous=False):
        """Transform a sequence of int ids into a human-readable string.

        EOS is not expected in ids.

        Args:
            ids: list of integers to be converted.
            strip_extraneous: bool, whether to strip off extraneous tokens
                (EOS and PAD).

        Returns:
            s: human-readable string.
        """
        if strip_extraneous:
            ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
        return " ".join(self.decode_list(ids))

    def decode_list(self, ids):
        """Transform a sequence of int ids into their string versions.

        Reserved ids map to their token text; all other ids are shifted
        back down by `num_reserved_ids` and stringified.

        Args:
            ids: list of integers to be converted.

        Returns:
            strs: list of human-readable strings.
        """
        pieces = []
        for raw_id in ids:
            if 0 <= raw_id < self._num_reserved_ids:
                pieces.append(RESERVED_TOKENS[int(raw_id)])
            else:
                pieces.append(raw_id - self._num_reserved_ids)
        return [str(piece) for piece in pieces]

    @property
    def vocab_size(self):
        raise NotImplementedError()
242
+
243
+
244
class TokenTextEncoder(TextEncoder):
    """Encoder based on a user-supplied vocabulary (file or list)."""
    def __init__(
        self,
        vocab_filename,
        reverse=False,
        vocab_list=None,
        replace_oov=None,
        num_reserved_ids=NUM_RESERVED_TOKENS
    ):
        """Initialize from a file or list, one token per line.

        Handling of reserved tokens works as follows:
        - When initializing from a list, we add reserved tokens to the vocab.
        - When initializing from a file, we do not add reserved tokens to the vocab.
        - When saving vocab files, we save reserved tokens to the file.

        Args:
          vocab_filename: If not None, the full filename to read vocab from. If this
             is not None, then vocab_list should be None.
          reverse: Boolean indicating if tokens should be reversed during encoding
             and decoding.
          vocab_list: If not None, a list of elements of the vocabulary. If this is
             not None, then vocab_filename should be None.
          replace_oov: If not None, every out-of-vocabulary token seen when
             encoding will be replaced by this string (which must be in vocab).
          num_reserved_ids: Number of IDs to save for reserved tokens like <EOS>.
        """
        super(TokenTextEncoder,
              self).__init__(num_reserved_ids=num_reserved_ids)
        self._reverse = reverse
        self._replace_oov = replace_oov
        if vocab_filename:
            self._init_vocab_from_file(vocab_filename)
        else:
            assert vocab_list is not None
            self._init_vocab_from_list(vocab_list)
        # Cache special-token ids for fast access.
        self.pad_index = self._token_to_id[PAD]
        self.eos_index = self._token_to_id[EOS]
        self.unk_index = self._token_to_id[UNK]
        # The segment separator falls back to EOS when "|" is not in vocab.
        self.seg_index = self._token_to_id[
            SEG] if SEG in self._token_to_id else self.eos_index

    def encode(self, s):
        """Converts a space-separated string of tokens to a list of ids."""
        sentence = s
        tokens = sentence.strip().split()
        if self._replace_oov is not None:
            # Substitute the designated replacement for any OOV token.
            tokens = [
                t if t in self._token_to_id else self._replace_oov
                for t in tokens
            ]
        ret = [self._token_to_id[tok] for tok in tokens]
        return ret[::-1] if self._reverse else ret

    def decode(self, ids, strip_eos=False, strip_padding=False):
        """Convert ids back to a space-separated token string, optionally
        truncating at the first PAD and/or EOS id."""
        if strip_padding and self.pad() in list(ids):
            pad_pos = list(ids).index(self.pad())
            ids = ids[:pad_pos]
        if strip_eos and self.eos() in list(ids):
            eos_pos = list(ids).index(self.eos())
            ids = ids[:eos_pos]
        return " ".join(self.decode_list(ids))

    def decode_list(self, ids):
        """Map each id to its token string (restoring order if reversed)."""
        seq = reversed(ids) if self._reverse else ids
        return [self._safe_id_to_token(i) for i in seq]

    @property
    def vocab_size(self):
        return len(self._id_to_token)

    def __len__(self):
        return self.vocab_size

    def _safe_id_to_token(self, idx):
        # Unknown ids decode to "ID_<n>" instead of raising.
        return self._id_to_token.get(idx, "ID_%d" % idx)

    def _init_vocab_from_file(self, filename):
        """Load vocab from a file.

        Args:
          filename: The file to load vocabulary from.
        """
        with open(filename) as f:
            tokens = [token.strip() for token in f.readlines()]

        def token_gen():
            for token in tokens:
                yield token

        # File-based vocabs are expected to already contain reserved tokens.
        self._init_vocab(token_gen(), add_reserved_tokens=False)

    def _init_vocab_from_list(self, vocab_list):
        """Initialize tokens from a list of tokens.

        It is ok if reserved tokens appear in the vocab list. They will be
        removed. The set of tokens in vocab_list should be unique.

        Args:
          vocab_list: A list of tokens.
        """
        def token_gen():
            for token in vocab_list:
                if token not in RESERVED_TOKENS:
                    yield token

        self._init_vocab(token_gen())

    def _init_vocab(self, token_generator, add_reserved_tokens=True):
        """Initialize vocabulary with tokens from token_generator."""

        self._id_to_token = {}
        non_reserved_start_index = 0

        if add_reserved_tokens:
            # Reserved tokens occupy the lowest ids.
            self._id_to_token.update(enumerate(RESERVED_TOKENS))
            non_reserved_start_index = len(RESERVED_TOKENS)

        self._id_to_token.update(
            enumerate(token_generator, start=non_reserved_start_index)
        )

        # _token_to_id is the reverse of _id_to_token
        self._token_to_id = dict(
            (v, k) for k, v in six.iteritems(self._id_to_token)
        )

    def pad(self):
        """Id of the PAD token."""
        return self.pad_index

    def eos(self):
        """Id of the EOS token."""
        return self.eos_index

    def unk(self):
        """Id of the UNK token."""
        return self.unk_index

    def seg(self):
        """Id of the segment separator (EOS when '|' is not in vocab)."""
        return self.seg_index

    def store_to_file(self, filename):
        """Write vocab file to disk.

        Vocab files have one token per line. The file ends in a newline. Reserved
        tokens are written to the vocab file as well.

        Args:
          filename: Full path of the file to store the vocab to.
        """
        with open(filename, "w") as f:
            for i in range(len(self._id_to_token)):
                f.write(self._id_to_token[i] + "\n")

    def sil_phonemes(self):
        """All vocab tokens whose first character is non-alphabetic (silences)."""
        return [p for p in self._id_to_token.values() if not p[0].isalpha()]
399
+
400
+
401
class TextGrid(object):
    """
    Minimal parser for Praat TextGrid files (long text format).

    `text` is the file content as a list of lines; parsing is driven by
    `self.line_count`, a cursor advanced by `_extract_pattern`. Only
    IntervalTier tiers are supported; parsed tiers land in `self.tier_list`.
    """
    def __init__(self, text):
        text = remove_empty_lines(text)
        self.text = text
        self.line_count = 0  # cursor into `self.text`
        self._get_type()
        self._get_time_intval()
        self._get_size()
        self.tier_list = []
        self._get_item_list()

    def _extract_pattern(self, pattern, inc):
        """
        Parameters
        ----------
        pattern : regex to extract pattern
        inc : increment of line count after extraction
        Returns
        -------
        group : extracted info
        """
        try:
            group = re.match(pattern, self.text[self.line_count]).group(1)
            self.line_count += inc
        except AttributeError:
            # re.match returned None: the current line does not match.
            raise ValueError(
                "File format error at line %d:%s" %
                (self.line_count, self.text[self.line_count])
            )
        return group

    def _get_type(self):
        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)

    def _get_time_intval(self):
        # Global start/end times of the TextGrid (kept as strings).
        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)

    def _get_size(self):
        # Number of tiers in the file.
        self.size = int(self._extract_pattern(r"size = (.*)", 2))

    def _get_item_list(self):
        """Only supports IntervalTier currently"""
        for itemIdx in range(1, self.size + 1):
            tier = OrderedDict()
            item_list = []
            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
            if tier_class != "IntervalTier":
                raise NotImplementedError(
                    "Only IntervalTier class is supported currently"
                )
            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
            # Each interval carries its index, time bounds, and label.
            for i in range(int(tier_size)):
                item = OrderedDict()
                item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
                item_list.append(item)
            tier["idx"] = tier_idx
            tier["class"] = tier_class
            tier["name"] = tier_name
            tier["xmin"] = tier_xmin
            tier["xmax"] = tier_xmax
            tier["size"] = tier_size
            tier["items"] = item_list
            self.tier_list.append(tier)

    def toJson(self):
        """Serialize the parsed TextGrid to a JSON string."""
        _json = OrderedDict()
        _json["file_type"] = self.file_type
        _json["xmin"] = self.xmin
        _json["xmax"] = self.xmax
        _json["size"] = self.size
        _json["tiers"] = self.tier_list
        return json.dumps(_json, ensure_ascii=False, indent=2)
481
+
482
+
483
def read_duration_from_textgrid(
    textgrid_path: Union[str, Path],
    phoneme: str,
    utterance_duration: float,
):
    """
    Align a phoneme sequence with a Praat TextGrid and return per-phoneme
    durations (seconds).

    Args:
        textgrid_path: path to the TextGrid file; the LAST tier is used.
        phoneme: space-separated phoneme sequence to align.
        utterance_duration: total utterance length, used as the final
            boundary.

    Returns:
        np.ndarray of shape (len(phonemes),): each phoneme's duration.
    """
    ph_list = phoneme.split(" ")
    with open(textgrid_path, "r") as f:
        textgrid = f.readlines()
    textgrid = remove_empty_lines(textgrid)
    textgrid = TextGrid(textgrid)
    textgrid = json.loads(textgrid.toJson())

    # split[i] holds the start time of phoneme i; -1 marks "not yet set".
    split = np.ones(len(ph_list) + 1, np.float32) * -1
    tg_idx = 0
    ph_idx = 0
    tg_align = [x for x in textgrid['tiers'][-1]['items']]
    tg_align_ = []
    # Normalize intervals: map all silence labels to '' and merge
    # consecutive silence intervals into one.
    for x in tg_align:
        x['xmin'] = float(x['xmin'])
        x['xmax'] = float(x['xmax'])
        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC', '<SP>', '<AP>']:
            x['text'] = ''
            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
                tg_align_[-1]['xmax'] = x['xmax']
                continue
        tg_align_.append(x)
    tg_align = tg_align_
    # Non-silence interval count must match non-silence phoneme count.
    tg_len = len([x for x in tg_align if x['text'] != ''])
    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, textgrid_path)
    # Walk both sequences in lockstep, recording each phoneme's start time.
    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
            # Trailing silence with no interval: sentinel, clipped by the
            # final boundary assignment below.
            split[ph_idx] = 1e8
            ph_idx += 1
            continue
        x = tg_align[tg_idx]
        if x['text'] == '' and ph_idx == len(ph_list):
            # Leftover silence interval after all phonemes consumed.
            tg_idx += 1
            continue
        assert ph_idx < len(ph_list), (
            tg_len, ph_len, tg_align, ph_list, textgrid_path
        )

        ph = ph_list[ph_idx]
        if x['text'] == '' and not is_sil_phoneme(ph):
            # A silence interval must not align to a voiced phoneme.
            assert False, (ph_list, tg_align)
        if x['text'] != '' and is_sil_phoneme(ph):
            # Skip a silence phoneme with no matching interval.
            ph_idx += 1
        else:
            assert (x['text'] == '' and is_sil_phoneme(ph)) \
                or x['text'].lower() == ph.lower() \
                or x['text'].lower() == 'sil', (x['text'], ph)
            split[ph_idx] = x['xmin']
            # Backfill a preceding silence phoneme that got no interval.
            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(
                ph_list[ph_idx - 1]
            ):
                split[ph_idx - 1] = split[ph_idx]
            ph_idx += 1
            tg_idx += 1
    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
    assert ph_idx >= len(ph_list) - 1, (
        ph_idx, ph_list, len(ph_list), [x['text']
                                        for x in tg_align], textgrid_path
    )

    # Pin the outer boundaries, then take consecutive differences.
    split[0] = 0
    split[-1] = utterance_duration
    duration = np.diff(split)
    return duration
utils/general.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from typing import Union, Dict
4
+ from pathlib import Path
5
+ import os
6
+
7
+ MAX_FILE_NAME_LENGTH = 100
8
+
9
+
10
+ def read_jsonl_to_mapping(
11
+ jsonl_file: Union[str, Path],
12
+ key_col: str,
13
+ value_col: str,
14
+ base_path=None
15
+ ) -> Dict[str, str]:
16
+ """
17
+ Read two columns, indicated by `key_col` and `value_col`, from the
18
+ given jsonl file to return the mapping dict
19
+ TODO handle duplicate keys
20
+ """
21
+ mapping = {}
22
+ with open(jsonl_file, 'r') as file:
23
+ for line in file.readlines():
24
+ data = json.loads(line.strip())
25
+ key = data[key_col]
26
+ value = data[value_col]
27
+ if base_path:
28
+ value = os.path.join(base_path, value)
29
+ mapping[key] = value
30
+ return mapping
31
+
32
+
33
+ def sanitize_filename(name: str, max_len: int = MAX_FILE_NAME_LENGTH) -> str:
34
+ """
35
+ Clean and truncate a string to make it a valid and safe filename.
36
+ """
37
+ name = re.sub(r'[\\/*?:"<>|]', '_', name)
38
+ name = name.replace('/', '_')
39
+ max_len = min(len(name), max_len)
40
+ return name[:max_len]
41
+
42
+
43
def transform_gen_fn_to_id(audio_file: Path, task: str) -> str:
    """
    Recover the original sample id from a generated file's name, using the
    naming convention of the given task.
    """
    stem = audio_file.stem
    if task == "svs":
        # the id precedes the first underscore
        return stem.split("_")[0]
    if task == "sr":
        return stem
    if task in ("tta", "ttm"):
        # ids are fixed-length 11-character prefixes
        return stem[:11]
    if task == "v2a":
        # drop the trailing "_<suffix>" and restore the video extension
        return stem.rsplit("_", 1)[0] + ".mp4"
    return stem
59
+
60
+
61
def audio_dir_to_mapping(audio_dir: str | Path, task: str) -> dict:
    """
    Map recovered sample ids (via `transform_gen_fn_to_id`) to absolute
    file paths for every entry in `audio_dir`, in sorted filename order.
    """
    return {
        transform_gen_fn_to_id(audio_file, task): str(audio_file.resolve())
        for audio_file in sorted(Path(audio_dir).iterdir())
    }
utils/logging.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from dataclasses import dataclass
3
+ import logging
4
+
5
+
6
+ @dataclass
7
+ class LoggingLogger:
8
+
9
+ filename: str | Path
10
+ level: str = "INFO"
11
+
12
+ def create_instance(self, ):
13
+ filename = self.filename.__str__()
14
+ formatter = logging.Formatter("[%(asctime)s] - %(message)s")
15
+
16
+ logger = logging.getLogger(__name__ + "." + filename)
17
+ logger.setLevel(getattr(logging, self.level))
18
+
19
+ file_handler = logging.FileHandler(filename)
20
+ file_handler.setFormatter(formatter)
21
+ logger.addHandler(file_handler)
22
+
23
+ return logger
utils/lr_scheduler_utilities.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import math
3
+ import copy
4
+ from torch.utils.data import DataLoader
5
+
6
+
7
+ def get_warmup_steps(
8
+ dataloader_one_pass_outside_steps: int,
9
+ warmup_steps: int | None = None,
10
+ warmup_epochs: float | None = None,
11
+ epoch_length: int | None = None,
12
+ ) -> int:
13
+ """
14
+ Derive warmup steps according to step number or epoch number.
15
+ If `warmup_steps` is provided, then just return it. Otherwise, derive
16
+ the warmup steps by epoch length and warmup epoch number.
17
+ """
18
+ if warmup_steps is not None:
19
+ return warmup_steps
20
+ else:
21
+ if epoch_length is None:
22
+ epoch_length = dataloader_one_pass_outside_steps
23
+ assert warmup_epochs is not None, "warmup_steps and warmup_epochs cannot be both None"
24
+ return int(epoch_length * warmup_epochs)
25
+
26
+
27
def get_dataloader_one_pass_outside_steps(
    train_dataloader: DataLoader,
    num_processes: int = 1,
):
    """
    Dataloader length after DDP sharding: close to
    `original_length / gpu_number`, rounded up.
    """
    steps_per_process = math.ceil(len(train_dataloader) / num_processes)
    return steps_per_process
35
+
36
+
37
+ def get_total_training_steps(
38
+ train_dataloader: DataLoader,
39
+ epochs: int,
40
+ num_processes: int = 1,
41
+ epoch_length: int | None = None
42
+ ):
43
+ """
44
+ Calculate the total number of "visible" training steps.
45
+
46
+ If `epoch_length` is provided, it is used as the fixed length for each epoch.
47
+ Otherwise, the function will determine the epoch length from `train_dataloader`.
48
+
49
+ Args:
50
+ train_dataloader:
51
+ Training dataloader object.
52
+ epochs:
53
+ The total number of epochs to run.
54
+ num_processes:
55
+ The number of parallel processes used for distributed training.
56
+ epoch_length:
57
+ A fixed number of training steps for each epoch. Defaults to None.
58
+
59
+ Returns:
60
+ int: The total number of training steps (i.e., `epochs * epoch_length`).
61
+ """
62
+ # `epoch_length` is not None: fixed length for each epoch
63
+ if epoch_length is None:
64
+ # `epoch_length` is the length of DDP-wrapped `train_dataloader`
65
+ epoch_length = get_dataloader_one_pass_outside_steps(
66
+ train_dataloader, num_processes
67
+ )
68
+ return epochs * epoch_length
69
+
70
+
71
def get_dataloader_one_pass_steps_inside_accelerator(
    dataloader_one_pass_steps: int, gradient_accumulation_steps: int,
    num_processes: int
):
    """
    Number of "visible" training steps for a single dataloader pass inside
    an accelerator, accounting for gradient accumulation and distributed
    training.

    Args:
        dataloader_one_pass_steps:
            The number of steps (batches) in one pass over the dataset.
        gradient_accumulation_steps:
            Steps to accumulate gradients before a parameter update.
        num_processes:
            The number of parallel processes used for distributed training.

    Returns:
        int: update count for one dataset pass, multiplied by the number
        of processes.
    """
    optimizer_updates = math.ceil(
        dataloader_one_pass_steps / gradient_accumulation_steps
    )
    return optimizer_updates * num_processes
95
+
96
+
97
def get_steps_inside_accelerator_from_outside_steps(
    outside_steps: int, dataloader_one_pass_outside_steps: int,
    dataloader_one_pass_steps_inside_accelerator: int,
    gradient_accumulation_steps: int, num_processes: int
):
    """
    Convert "outside" steps (as seen by e.g. a wandb logger) into the
    corresponding number of "inside" steps (`lr_scheduler.step()` calls
    made by accelerate).

    accelerate's scheduler wrapper calls `step()` `num_processes` times for
    every `gradient_accumulation_steps` outside steps:
    https://github.com/huggingface/accelerate/blob/main/src/accelerate/scheduler.py#L76

    Args:
        outside_steps:
            Total steps counted outside the accelerate context.
        dataloader_one_pass_outside_steps:
            Outside steps needed for one pass over the dataloader.
        dataloader_one_pass_steps_inside_accelerator:
            Inside `step()` calls per dataloader pass, from
            `get_dataloader_one_pass_steps_inside_accelerator`.
        gradient_accumulation_steps:
            Steps to accumulate gradients.
        num_processes:
            Number of parallel processes (GPUs).

    Returns:
        int: total inside `lr_scheduler.step()` calls corresponding to
        `outside_steps`.
    """
    full_passes, leftover_outside = divmod(
        outside_steps, dataloader_one_pass_outside_steps
    )
    leftover_inside = (
        leftover_outside // gradient_accumulation_steps * num_processes
    )
    return (
        full_passes * dataloader_one_pass_steps_inside_accelerator +
        leftover_inside
    )
141
+
142
+
143
def lr_scheduler_param_adapter(
    config_dict: dict[str, Any], num_training_steps: int, num_warmup_steps: int
) -> dict[str, Any]:
    """
    Deep-copy a hydra-style scheduler config and, when it targets
    `transformers.get_scheduler`, inject the training/warmup step counts
    it requires. The input dict is never mutated.
    """
    adapted = copy.deepcopy(config_dict)
    if adapted["_target_"] == "transformers.get_scheduler":
        adapted["num_training_steps"] = num_training_steps
        adapted["num_warmup_steps"] = num_warmup_steps
    return adapted
utils/torch_utilities.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import math
3
+ from typing import Callable
4
+ from pathlib import Path
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+
9
# Use getLogger so this module's logger participates in the standard logging
# hierarchy (level/handler configuration done by the application propagates);
# instantiating logging.Logger directly creates a detached, unmanaged logger.
logger = logging.getLogger(__name__)
10
+
11
+
12
def remove_key_prefix_factory(prefix: str = "module."):
    """Build a state-dict processing function that strips `prefix` from keys.

    The returned callable matches the `state_dict_process_fn` signature used by
    `load_pretrained_model`: it takes `(model_dict, state_dict)` and returns a
    new state dict. Keys that do not start with `prefix` are dropped entirely.
    """

    def func(
        model_dict: dict[str, torch.Tensor],
        state_dict: dict[str, torch.Tensor],
    ) -> dict[str, torch.Tensor]:
        offset = len(prefix)
        return {
            key[offset:]: tensor
            for key, tensor in state_dict.items()
            if key.startswith(prefix)
        }

    return func
25
+
26
+
27
def merge_matched_keys(
    model_dict: dict[str, torch.Tensor], state_dict: dict[str, torch.Tensor]
) -> dict[str, torch.Tensor]:
    """Merge pretrained parameters into a model state dict where they fit.

    Args:
        model_dict:
            The state dict of the current model that will receive parameters.
        state_dict:
            Parameters from a pre-trained model.

    Returns:
        dict[str, torch.Tensor]:
            `model_dict` updated in place: entries whose key exists in both
            dicts with an identical shape are overwritten with the pretrained
            value; everything else is left untouched.
    """
    matched = {}
    mismatched_keys = []
    for name, tensor in state_dict.items():
        is_match = name in model_dict and model_dict[name].shape == tensor.shape
        if is_match:
            matched[name] = tensor
        else:
            mismatched_keys.append(name)
    # Keep a record of what was skipped so silent shape drift is visible.
    logger.info(
        f"Loading pre-trained model, with mismatched keys {mismatched_keys}"
    )
    model_dict.update(matched)
    return model_dict
54
+
55
+
56
def load_pretrained_model(
    model: nn.Module,
    ckpt_or_state_dict: str | Path | dict[str, torch.Tensor],
    state_dict_process_fn: Callable = merge_matched_keys
) -> None:
    """Load pretrained weights into `model`, tolerating partial matches.

    Args:
        model: The model to receive the parameters.
        ckpt_or_state_dict: Either an in-memory state dict or a checkpoint
            path to load (always onto CPU).
        state_dict_process_fn: Hook that reconciles the checkpoint's state
            dict with the model's own; defaults to keeping only key/shape
            matches via `merge_matched_keys`.
    """
    if isinstance(ckpt_or_state_dict, dict):
        state_dict = ckpt_or_state_dict
    else:
        state_dict = torch.load(ckpt_or_state_dict, "cpu")

    processed = state_dict_process_fn(model.state_dict(), state_dict)
    # strict=False: the processing hook may legitimately drop keys.
    model.load_state_dict(processed, strict=False)
68
+
69
+
70
+ def create_mask_from_length(
71
+ lengths: torch.Tensor, max_length: int | None = None
72
+ ):
73
+ if max_length is None:
74
+ max_length = max(lengths)
75
+ idxs = torch.arange(max_length).reshape(1, -1) # (1, max_length)
76
+ mask = idxs.to(lengths.device) < lengths.view(-1, 1)
77
+ # (1, max_length) < (batch_size, 1) -> (batch_size, max_length)
78
+ return mask
79
+
80
+
81
def loss_with_mask(
    loss: torch.Tensor,
    mask: torch.Tensor,
    reduce: bool = True
) -> torch.Tensor:
    """
    Average a loss tensor over the valid positions given by `mask`.

    Args:
        loss: Tensor of shape (b, t, ...) with per-element loss values.
        mask: Tensor of shape (b, t); 1 marks valid positions, 0 masked ones.
        reduce: If True, return a scalar mean over the batch; otherwise a
            per-sample tensor of shape (b,).

    Returns:
        torch.Tensor: Scalar if `reduce` is True, else shape (b,).
    """
    # Append singleton dims so the (b, t) mask broadcasts over trailing dims.
    full_mask = mask[(..., ) + (None, ) * (loss.ndim - mask.ndim)]
    full_mask = full_mask.expand_as(loss)

    non_batch_dims = tuple(range(1, loss.ndim))
    per_sample = (loss * full_mask).sum(dim=non_batch_dims) \
        / full_mask.sum(dim=non_batch_dims)

    return per_sample.mean() if reduce else per_sample
110
+
111
+
112
def convert_pad_shape(pad_shape: list[list[int]]):
    """Flatten a per-dimension pad spec into the reversed flat list that
    `torch.nn.functional.pad` expects (last dimension's pair first)."""
    flattened = []
    for pair in reversed(pad_shape):
        flattened.extend(pair)
    return flattened
116
+
117
+
118
def create_alignment_path(duration: torch.Tensor, mask: torch.Tensor):
    """Expand per-token durations into a hard monotonic alignment path.

    Args:
        duration: (b, t_x) integer durations, one per source token.
        mask: (b, t_x, t_y) mask applied to the resulting path.

    Returns:
        torch.Tensor: (b, t_x, t_y) binary path where entry (i, j) is 1 iff
        target frame j falls inside source token i's duration span.
    """
    # NOTE(review): removed unused local `device = duration.device`.
    b, t_x, t_y = mask.shape
    # Token i covers frames [cum_duration[i] - duration[i], cum_duration[i]).
    cum_duration = torch.cumsum(duration, 1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = create_mask_from_length(cum_duration_flat, t_y).float()
    path = path.view(b, t_x, t_y)
    # Differencing along the t_x axis turns cumulative coverage masks into
    # disjoint per-token spans.
    path = path - torch.nn.functional.pad(
        path, convert_pad_shape([[0, 0], [1, 0], [0, 0]])
    )[:, :-1]
    path = path * mask
    return path
133
+
134
+
135
def trim_or_pad_length(x: torch.Tensor, target_length: int, length_dim: int):
    """
    Adjusts the size of the specified dimension of tensor x to match `target_length`.

    Args:
        x:
            Input tensor.
        target_length:
            Desired size of the specified dimension.
        length_dim:
            The dimension to modify.

    Returns:
        torch.Tensor: The adjusted tensor (truncated, zero-padded on the
        right, or returned unchanged when already the right size).
    """
    current_length = x.shape[length_dim]

    if current_length > target_length:
        # Keep only the leading `target_length` entries along `length_dim`.
        return x.narrow(length_dim, 0, target_length)

    if current_length < target_length:
        # Right-pad with zeros matching x's dtype and device.
        pad_shape = list(x.shape)
        pad_shape[length_dim] = target_length - current_length
        return torch.cat([x, x.new_zeros(pad_shape)], dim=length_dim)

    return x
169
+
170
+
171
def concat_non_padding(
    seq1: torch.Tensor, mask1: torch.BoolTensor, seq2: torch.Tensor,
    mask2: torch.BoolTensor
):
    """
    Concatenate two padded sequences, packing valid tokens to the left.

    Args
        seq1 : Tensor (B, L1, E)
            First sequence.
        mask1 : BoolTensor (B, L1)
            True for valid tokens in seq1, False for padding.
        seq2 : Tensor (B, L2, E)
            Second sequence.
        mask2 : BoolTensor (B, L2)
            True for valid tokens in seq2, False for padding.

    Returns
        concat_seq : Tensor (B, L1+L2, E)
            Valid tokens from both sequences, left-aligned; the right-hand
            padding region is zeroed.
        concat_mask: BoolTensor (B, L1+L2)
            Mask for the concatenated sequence.
        perm : LongTensor (B, L1+L2)
            Permutation mapping new positions to original indices; pass it to
            `restore_from_concat` to recover the original layout.
    """
    mask1, mask2 = mask1.bool(), mask2.bool()
    batch, len1, dim = seq1.shape
    total_len = len1 + seq2.size(1)

    tokens = torch.cat([seq1, seq2], dim=1)       # (B, L, E)
    validity = torch.cat([mask1, mask2], dim=1)   # (B, L)

    # Stable sort: each padding slot is penalized by +L, so it sorts after
    # every valid token while valid tokens keep their relative order.
    # https://github.com/huggingface/accelerate is unrelated; see argsort docs.
    base = torch.arange(total_len, device=tokens.device).unsqueeze(0)  # (1, L)
    scores = base + (~validity) * total_len
    perm = scores.argsort(dim=1, stable=True)     # (B, L)

    # Reorder tokens and mask according to the permutation.
    gather_idx = perm.unsqueeze(-1).expand(-1, -1, dim)  # (B, L, E)
    concat_seq = tokens.gather(1, gather_idx)
    concat_mask = validity.gather(1, perm)

    # Explicitly zero the trailing padding region for safety.
    concat_seq = concat_seq * concat_mask.unsqueeze(-1)

    return concat_seq, concat_mask, perm
219
+
220
+
221
def restore_from_concat(
    concat_seq: torch.Tensor, mask1: torch.BoolTensor, mask2: torch.BoolTensor,
    perm: torch.LongTensor
):
    """
    Recover (seq1, seq2) from a sequence packed by `concat_non_padding`.

    `perm` is the permutation returned by `concat_non_padding`; its inverse is
    built with a scatter so the whole restore is vectorised.
    """
    mask1, mask2 = mask1.bool(), mask2.bool()
    batch, len1 = mask1.shape
    len2 = mask2.size(1)
    dim = concat_seq.size(-1)

    # perm maps new_idx -> old_idx; scatter builds inv_perm: old_idx -> new_idx.
    positions = torch.arange(len1 + len2, device=perm.device)
    inv_perm = torch.empty_like(perm)
    inv_perm.scatter_(1, perm, positions.unsqueeze(0).expand(batch, -1))

    # Pull each token back to its original slot.
    gather_idx = inv_perm.unsqueeze(-1).expand(-1, -1, dim)
    restored = concat_seq.gather(1, gather_idx)  # (B, L1+L2, E)

    # Split into the two sequences and zero out padding positions.
    seq1_restore, seq2_restore = restored.split([len1, len2], dim=1)
    seq1_restore = seq1_restore * mask1.unsqueeze(-1)
    seq2_restore = seq2_restore * mask2.unsqueeze(-1)

    return seq1_restore, seq2_restore
252
+
253
+
254
def contains_nan(data):
    """Recursively check whether `data` contains any NaN value.

    Handles torch tensors, numpy arrays, floats, and arbitrarily nested
    lists/tuples/dicts; any other type is assumed NaN-free.
    """
    if isinstance(data, torch.Tensor):
        return torch.isnan(data).any().item()
    if isinstance(data, np.ndarray):
        return np.isnan(data).any()
    if isinstance(data, float):
        return math.isnan(data)
    if isinstance(data, (list, tuple)):
        return any(contains_nan(item) for item in data)
    if isinstance(data, dict):
        return any(contains_nan(value) for value in data.values())
    return False
267
+
268
+
269
def check_nan_in_batch(batch):
    """Check a collated batch for NaNs and return the offending audio ids.

    Args:
        batch: dict mapping field names to batched (indexable) values; must
            contain an "audio_id" entry listing per-sample identifiers.
            # NOTE(review): each non-"audio_id" value is assumed indexable by
            # sample position — confirm against the collate function.

    Returns:
        list: audio ids of samples whose fields contain at least one NaN.
    """
    # isinstance instead of `type(...) ==` (idiomatic type check).
    assert isinstance(batch, dict), "batch type error"
    nan_audio_ids = []
    # Check each sample directly instead of building an intermediate dict
    # keyed by audio_id, which silently collapsed duplicate ids.
    for idx, audio_id in enumerate(batch["audio_id"]):
        content = [v[idx] for k, v in batch.items() if k != "audio_id"]
        if contains_nan(content):
            nan_audio_ids.append(audio_id)
            print(f"{audio_id} contains NaN")
    return nan_audio_ids
288
+