Spaces:
Running
Running
Dalzymodderever committed on
Commit ·
2cba492
1
Parent(s): a680a6c
Initial Commit
Browse files- app.py +119 -0
- requirements.txt +10 -0
- src/kanade_tokenizer/__init__.py +11 -0
- src/kanade_tokenizer/data/datamodule.py +146 -0
- src/kanade_tokenizer/data/dataset.py +201 -0
- src/kanade_tokenizer/model.py +500 -0
- src/kanade_tokenizer/module/adaln_zero.py +68 -0
- src/kanade_tokenizer/module/audio_feature.py +105 -0
- src/kanade_tokenizer/module/convnext.py +125 -0
- src/kanade_tokenizer/module/discriminator.py +78 -0
- src/kanade_tokenizer/module/fsq.py +140 -0
- src/kanade_tokenizer/module/global_encoder.py +75 -0
- src/kanade_tokenizer/module/hift.py +685 -0
- src/kanade_tokenizer/module/postnet.py +71 -0
- src/kanade_tokenizer/module/ssl_extractor.py +106 -0
- src/kanade_tokenizer/module/transformer.py +549 -0
- src/kanade_tokenizer/pipeline.py +760 -0
- src/kanade_tokenizer/util.py +106 -0
app.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys
import os
import time
import torch
import gradio as gr

# --- 1. PATH SETUP ---
# Make the bundled `src/` package importable without installation.
current_dir = os.path.dirname(os.path.abspath(__file__))
src_path = os.path.join(current_dir, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

# --- 2. Imports ---
try:
    from kanade_tokenizer.model import KanadeModel
    from kanade_tokenizer.util import load_vocoder, vocode, load_audio
except ImportError as e:
    # Surface the failure clearly in the Space logs before aborting startup.
    print(f"❌ IMPORT ERROR: {e}")
    raise e

# --- Configuration ---
KANADE_REPO = "frothywater/kanade-25hz-clean"  # HF Hub repo id of the pretrained model
KANADE_VOCODER = "hift"  # vocoder variant used to turn mels back into audio
DEVICE = "cpu"  # Spaces CPU tier; no GPU assumed
SAMPLE_RATE = 24000  # model's native sample rate (Hz)
MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds

print(f"🚀 Initializing on {DEVICE}...")

# --- 3. Load Models ---
# Both models are loaded once at import time and reused for every request.
print(f"📥 Loading Kanade...")
kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()

print(f"🔊 Loading HiFT Vocoder...")
kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()

print("✅ Models Loaded.")
# --- Core Inference ---
def run_inference(source_wav, ref_wav):
    """Convert source speech to the reference voice and vocode it to a waveform."""
    with torch.inference_mode():
        mel = kanade_model.voice_conversion(source_wav, ref_wav)
        return vocode(kanade_vocoder, mel.unsqueeze(0))
# --- Main Handler ---
def voice_conversion(source_path, reference_path):
    """Gradio handler: re-speak the source clip in the reference speaker's voice.

    Returns ((sample_rate, ndarray), status_message); the audio slot is None
    when an input is missing or inference fails.
    """
    if not source_path or not reference_path:
        return None, "⚠️ Please provide both source and reference audio."

    try:
        # Load both clips at the model's native sample rate.
        source_wav = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
        ref_wav = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)

        # Enforce the 30-second cap; slicing past the end leaves short clips intact.
        max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
        source_wav = source_wav[..., :max_samples]
        ref_wav = ref_wav[..., :max_samples]

        # Time the conversion for the RTF report below.
        start = time.time()
        final_wav = run_inference(source_wav, ref_wav)
        proc_time = time.time() - start

        output_np = final_wav.squeeze().cpu().float().numpy()
        output_duration = len(output_np) / SAMPLE_RATE

        # RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
        rtf = proc_time / output_duration if output_duration > 0 else 0

        return (SAMPLE_RATE, output_np), f"✅ {proc_time:.2f}s to convert {output_duration:.1f}s of audio | RTF: {rtf:.2f}x"

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# --- Gradio Interface ---
# UI layout: left column takes the two input clips, right column shows the result.
with gr.Blocks(title="Kanade Voice Cloning") as demo:
    gr.Markdown("""
    # 🗣️ Kanade Voice Cloning
    **Model:** `frothywater/kanade-25hz-clean`

    Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).

    ⏱️ **Limit:** Audio is trimmed to 30 seconds max.
    """)

    with gr.Row():
        with gr.Column():
            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
            convert_btn = gr.Button("🎤 Convert Voice", variant="primary")

        with gr.Column():
            output_audio = gr.Audio(label="Result")
            status_text = gr.Textbox(label="Status", interactive=False)

    # Wire the button to the conversion handler defined above.
    convert_btn.click(
        voice_conversion,
        inputs=[source_audio, reference_audio],
        outputs=[output_audio, status_text]
    )

    gr.Markdown("""
    ---
    **Tips:**
    - For best results, use clean reference audio (3-10 seconds of clear speech)
    - Source and reference should ideally be similar in speaking pace
    """)

if __name__ == "__main__":
    demo.launch()
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
huggingface_hub
|
| 2 |
+
jsonargparse[signatures]
|
| 3 |
+
numpy
|
| 4 |
+
safetensors
|
| 5 |
+
soundfile
|
| 6 |
+
torch
|
| 7 |
+
torchaudio
|
| 8 |
+
tqdm
|
| 9 |
+
vocos
|
| 10 |
+
gradio
|
src/kanade_tokenizer/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import KanadeFeatures, KanadeModel, KanadeModelConfig
|
| 2 |
+
from .util import load_audio, load_vocoder, vocode
|
| 3 |
+
|
| 4 |
+
__all__ = [
|
| 5 |
+
"KanadeModel",
|
| 6 |
+
"KanadeModelConfig",
|
| 7 |
+
"KanadeFeatures",
|
| 8 |
+
"load_audio",
|
| 9 |
+
"load_vocoder",
|
| 10 |
+
"vocode",
|
| 11 |
+
]
|
src/kanade_tokenizer/data/datamodule.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
import lightning as L
|
| 5 |
+
import torch
|
| 6 |
+
from torch.utils.data import DataLoader, Dataset
|
| 7 |
+
|
| 8 |
+
from ..util import get_logger
|
| 9 |
+
from .dataset import AudioItem, ChunkedAudioDataset, pad_audio
|
| 10 |
+
|
| 11 |
+
logger = get_logger()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class AudioBatch:
    """A collated batch of audio examples, padded to a common length."""

    waveform: torch.Tensor  # [batch, channels, samples]
    audio_ids: list[str]  # one manifest id per example
    paths: list[Path]  # source file path per example
    sample_rates: list[int]  # sample rate per example (after any resampling)
    frame_offsets: list[int] | None  # For chunked audio
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class AudioDataConfig:
    """Configuration for one dataset split and its DataLoader."""

    csv_path: str  # CSV manifest with columns: audio_id, path, length, sample_rate
    audio_root: str  # root directory prepended to the relative paths in the CSV

    # Audio processing
    sample_rate: int | None = 16000  # resample target; None keeps the original rate
    mono: bool = True  # downmix multi-channel audio to one channel
    normalize: bool = True  # peak-normalize each clip

    # Chunking options
    chunk_size: int | None = None  # chunk length in frames; None disables chunking
    chunk_hop_size: int | None = None  # hop between chunks; None means chunk_size (no overlap)

    # DataLoader options
    batch_size: int = 32
    num_workers: int = 4
    pin_memory: bool = False
    persistent_workers: bool = False
    shuffle: bool = False
    drop_last: bool = False
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def audio_collate_fn(batch: list[AudioItem]) -> AudioBatch:
    """Collate AudioItems into an AudioBatch, zero-padding clips to the longest one."""
    waves = [item.waveform for item in batch]

    # Only pay the padding cost when lengths actually differ.
    target_len = max(w.shape[1] for w in waves)
    if any(w.shape[1] != target_len for w in waves):
        waves = [pad_audio(w, target_len) for w in waves]

    ids, paths, rates, offsets = [], [], [], []
    for item in batch:
        ids.append(item.audio_id)
        paths.append(item.path)
        rates.append(item.sample_rate)
        offsets.append(item.frame_offset)

    return AudioBatch(
        waveform=torch.stack(waves),
        audio_ids=ids,
        paths=paths,
        sample_rates=rates,
        frame_offsets=offsets,
    )
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class AudioDataModule(L.LightningDataModule):
    """Lightning data module building ChunkedAudioDatasets per stage.

    The val config falls back to the train config, and the test config falls
    back to the val config, when not provided.
    """

    def __init__(
        self,
        train_config: AudioDataConfig,
        val_config: AudioDataConfig | None = None,
        test_config: AudioDataConfig | None = None,
    ):
        super().__init__()
        self.train_config = train_config
        self.val_config = val_config or train_config
        self.test_config = test_config or self.val_config

        # Set to be initialized in setup()
        self.train_dataset: Dataset | None = None
        self.val_dataset: Dataset | None = None
        self.test_dataset: Dataset | None = None

    def _create_dataset(self, config: AudioDataConfig) -> Dataset:
        """Build a ChunkedAudioDataset from a split config."""
        return ChunkedAudioDataset(
            csv_path=config.csv_path,
            audio_root=config.audio_root,
            chunk_size=config.chunk_size,
            hop_size=config.chunk_hop_size,
            mono=config.mono,
            normalize=config.normalize,
            target_sample_rate=config.sample_rate,
        )

    def _create_dataloader(
        self, dataset: Dataset, config: AudioDataConfig, *, shuffle: bool, drop_last: bool
    ) -> DataLoader:
        """Shared DataLoader factory used by all *_dataloader hooks.

        persistent_workers is only honored when workers exist, as required by
        torch's DataLoader.
        """
        return DataLoader(
            dataset,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
            pin_memory=config.pin_memory,
            persistent_workers=config.persistent_workers if config.num_workers > 0 else False,
            shuffle=shuffle,
            drop_last=drop_last,
            collate_fn=audio_collate_fn,
        )

    def setup(self, stage: str | None = None):
        """Instantiate the datasets needed for the given Lightning stage."""
        if stage == "fit" or stage is None:
            self.train_dataset = self._create_dataset(self.train_config)
            self.val_dataset = self._create_dataset(self.val_config)
        elif stage == "validate":
            self.val_dataset = self._create_dataset(self.val_config)
        elif stage == "test" or stage == "predict":
            self.test_dataset = self._create_dataset(self.test_config)

    def train_dataloader(self) -> DataLoader:
        return self._create_dataloader(
            self.train_dataset,
            self.train_config,
            shuffle=self.train_config.shuffle,
            drop_last=self.train_config.drop_last,
        )

    def val_dataloader(self) -> DataLoader:
        return self._create_dataloader(self.val_dataset, self.val_config, shuffle=False, drop_last=False)

    def test_dataloader(self) -> DataLoader:
        return self._create_dataloader(self.test_dataset, self.test_config, shuffle=False, drop_last=False)

    def predict_dataloader(self) -> DataLoader:
        # Prediction reuses the test dataset and config.
        return self._create_dataloader(self.test_dataset, self.test_config, shuffle=False, drop_last=False)
src/kanade_tokenizer/data/dataset.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torchaudio
|
| 7 |
+
from torch.utils.data import Dataset
|
| 8 |
+
|
| 9 |
+
from ..util import _load_audio_internal, get_logger
|
| 10 |
+
|
| 11 |
+
logger = get_logger()
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass
class AudioItem:
    """A single audio example (optionally one chunk of a longer file)."""

    waveform: torch.Tensor  # (channels, samples)
    audio_id: str  # manifest id of the source file
    path: Path  # resolved path on disk
    sample_rate: int  # sample rate of `waveform` after any resampling
    frame_offset: int | None = None  # For chunked audio
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def convert_to_mono(waveform: torch.Tensor) -> torch.Tensor:
    """Average multi-channel audio down to a single (1, samples) channel.

    Mono input is returned unchanged.
    """
    if waveform.shape[0] <= 1:
        return waveform
    return waveform.mean(dim=0, keepdim=True)
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def resample_audio(waveform: torch.Tensor, orig_freq: int, new_freq: int) -> torch.Tensor:
    """Resample `waveform` from `orig_freq` to `new_freq` Hz (no-op when equal)."""
    if orig_freq == new_freq:
        return waveform
    resample = torchaudio.transforms.Resample(orig_freq=orig_freq, new_freq=new_freq)
    return resample(waveform)
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def normalize_audio(waveform: torch.Tensor) -> torch.Tensor:
    """Peak-normalize so the maximum absolute sample is ~1.

    The small epsilon keeps the division safe on all-zero (silent) input.
    """
    peak = waveform.abs().max() + 1e-8
    return waveform / peak
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def preprocess_audio(
    waveform: torch.Tensor, sample_rate: int, mono: bool, normalize: bool, target_sample_rate: int | None = None
) -> tuple[torch.Tensor, int]:
    """Apply the standard loading pipeline: downmix, resample, peak-normalize.

    Returns the processed waveform together with its (possibly updated)
    sample rate.
    """
    if mono:
        waveform = convert_to_mono(waveform)

    needs_resample = target_sample_rate is not None and sample_rate != target_sample_rate
    if needs_resample:
        waveform = resample_audio(waveform, sample_rate, target_sample_rate)
        sample_rate = target_sample_rate

    if normalize:
        waveform = normalize_audio(waveform)

    return waveform, sample_rate
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def pad_audio(waveform: torch.Tensor, target_length: int) -> torch.Tensor:
    """Right-pad `waveform` with zeros along time to `target_length` samples.

    Input that is already at least `target_length` long is returned unchanged
    (this function never truncates).
    """
    deficit = target_length - waveform.shape[1]
    if deficit <= 0:
        return waveform
    # F.pad with a (left, right) spec pads only the last dimension with zeros.
    return torch.nn.functional.pad(waveform, (0, deficit))
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@dataclass
class ChunkInfo:
    """Location of one chunk within a source audio file."""

    audio_id: str  # manifest id of the file this chunk comes from
    frame_offset: int  # In target sample rate
    num_frames: int  # In target sample rate
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class ChunkedAudioDataset(Dataset):
    """
    Dataset that loads audio from CSV with optional chunking.

    Args:
        csv_path: Path to the CSV file with columns: audio_id, path, length, sample_rate
        audio_root: Root directory for audio files (prepended to paths in CSV)
        chunk_size: Size of each chunk in frames (None = no chunking)
        hop_size: Hop size between chunks in frames (None = use chunk_size)
        mono: Convert to mono if True
        normalize: Normalize audio if True
        target_sample_rate: Resample to this sample rate if provided
    """

    def __init__(
        self,
        csv_path: str,
        audio_root: str,
        chunk_size: int | None = None,
        hop_size: int | None = None,
        mono: bool = True,
        normalize: bool = True,
        target_sample_rate: int | None = None,
    ):
        self.csv_path = csv_path
        self.audio_root = audio_root
        self.chunk_size = chunk_size
        # Non-overlapping chunks by default (hop equals chunk size).
        self.hop_size = hop_size if hop_size is not None else chunk_size
        self.mono = mono
        self.normalize = normalize
        self.target_sample_rate = target_sample_rate

        # Load CSV and compute chunks
        self.file_entries = self._load_csv()
        self.chunks = self._compute_chunks()

        logger.info(f"Loaded dataset from {csv_path}: {len(self.file_entries)} files, {len(self.chunks)} chunks")

    def _load_csv(self) -> dict[str, dict]:
        """Load audio metadata from CSV."""
        entries = {}
        with open(self.csv_path, "r", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                entries[row["audio_id"]] = {
                    "path": row["path"],
                    "length": int(row["length"]),
                    "sample_rate": int(row["sample_rate"]),
                }
        return entries

    def _compute_chunks(self) -> list[ChunkInfo]:
        """Compute all chunks from the file entries."""
        chunks = []
        for audio_id, entry in self.file_entries.items():
            length = entry["length"]
            sample_rate = entry["sample_rate"]

            # Adjust length if resampling to target sample rate
            # (chunk bookkeeping is done in target-rate frame units).
            if self.target_sample_rate is not None and sample_rate != self.target_sample_rate:
                length = int(length * self.target_sample_rate / sample_rate)
                sample_rate = self.target_sample_rate

            if self.chunk_size is None or length <= self.chunk_size:
                # No chunking, or file is shorter than chunk size: use entire file
                chunks.append(ChunkInfo(audio_id=audio_id, frame_offset=0, num_frames=length))
            else:
                # Chunking: compute all chunks with last chunk aligned to end
                frame_offset = 0
                while frame_offset + self.chunk_size <= length:
                    chunks.append(ChunkInfo(audio_id=audio_id, frame_offset=frame_offset, num_frames=self.chunk_size))
                    frame_offset += self.hop_size

                # Add the last chunk aligned to the end
                # (only when the tail is not already covered by the previous chunk).
                last_start = length - self.chunk_size
                if last_start > frame_offset - self.hop_size:
                    chunks.append(ChunkInfo(audio_id=audio_id, frame_offset=last_start, num_frames=self.chunk_size))

        return chunks

    def __len__(self) -> int:
        return len(self.chunks)

    def __getitem__(self, idx: int) -> AudioItem:
        """Load and return a single audio chunk."""
        chunk = self.chunks[idx]
        entry = self.file_entries[chunk.audio_id]
        orig_sample_rate = entry["sample_rate"]
        full_path = Path(self.audio_root) / entry["path"]

        # Calculate start frame and num frames in original sample rate
        # (chunk offsets were computed in target-rate units in _compute_chunks).
        if self.target_sample_rate is not None and orig_sample_rate != self.target_sample_rate:
            orig_frame_offset = int(chunk.frame_offset * orig_sample_rate / self.target_sample_rate)
            orig_num_frames = int(chunk.num_frames * orig_sample_rate / self.target_sample_rate)
        else:
            orig_frame_offset = chunk.frame_offset
            orig_num_frames = chunk.num_frames

        # Read only the required slice from disk, then downmix/resample/normalize.
        waveform, sample_rate = _load_audio_internal(
            full_path, frame_offset=orig_frame_offset, num_frames=orig_num_frames
        )

        waveform, sample_rate = preprocess_audio(
            waveform=waveform,
            sample_rate=sample_rate,
            mono=self.mono,
            normalize=self.normalize,
            target_sample_rate=self.target_sample_rate,
        )

        # Pad if necessary (in case file is shorter than expected)
        if self.chunk_size is not None and waveform.shape[1] < self.chunk_size:
            waveform = pad_audio(waveform, self.chunk_size)

        return AudioItem(
            waveform=waveform,
            audio_id=chunk.audio_id,
            path=full_path,
            sample_rate=sample_rate,
            frame_offset=chunk.frame_offset,
        )
|
src/kanade_tokenizer/model.py
ADDED
|
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
from typing import Literal
|
| 4 |
+
|
| 5 |
+
import jsonargparse
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
|
| 10 |
+
from .module.fsq import FiniteScalarQuantizer
|
| 11 |
+
from .module.global_encoder import GlobalEncoder
|
| 12 |
+
from .module.postnet import PostNet
|
| 13 |
+
from .module.ssl_extractor import SSLFeatureExtractor
|
| 14 |
+
from .module.transformer import Transformer
|
| 15 |
+
from .util import freeze_modules, get_logger
|
| 16 |
+
|
| 17 |
+
logger = get_logger()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class KanadeModelConfig:
    """Hyperparameters for KanadeModel: SSL layer selection, sampling factors, mel and vocoder settings."""

    # SSL Feature settings
    local_ssl_layers: tuple[int, ...] = (6, 9)  # Indices of SSL layers for local branch
    global_ssl_layers: tuple[int, ...] = (1, 2)  # Indices of SSL layers for global branch
    normalize_ssl_features: bool = True  # Whether to normalize local SSL features before encoding

    # Down/up-sampling settings
    downsample_factor: int = 2  # Temporal downsampling factor for local features
    mel_upsample_factor: int = 4  # Conv1DTranspose upsampling factor for mel features before interpolation
    use_conv_downsample: bool = True  # Whether to use Conv1D for downsampling instead of average pooling
    local_interpolation_mode: str = "linear"  # Interpolation mode for local upsampling ("linear", "nearest")
    mel_interpolation_mode: str = "linear"  # Interpolation mode for mel upsampling ("linear", "nearest")

    # Mel spectrogram settings
    sample_rate: int = 24000
    n_fft: int = 1024
    hop_length: int = 256
    n_mels: int = 100
    padding: str = "center"
    mel_fmin: int = 0  # Minimum frequency for mel spectrograms
    mel_fmax: int | None = None  # Maximum frequency for mel spectrograms
    bigvgan_style_mel: bool = False  # Whether to use BigVGAN-style mel spectrograms

    # Vocoder settings
    vocoder_name: Literal["vocos", "hift"] = "vocos"  # Vocoder to use for waveform synthesis
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@dataclass
class KanadeFeatures:
    """Extracted Kanade representations; any field may be absent (None)."""

    content_embedding: torch.Tensor | None = None  # (seq_len, dim)
    content_token_indices: torch.Tensor | None = None  # (seq_len,)
    global_embedding: torch.Tensor | None = None  # (dim,)
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class KanadeModel(nn.Module):
|
| 56 |
+
"""Model architecture and forward pass logic for Kanade tokenizer."""
|
| 57 |
+
|
| 58 |
+
def __init__(
|
| 59 |
+
self,
|
| 60 |
+
config: KanadeModelConfig,
|
| 61 |
+
ssl_feature_extractor: SSLFeatureExtractor,
|
| 62 |
+
local_encoder: Transformer,
|
| 63 |
+
local_quantizer: FiniteScalarQuantizer,
|
| 64 |
+
global_encoder: GlobalEncoder,
|
| 65 |
+
mel_prenet: Transformer,
|
| 66 |
+
mel_decoder: Transformer,
|
| 67 |
+
mel_postnet: PostNet,
|
| 68 |
+
feature_decoder: Transformer | None = None,
|
| 69 |
+
):
|
| 70 |
+
super().__init__()
|
| 71 |
+
self.config = config
|
| 72 |
+
self._init_ssl_extractor(config, ssl_feature_extractor)
|
| 73 |
+
self._init_local_branch(config, local_encoder, local_quantizer, feature_decoder)
|
| 74 |
+
self._init_global_branch(global_encoder)
|
| 75 |
+
self._init_mel_decoder(config, mel_prenet, mel_decoder, mel_postnet)
|
| 76 |
+
|
| 77 |
+
def _init_ssl_extractor(self, config: KanadeModelConfig, ssl_feature_extractor: SSLFeatureExtractor):
|
| 78 |
+
"""Initialize and configure SSL feature extractor."""
|
| 79 |
+
self.ssl_feature_extractor = ssl_feature_extractor
|
| 80 |
+
freeze_modules([self.ssl_feature_extractor])
|
| 81 |
+
logger.debug(
|
| 82 |
+
f"SSL feature extractor initialized and frozen, feature dim: {self.ssl_feature_extractor.feature_dim}"
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
# Configure local SSL layers
|
| 86 |
+
self.local_ssl_layers = list(config.local_ssl_layers)
|
| 87 |
+
if len(self.local_ssl_layers) > 1:
|
| 88 |
+
logger.debug(
|
| 89 |
+
f"Using average of {len(self.local_ssl_layers)} SSL layers for local branch: {self.local_ssl_layers}"
|
| 90 |
+
)
|
| 91 |
+
else:
|
| 92 |
+
logger.debug(f"Using single SSL layer {self.local_ssl_layers[0]} for local branch")
|
| 93 |
+
|
| 94 |
+
if config.normalize_ssl_features:
|
| 95 |
+
logger.debug("Normalizing local SSL features before encoding")
|
| 96 |
+
|
| 97 |
+
# Configure global SSL layers
|
| 98 |
+
self.global_ssl_layers = list(config.global_ssl_layers)
|
| 99 |
+
if len(self.global_ssl_layers) > 1:
|
| 100 |
+
logger.debug(
|
| 101 |
+
f"Using average of {len(self.global_ssl_layers)} SSL layers for global branch: {self.global_ssl_layers}"
|
| 102 |
+
)
|
| 103 |
+
else:
|
| 104 |
+
logger.debug(f"Using single SSL layer {self.global_ssl_layers[0]} for global branch")
|
| 105 |
+
|
| 106 |
+
def _init_local_branch(
    self,
    config: KanadeModelConfig,
    local_encoder: Transformer,
    local_quantizer: FiniteScalarQuantizer,
    feature_decoder: Transformer | None,
):
    """Set up the local (content) branch: encoder, optional temporal resampling, quantizer, decoder."""
    self.local_encoder = local_encoder
    self.local_quantizer = local_quantizer
    self.feature_decoder = feature_decoder

    self.downsample_factor = config.downsample_factor
    # Default: no learned resampling modules.
    self.conv_downsample = None
    self.conv_upsample = None
    if self.downsample_factor <= 1:
        return

    logger.debug(f"Using temporal downsampling with factor {self.downsample_factor}")
    if not config.use_conv_downsample:
        logger.debug("Using average pooling and linear interpolation for downsampling/upsampling")
        return

    # Strided Conv1d compresses time; the transposed conv restores it.
    # The transposed conv is only exercised when training feature reconstruction.
    feature_dim = local_encoder.output_dim
    self.conv_downsample = nn.Conv1d(
        feature_dim, feature_dim, kernel_size=config.downsample_factor, stride=config.downsample_factor
    )
    self.conv_upsample = nn.ConvTranspose1d(
        feature_dim, feature_dim, kernel_size=config.downsample_factor, stride=config.downsample_factor
    )
    logger.debug(f"Using Conv1d downsampling/upsampling with kernel size {config.downsample_factor}")
| 140 |
+
def _init_global_branch(self, global_encoder: GlobalEncoder):
    """Set up the global (utterance-level) branch, which consists solely of the global encoder."""
    self.global_encoder = global_encoder
| 144 |
+
def _init_mel_decoder(
    self, config: KanadeModelConfig, mel_prenet: Transformer, mel_decoder: Transformer, mel_postnet: PostNet
):
    """Set up the mel-decoding stack: prenet -> optional learned upsample -> decoder -> postnet."""
    self.mel_prenet = mel_prenet
    self.mel_decoder = mel_decoder
    self.mel_postnet = mel_postnet

    # Learned temporal upsampling toward the mel frame rate; exact length matching
    # is finished later by interpolation inside forward_mel.
    self.mel_conv_upsample = None
    if config.mel_upsample_factor > 1:
        prenet_dim = mel_prenet.output_dim
        self.mel_conv_upsample = nn.ConvTranspose1d(
            prenet_dim, prenet_dim, kernel_size=config.mel_upsample_factor, stride=config.mel_upsample_factor
        )
        logger.debug(f"Using Conv1DTranspose for mel upsampling with factor {config.mel_upsample_factor}")
| 162 |
+
def _calculate_waveform_padding(self, audio_length: int, ensure_recon_length: bool = False) -> int:
|
| 163 |
+
"""Calculate required padding for input waveform to ensure consistent SSL feature lengths."""
|
| 164 |
+
extractor = self.ssl_feature_extractor
|
| 165 |
+
sample_rate = self.config.sample_rate
|
| 166 |
+
# SSL may resample the input to its own sample rate, so calculate the number of samples after resampling
|
| 167 |
+
num_samples_after_resampling = audio_length / sample_rate * extractor.ssl_sample_rate
|
| 168 |
+
# We expect the SSL feature extractor to be consistent with its hop size
|
| 169 |
+
expected_ssl_output_length = math.ceil(num_samples_after_resampling / extractor.hop_size)
|
| 170 |
+
# If ensure_recon_length is True, we want to make sure the output length is exactly divisible by downsample factor
|
| 171 |
+
if ensure_recon_length and (remainder := expected_ssl_output_length % self.downsample_factor) != 0:
|
| 172 |
+
expected_ssl_output_length += self.downsample_factor - remainder
|
| 173 |
+
# But it may require more input samples to produce that output length, so calculate the required input length
|
| 174 |
+
num_samples_required_after_resampling = extractor.get_minimum_input_length(expected_ssl_output_length)
|
| 175 |
+
# That number of samples is at the SSL sample rate, so convert back to our original sample rate
|
| 176 |
+
num_samples_required = num_samples_required_after_resampling / extractor.ssl_sample_rate * sample_rate
|
| 177 |
+
# Calculate padding needed on each side
|
| 178 |
+
padding = math.ceil((num_samples_required - audio_length) / 2)
|
| 179 |
+
return padding
|
| 180 |
+
|
| 181 |
+
def _calculate_original_audio_length(self, token_length: int) -> int:
|
| 182 |
+
"""Calculate the original audio length based on token length."""
|
| 183 |
+
extractor = self.ssl_feature_extractor
|
| 184 |
+
sample_rate = self.config.sample_rate
|
| 185 |
+
# Calculate the feature length before downsampling
|
| 186 |
+
feature_length = token_length * self.downsample_factor
|
| 187 |
+
num_samples_required_after_resampling = extractor.get_minimum_input_length(feature_length)
|
| 188 |
+
num_samples_required = num_samples_required_after_resampling / extractor.ssl_sample_rate * sample_rate
|
| 189 |
+
return math.ceil(num_samples_required)
|
| 190 |
+
|
| 191 |
+
def _calculate_target_mel_length(self, audio_length: int) -> int:
|
| 192 |
+
"""Calculate the target mel spectrogram length based on audio length."""
|
| 193 |
+
if self.config.padding == "center":
|
| 194 |
+
return audio_length // self.config.hop_length + 1
|
| 195 |
+
elif self.config.padding == "same":
|
| 196 |
+
return audio_length // self.config.hop_length
|
| 197 |
+
else:
|
| 198 |
+
return (audio_length - self.config.n_fft) // self.config.hop_length + 1
|
| 199 |
+
|
| 200 |
+
def _process_ssl_features(self, features: list[torch.Tensor], layers: list[int]) -> torch.Tensor:
|
| 201 |
+
if len(layers) > 1:
|
| 202 |
+
# Get features from multiple layers and average them
|
| 203 |
+
selected_features = [features[i - 1] for i in layers]
|
| 204 |
+
mixed_features = torch.stack(selected_features, dim=0).mean(dim=0)
|
| 205 |
+
else:
|
| 206 |
+
# Just take the single specified layer
|
| 207 |
+
mixed_features = features[layers[0] - 1]
|
| 208 |
+
return mixed_features
|
| 209 |
+
|
| 210 |
+
def _normalize_ssl_features(self, features: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
|
| 211 |
+
if not self.config.normalize_ssl_features:
|
| 212 |
+
return features
|
| 213 |
+
|
| 214 |
+
# Compute mean and std across time steps for each sample and feature dimension
|
| 215 |
+
mean = torch.mean(features, dim=1, keepdim=True) # (B, 1, C)
|
| 216 |
+
std = torch.std(features, dim=1, keepdim=True) # (B, 1, C)
|
| 217 |
+
return (features - mean) / (std + eps)
|
| 218 |
+
|
| 219 |
+
def forward_ssl_features(
|
| 220 |
+
self, waveform: torch.Tensor, padding: int | None = None
|
| 221 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 222 |
+
"""Forward pass to extract SSL features. (B, T, C)
|
| 223 |
+
Args:
|
| 224 |
+
waveform: Input waveform tensor of shape (B, channels, samples)
|
| 225 |
+
padding: Optional padding to apply on both sides of the waveform. This is useful to ensure
|
| 226 |
+
that the SSL feature extractor produces consistent output lengths.
|
| 227 |
+
Returns:
|
| 228 |
+
local_ssl_features: Local SSL features for local branch. (B, T, C)
|
| 229 |
+
global_ssl_features: Global SSL features for global branch. (B, T, C)
|
| 230 |
+
"""
|
| 231 |
+
# Prepare input waveform
|
| 232 |
+
if waveform.dim() == 3:
|
| 233 |
+
waveform = waveform.squeeze(1)
|
| 234 |
+
|
| 235 |
+
# 1. Extract SSL features
|
| 236 |
+
if padding > 0:
|
| 237 |
+
waveform = F.pad(waveform, (padding, padding), mode="constant")
|
| 238 |
+
|
| 239 |
+
with torch.no_grad():
|
| 240 |
+
ssl_features = self.ssl_feature_extractor(waveform)
|
| 241 |
+
|
| 242 |
+
local_ssl_features = self._process_ssl_features(ssl_features, self.local_ssl_layers)
|
| 243 |
+
local_ssl_features = self._normalize_ssl_features(local_ssl_features)
|
| 244 |
+
|
| 245 |
+
global_ssl_features = self._process_ssl_features(ssl_features, self.global_ssl_layers)
|
| 246 |
+
|
| 247 |
+
return local_ssl_features, global_ssl_features
|
| 248 |
+
|
| 249 |
+
def forward_content(
    self, local_ssl_features: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor]:
    """Forward pass to extract content embeddings from the local branch.

    Encodes local SSL features, optionally downsamples them in time, and quantizes
    the result. When a feature decoder is configured (training feature
    reconstruction), also reconstructs SSL features from the quantized latent and
    reports the quantizer perplexity.

    Args:
        local_ssl_features: Local SSL features tensor of shape (B, T, C)

    Returns:
        local_quantized: Quantized local embeddings. (B, T/factor, C)
        indices: Content token indices. (B, T/factor)
        ssl_recon: Reconstructed SSL features, or None when no feature decoder is present. (B, T, C)
        perplexity: Mean quantizer perplexity (0.0 when no feature decoder is present). Scalar tensor.
    """
    local_encoded = self.local_encoder(local_ssl_features)

    # Downsample temporally if needed: (B, T, C) -> (B, T/factor, C)
    if self.downsample_factor > 1:
        if self.config.use_conv_downsample:
            # Conv1d expects channels-first, hence the transposes around the call.
            local_encoded = self.conv_downsample(local_encoded.transpose(1, 2)).transpose(1, 2)
        else:
            # Parameter-free alternative: average pooling over time.
            local_encoded = F.avg_pool1d(
                local_encoded.transpose(1, 2), kernel_size=self.downsample_factor, stride=self.downsample_factor
            ).transpose(1, 2)

    # If training feature reconstruction, decode local embeddings
    ssl_recon = None
    perplexity = torch.tensor(0.0)
    if self.feature_decoder is not None:
        # Training path: full quantizer call also yields indices and perplexity stats.
        local_quantized, local_quantize_info = self.local_quantizer(local_encoded)
        indices = local_quantize_info["indices"]
        perplexity = torch.mean(local_quantize_info["perplexity"])

        local_latent_for_ssl = local_quantized
        # Upsample back to the SSL frame rate before reconstructing SSL features.
        if self.downsample_factor > 1:
            if self.config.use_conv_downsample:
                # Use conv transpose for upsampling: (B, T/factor, C) -> (B, C, T/factor) -> conv -> (B, C, T) -> (B, T, C)
                local_latent_for_ssl = self.conv_upsample(local_latent_for_ssl.transpose(1, 2)).transpose(1, 2)
            else:
                # (B, T/factor, C) -> (B, T, C); interpolate to the exact SSL length.
                local_latent_for_ssl = F.interpolate(
                    local_latent_for_ssl.transpose(1, 2),
                    size=local_ssl_features.shape[1],
                    mode=self.config.local_interpolation_mode,
                ).transpose(1, 2)

        ssl_recon = self.feature_decoder(local_latent_for_ssl)
    else:
        # If not training feature reconstruction, just get quantized local embeddings
        local_quantized, indices = self.local_quantizer.encode(local_encoded)

    return local_quantized, indices, ssl_recon, perplexity
| 301 |
+
def forward_global(self, global_ssl_features: torch.Tensor) -> torch.Tensor:
    """Encode SSL features into a single utterance-level embedding.

    Args:
        global_ssl_features: Global SSL features tensor of shape (B, T, C)

    Returns:
        Global embedding of shape (B, C).
    """
    return self.global_encoder(global_ssl_features)
| 311 |
+
def forward_mel(
    self, content_embeddings: torch.Tensor, global_embeddings: torch.Tensor, mel_length: int
) -> torch.Tensor:
    """Forward pass to generate mel spectrogram from content and global embeddings.

    Args:
        content_embeddings: Content embeddings tensor of shape (B, T, C)
        global_embeddings: Global embeddings tensor of shape (B, C)
        mel_length: Target mel spectrogram length (T_mel)

    Returns:
        mel_recon: Reconstructed mel spectrogram tensor of shape (B, n_mels, T_mel)
    """
    local_latent = self.mel_prenet(content_embeddings)

    # Upsample local latent to match mel spectrogram length:
    # first a learned Conv1DTranspose (when configured) does the coarse upsampling ...
    if self.mel_conv_upsample is not None:
        # (B, T/factor, C) -> (B, C, T/factor) -> conv -> (B, C, T*upsample_factor) -> (B, T*upsample_factor, C)
        local_latent = self.mel_conv_upsample(local_latent.transpose(1, 2)).transpose(1, 2)
    # ... then interpolation hits the exact target frame count.
    local_latent = F.interpolate(
        local_latent.transpose(1, 2), size=mel_length, mode=self.config.mel_interpolation_mode
    ).transpose(1, 2)  # (B, T_current, C) -> (B, T_mel, C)

    # Generate mel spectrogram, conditioned on global embeddings
    # (unsqueezed to (B, 1, C) so the condition broadcasts over time).
    mel_recon = self.mel_decoder(local_latent, condition=global_embeddings.unsqueeze(1))
    mel_recon = mel_recon.transpose(1, 2)  # (B, n_mels, T)

    # Final refinement pass over the raw decoder output.
    mel_recon = self.mel_postnet(mel_recon)
    return mel_recon
| 340 |
+
# ======== Inference methods ========
|
| 341 |
+
|
| 342 |
+
def weights_to_save(self, *, include_modules: list[str]) -> dict[str, torch.Tensor]:
    """Collect parameters for checkpointing, dropping modules not needed for inference.

    Modules listed in ``include_modules`` are kept even if they are normally excluded.
    """
    # Modules that are only needed during training unless explicitly requested.
    droppable = ("ssl_feature_extractor", "feature_decoder", "conv_upsample")
    excluded = tuple(m for m in droppable if m not in include_modules)
    # str.startswith accepts a tuple of prefixes; an empty tuple matches nothing.
    return {name: param for name, param in self.named_parameters() if not name.startswith(excluded)}
| 354 |
+
@classmethod
def from_hparams(cls, config_path: str) -> "KanadeModel":
    """Instantiate a KanadeModel from a jsonargparse/YAML configuration file.

    Args:
        config_path (str): Path to model configuration file (.yaml).

    Returns:
        KanadeModel: The instantiated (randomly initialized) model.
    """
    parser = jsonargparse.ArgumentParser(exit_on_error=False)
    parser.add_argument("--model", type=KanadeModel)
    # parse_path reads the YAML; instantiate_classes builds the nested submodules.
    parsed = parser.parse_path(config_path)
    instantiated = parser.instantiate_classes(parsed)
    return instantiated.model
| 368 |
+
@classmethod
def from_pretrained(
    cls,
    repo_id: str | None = None,
    revision: str | None = None,
    config_path: str | None = None,
    weights_path: str | None = None,
) -> "KanadeModel":
    """Load a KanadeModel from the HuggingFace Hub or from local config/weights files.

    Args:
        repo_id (str, optional): HuggingFace Hub repository ID. When given, config and
            weights are downloaded from the hub.
        revision (str, optional): Branch, tag, or commit for the hub repository.
        config_path (str, optional): Local model configuration file (.yaml). Required
            together with ``weights_path`` when ``repo_id`` is not given.
        weights_path (str, optional): Local model weights file (.safetensors).

    Returns:
        KanadeModel: The loaded model.

    Raises:
        ValueError: If neither ``repo_id`` nor both local paths are provided.
    """
    # Validate arguments up front before any downloads or parsing.
    if repo_id is None and (config_path is None or weights_path is None):
        raise ValueError(
            "Please provide either HuggingFace Hub repo_id or both config_path and weights_path for model loading."
        )

    if repo_id is not None:
        # Load from HuggingFace Hub
        from huggingface_hub import hf_hub_download

        config_path = hf_hub_download(repo_id, "config.yaml", revision=revision)
        weights_path = hf_hub_download(repo_id, "model.safetensors", revision=revision)

    # Build the architecture from the config, then fill in the weights.
    model = cls.from_hparams(config_path)

    from safetensors.torch import load_file

    # strict=False: checkpoints may omit training-only modules (see weights_to_save).
    state_dict = load_file(weights_path, device="cpu")
    model.load_state_dict(state_dict, strict=False)
    logger.info(f"Loaded weights from safetensors file: {weights_path}")

    return model
| 410 |
+
@torch.inference_mode()
def encode(self, waveform: torch.Tensor, return_content: bool = True, return_global: bool = True) -> KanadeFeatures:
    """Extract content and/or global features from audio using Kanade model.

    Args:
        waveform (torch.Tensor): Input audio waveform tensor (samples,). The sample rate should match model config.
        return_content (bool): Whether to extract content features.
        return_global (bool): Whether to extract global features.

    Returns:
        KanadeFeatures: Extracted features; fields that were not requested keep their defaults.
    """
    audio_length = waveform.size(0)
    # Pad so the SSL extractor emits a consistent number of frames for this length.
    padding = self._calculate_waveform_padding(audio_length)
    local_ssl_features, global_ssl_features = self.forward_ssl_features(waveform.unsqueeze(0), padding=padding)

    result = KanadeFeatures()
    # NOTE(review): autocast is hard-coded to device_type="cuda"; confirm intended
    # behavior when running on CPU-only machines.
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
        if return_content:
            content_embedding, token_indices, _, _ = self.forward_content(local_ssl_features)
            # Drop the batch dimension added above.
            result.content_embedding = content_embedding.squeeze(0)  # (seq_len, dim)
            result.content_token_indices = token_indices.squeeze(0)  # (seq_len,)

        if return_global:
            global_embedding = self.forward_global(global_ssl_features)
            result.global_embedding = global_embedding.squeeze(0)  # (dim,)

    return result
| 437 |
+
def decode_token_indices(self, indices: torch.Tensor) -> torch.Tensor:
    """Look up content embeddings for content token indices. (..., seq_len) -> (..., seq_len, dim)"""
    return self.local_quantizer.decode(indices)
| 442 |
+
@torch.inference_mode()
def decode(
    self,
    global_embedding: torch.Tensor,
    content_token_indices: torch.Tensor | None = None,
    content_embedding: torch.Tensor | None = None,
    target_audio_length: int | None = None,
) -> torch.Tensor:
    """Synthesize audio from content and global features using Kanade model and Vocos.

    Args:
        global_embedding (torch.Tensor): Global embedding tensor (dim,).
        content_token_indices (torch.Tensor, optional): Optional content token indices tensor (seq_len).
        content_embedding (torch.Tensor, optional): Optional content embedding tensor (seq_len, dim).
            If both content_token_indices and content_embedding are provided, content_embedding takes precedence.
        target_audio_length (int, optional): Target length of the output audio in samples.
            If None, uses the original audio length estimated from the sequence length of content tokens.

    Returns:
        torch.Tensor: Generated mel spectrogram tensor (n_mels, T).

    Raises:
        ValueError: If neither content_token_indices nor content_embedding is provided.
    """
    # Obtain content embedding if not provided
    if content_embedding is None:
        if content_token_indices is None:
            raise ValueError("Either content_token_indices or content_embedding must be provided.")
        content_embedding = self.decode_token_indices(content_token_indices)

    if target_audio_length is None:
        # Estimate original audio length from content token sequence length
        seq_len = content_embedding.size(0)
        target_audio_length = self._calculate_original_audio_length(seq_len)

    # NOTE(review): autocast is hard-coded to device_type="cuda"; confirm intended
    # behavior when running on CPU-only machines.
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
        mel_length = self._calculate_target_mel_length(target_audio_length)
        # Add the batch dimension expected by forward_mel.
        content_embedding = content_embedding.unsqueeze(0)  # (1, seq_len, dim)
        global_embedding = global_embedding.unsqueeze(0)  # (1, dim)
        mel_spectrogram = self.forward_mel(content_embedding, global_embedding, mel_length=mel_length)

    return mel_spectrogram.squeeze(0)  # (n_mels, T)
| 480 |
+
@torch.inference_mode()
def voice_conversion(self, source_waveform: torch.Tensor, reference_waveform: torch.Tensor) -> torch.Tensor:
    """Convert the source utterance to the reference speaker's voice.

    Keeps the content of the source and the global characteristics of the reference.
    Only supports single audio input; a thin wrapper around :meth:`encode` and :meth:`decode`.

    Args:
        source_waveform (torch.Tensor): Source audio waveform tensor (samples,).
        reference_waveform (torch.Tensor): Reference audio waveform tensor (samples_ref,).

    Returns:
        torch.Tensor: Converted mel spectrogram tensor (n_mels, T).
    """
    # Content from the source; global characteristics from the reference.
    content = self.encode(source_waveform, return_content=True, return_global=False)
    speaker = self.encode(reference_waveform, return_content=False, return_global=True)

    # Target length follows the source so timing is preserved.
    return self.decode(
        content_embedding=content.content_embedding,
        global_embedding=speaker.global_embedding,
        target_audio_length=source_waveform.size(0),
    )
src/kanade_tokenizer/module/adaln_zero.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from: https://github.com/facebookresearch/DiT
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from torch import nn
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class AdaLNZero(nn.Module):
|
| 9 |
+
"""
|
| 10 |
+
Adaptive Layer Normalization Zero (AdaLNZero) module.
|
| 11 |
+
|
| 12 |
+
Combines LayerNorm with adaptive conditioning to produce shift, scale, and gate values.
|
| 13 |
+
The gate is used to scale features before residual connection.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
dim: Feature dimension
|
| 17 |
+
condition_dim: Conditioning dimension
|
| 18 |
+
eps: LayerNorm epsilon
|
| 19 |
+
return_gate: If True, returns gate value for scaling.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
def __init__(
|
| 23 |
+
self,
|
| 24 |
+
dim: int,
|
| 25 |
+
condition_dim: int,
|
| 26 |
+
eps: float = 1e-5,
|
| 27 |
+
return_gate: bool = True,
|
| 28 |
+
):
|
| 29 |
+
super().__init__()
|
| 30 |
+
self.dim = dim
|
| 31 |
+
self.condition_dim = condition_dim
|
| 32 |
+
self.return_gate = return_gate
|
| 33 |
+
|
| 34 |
+
# LayerNorm without learnable parameters
|
| 35 |
+
self.norm = nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
|
| 36 |
+
|
| 37 |
+
# Conditioning network: condition -> shift, scale, gate
|
| 38 |
+
output_dim = 3 * dim if return_gate else 2 * dim
|
| 39 |
+
self.condition_proj = nn.Sequential(
|
| 40 |
+
nn.SiLU(),
|
| 41 |
+
nn.Linear(condition_dim, output_dim),
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Initialize to zero for stable training
|
| 45 |
+
nn.init.zeros_(self.condition_proj[1].weight)
|
| 46 |
+
nn.init.zeros_(self.condition_proj[1].bias)
|
| 47 |
+
|
| 48 |
+
def forward(self, x: torch.Tensor, condition: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor] | None:
|
| 49 |
+
"""
|
| 50 |
+
Args:
|
| 51 |
+
x: Input tensor of shape (B, L, dim)
|
| 52 |
+
condition: Conditioning tensor of shape (B, L, condition_dim) or (B, 1, condition_dim)
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
modulated_x: Normalized and modulated features
|
| 56 |
+
gate: Gate values for scaling (None if return_gate=False)
|
| 57 |
+
"""
|
| 58 |
+
x_norm = self.norm(x)
|
| 59 |
+
condition_params = self.condition_proj(condition)
|
| 60 |
+
|
| 61 |
+
if self.return_gate:
|
| 62 |
+
shift, scale, gate = condition_params.chunk(3, dim=-1)
|
| 63 |
+
else:
|
| 64 |
+
shift, scale = condition_params.chunk(2, dim=-1)
|
| 65 |
+
gate = None
|
| 66 |
+
|
| 67 |
+
modulated_x = x_norm * (1 + scale) + shift
|
| 68 |
+
return modulated_x, gate
|
src/kanade_tokenizer/module/audio_feature.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from:
|
| 2 |
+
# Vocos: https://github.com/gemelo-ai/vocos/blob/main/vocos/feature_extractors.py
|
| 3 |
+
# BigVGAN: https://github.com/NVIDIA/BigVGAN/blob/main/meldataset.py (Also used by HiFT)
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torchaudio
|
| 7 |
+
from librosa.filters import mel as librosa_mel_fn
|
| 8 |
+
from torch import nn
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
    """Elementwise log with the input clamped to at least ``clip_val`` to avoid log(0)."""
    return torch.clip(x, min=clip_val).log()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class MelSpectrogramFeature(nn.Module):
    """Log-mel spectrogram front end supporting two conventions.

    - Vocos style (default): torchaudio mel spectrogram with HTK mel scale, no filterbank
      normalization, and "center" or "same" padding.
    - BigVGAN style: manual STFT with reflect padding and a Slaney-normalized librosa
      mel filterbank (also used by HiFT).

    Args:
        sample_rate: Audio sample rate in Hz.
        n_fft: FFT size (also used as the window size).
        hop_length: Hop size in samples.
        n_mels: Number of mel bands.
        padding: "center" or "same"; only used by the Vocos-style path.
        fmin: Lowest mel filterbank frequency in Hz.
        fmax: Highest mel filterbank frequency in Hz (None = Nyquist).
        bigvgan_style_mel: If True, use the BigVGAN-style computation.
    """

    def __init__(
        self,
        sample_rate: int = 24000,
        n_fft: int = 1024,
        hop_length: int = 256,
        n_mels: int = 100,
        padding: str = "center",
        fmin: int = 0,
        fmax: int | None = None,
        bigvgan_style_mel: bool = False,
    ):
        super().__init__()

        self.bigvgan_style_mel = bigvgan_style_mel
        if bigvgan_style_mel:
            # BigVGAN style: same padding, Slaney mel scale, with normalization
            self.n_fft = n_fft
            self.win_size = n_fft
            self.hop_size = hop_length
            # (n_mels, n_fft // 2 + 1)
            mel_basis = librosa_mel_fn(
                sr=sample_rate, n_fft=n_fft, n_mels=n_mels, norm="slaney", htk=False, fmin=fmin, fmax=fmax
            )
            mel_basis = torch.from_numpy(mel_basis).float()
            hann_window = torch.hann_window(n_fft)
            # Buffers so they follow .to(device) without being trained.
            self.register_buffer("mel_basis", mel_basis)
            self.register_buffer("hann_window", hann_window)
        else:
            # Vocos style: center padding, HTK mel scale, without normalization
            if padding not in ["center", "same"]:
                raise ValueError("Padding must be 'center' or 'same'.")

            self.padding = padding
            # BUGFIX: torchaudio's MelSpectrogram takes `f_min`/`f_max`; the previous
            # `fmin=`/`fmax=` keywords raised TypeError on construction.
            self.mel_spec = torchaudio.transforms.MelSpectrogram(
                sample_rate=sample_rate,
                n_fft=n_fft,
                hop_length=hop_length,
                n_mels=n_mels,
                center=padding == "center",
                power=1,
                f_min=fmin,
                f_max=fmax,
            )

    def forward(self, audio: torch.Tensor) -> torch.Tensor:
        """
        Returns:
            mel_specgram (Tensor): Mel spectrogram of the input audio. (B, C, L)
        """
        if self.bigvgan_style_mel:
            return self.bigvgan_mel(audio)
        else:
            return self.vocos_mel(audio)

    def vocos_mel(self, audio: torch.Tensor) -> torch.Tensor:
        """Vocos-style log-mel: optional "same" reflect padding, magnitude spectrogram, HTK mel."""
        if self.padding == "same":
            pad = self.mel_spec.win_length - self.mel_spec.hop_length
            audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")

        specgram = self.mel_spec.spectrogram(audio)
        mel_specgram = self.mel_spec.mel_scale(specgram)

        # Convert to log scale (clipped to avoid log(0)).
        mel_specgram = safe_log(mel_specgram)
        return mel_specgram

    def bigvgan_mel(self, audio: torch.Tensor) -> torch.Tensor:
        """BigVGAN-style log-mel: manual STFT with reflect padding and Slaney filterbank."""
        # Pad so that the output length T = L // hop_length
        padding = (self.n_fft - self.hop_size) // 2
        audio = torch.nn.functional.pad(audio, (padding, padding), mode="reflect")
        # Flatten any leading dims into a batch for torch.stft.
        audio = audio.reshape(-1, audio.shape[-1])

        spec = torch.stft(
            audio,
            n_fft=self.n_fft,
            hop_length=self.hop_size,
            win_length=self.win_size,
            window=self.hann_window,
            center=False,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        # Restore the original leading dims.
        spec = spec.reshape(audio.shape[:-1] + spec.shape[-2:])

        # Magnitude with a small floor for numerical stability.
        spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
        mel_spec = torch.matmul(self.mel_basis, spec)
        mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
        return mel_spec
|
src/kanade_tokenizer/module/convnext.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from: https://github.com/gemelo-ai/vocos/blob/main/vocos/models.py
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from torch import nn
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ConvNeXtBlock(nn.Module):
|
| 9 |
+
"""ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.
|
| 10 |
+
|
| 11 |
+
Args:
|
| 12 |
+
dim (int): Number of input channels.
|
| 13 |
+
intermediate_dim (int): Dimensionality of the intermediate layer.
|
| 14 |
+
layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
|
| 15 |
+
Defaults to None.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
dim: int,
|
| 21 |
+
intermediate_dim: int,
|
| 22 |
+
layer_scale_init_value: float,
|
| 23 |
+
):
|
| 24 |
+
super().__init__()
|
| 25 |
+
self.dwconv = nn.Conv1d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
|
| 26 |
+
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
| 27 |
+
self.pwconv1 = nn.Linear(dim, intermediate_dim) # pointwise/1x1 convs, implemented with linear layers
|
| 28 |
+
self.act = nn.GELU()
|
| 29 |
+
self.pwconv2 = nn.Linear(intermediate_dim, dim)
|
| 30 |
+
self.gamma = (
|
| 31 |
+
nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
|
| 32 |
+
if layer_scale_init_value > 0
|
| 33 |
+
else None
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 37 |
+
residual = x
|
| 38 |
+
x = self.dwconv(x)
|
| 39 |
+
x = x.transpose(1, 2) # (B, C, T) -> (B, T, C)
|
| 40 |
+
x = self.norm(x)
|
| 41 |
+
x = self.pwconv1(x)
|
| 42 |
+
x = self.act(x)
|
| 43 |
+
x = self.pwconv2(x)
|
| 44 |
+
if self.gamma is not None:
|
| 45 |
+
x = self.gamma * x
|
| 46 |
+
x = x.transpose(1, 2) # (B, T, C) -> (B, C, T)
|
| 47 |
+
|
| 48 |
+
x = residual + x
|
| 49 |
+
return x
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ConvNextBackbone(nn.Module):
|
| 53 |
+
"""
|
| 54 |
+
Backbone module built with ConvNeXt blocks.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
input_channels (int): Number of input features channels.
|
| 58 |
+
dim (int): Hidden dimension of the model.
|
| 59 |
+
intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
|
| 60 |
+
num_layers (int): Number of ConvNeXtBlock layers.
|
| 61 |
+
layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
def __init__(
|
| 65 |
+
self,
|
| 66 |
+
input_channels: int,
|
| 67 |
+
dim: int,
|
| 68 |
+
intermediate_dim: int,
|
| 69 |
+
num_layers: int,
|
| 70 |
+
output_channels: int | None = None,
|
| 71 |
+
layer_scale_init_value: float | None = None,
|
| 72 |
+
skip_embed: bool = False,
|
| 73 |
+
):
|
| 74 |
+
super().__init__()
|
| 75 |
+
self.input_channels = input_channels
|
| 76 |
+
self.output_channels = output_channels
|
| 77 |
+
self.dim = dim
|
| 78 |
+
self.embed = nn.Conv1d(input_channels, dim, kernel_size=7, padding=3) if not skip_embed else nn.Identity()
|
| 79 |
+
self.norm = nn.LayerNorm(dim, eps=1e-6)
|
| 80 |
+
layer_scale_init_value = layer_scale_init_value or 1 / num_layers
|
| 81 |
+
self.convnext = nn.ModuleList(
|
| 82 |
+
[
|
| 83 |
+
ConvNeXtBlock(
|
| 84 |
+
dim=dim,
|
| 85 |
+
intermediate_dim=intermediate_dim,
|
| 86 |
+
layer_scale_init_value=layer_scale_init_value,
|
| 87 |
+
)
|
| 88 |
+
for _ in range(num_layers)
|
| 89 |
+
]
|
| 90 |
+
)
|
| 91 |
+
self.proj_out = nn.Linear(dim, output_channels) if output_channels else nn.Identity()
|
| 92 |
+
self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
|
| 93 |
+
self.apply(self._init_weights)
|
| 94 |
+
|
| 95 |
+
@property
|
| 96 |
+
def input_dim(self) -> int:
|
| 97 |
+
return self.input_channels
|
| 98 |
+
|
| 99 |
+
@property
|
| 100 |
+
def output_dim(self) -> int:
|
| 101 |
+
return self.output_channels if self.output_channels else self.dim
|
| 102 |
+
|
| 103 |
+
def _init_weights(self, m):
|
| 104 |
+
if isinstance(m, (nn.Conv1d, nn.Linear)):
|
| 105 |
+
nn.init.trunc_normal_(m.weight, std=0.02)
|
| 106 |
+
nn.init.constant_(m.bias, 0)
|
| 107 |
+
|
| 108 |
+
def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
|
| 109 |
+
"""
|
| 110 |
+
Args:
|
| 111 |
+
x (Tensor): Input tensor of shape (B, L, C), where B is the batch size,
|
| 112 |
+
C denotes output features, and L is the sequence length.
|
| 113 |
+
Returns:
|
| 114 |
+
Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
|
| 115 |
+
and H denotes the model dimension.
|
| 116 |
+
"""
|
| 117 |
+
x = x.transpose(1, 2) # (B, L, C) -> (B, C, L)
|
| 118 |
+
x = self.embed(x)
|
| 119 |
+
x = self.norm(x.transpose(1, 2))
|
| 120 |
+
x = x.transpose(1, 2)
|
| 121 |
+
for conv_block in self.convnext:
|
| 122 |
+
x = conv_block(x)
|
| 123 |
+
x = self.final_layer_norm(x.transpose(1, 2))
|
| 124 |
+
x = self.proj_out(x)
|
| 125 |
+
return x
|
src/kanade_tokenizer/module/discriminator.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from:
|
| 2 |
+
# https://github.com/gemelo-ai/vocos/blob/main/vocos/discriminators.py
|
| 3 |
+
# https://github.com/gemelo-ai/vocos/blob/main/vocos/loss.py
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
from einops import rearrange
|
| 7 |
+
from torch import nn
|
| 8 |
+
from torch.nn.utils.parametrizations import weight_norm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_2d_padding(kernel_size: tuple[int, int], dilation: tuple[int, int] = (1, 1)):
|
| 12 |
+
return (((kernel_size[0] - 1) * dilation[0]) // 2, ((kernel_size[1] - 1) * dilation[1]) // 2)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class SpectrogramDiscriminator(nn.Module):
|
| 16 |
+
def __init__(
|
| 17 |
+
self,
|
| 18 |
+
frequency_bins: int,
|
| 19 |
+
channels: int = 32,
|
| 20 |
+
kernel_size: tuple[int, int] = (3, 3),
|
| 21 |
+
dilation: list[int] = [1, 2, 4],
|
| 22 |
+
bands: tuple[tuple[float, float], ...] = ((0.0, 0.2), (0.2, 0.4), (0.4, 0.6), (0.6, 0.8), (0.8, 1.0)),
|
| 23 |
+
use_downsample: bool = True,
|
| 24 |
+
):
|
| 25 |
+
super().__init__()
|
| 26 |
+
self.bands = [(int(b[0] * frequency_bins), int(b[1] * frequency_bins)) for b in bands]
|
| 27 |
+
|
| 28 |
+
self.stacks = nn.ModuleList()
|
| 29 |
+
for _ in self.bands:
|
| 30 |
+
stack = nn.ModuleList(
|
| 31 |
+
[weight_norm(nn.Conv2d(1, channels, kernel_size, padding=get_2d_padding(kernel_size)))]
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
for d in dilation:
|
| 35 |
+
# dilation on time axis
|
| 36 |
+
pad = get_2d_padding(kernel_size, (d, 1))
|
| 37 |
+
stack.append(weight_norm(nn.Conv2d(channels, channels, kernel_size, dilation=(d, 1), padding=pad)))
|
| 38 |
+
|
| 39 |
+
stack.append(weight_norm(nn.Conv2d(channels, channels, kernel_size, padding=get_2d_padding(kernel_size))))
|
| 40 |
+
|
| 41 |
+
self.stacks.append(stack)
|
| 42 |
+
|
| 43 |
+
self.conv_post = weight_norm(nn.Conv2d(channels, 1, kernel_size, padding=get_2d_padding(kernel_size)))
|
| 44 |
+
if use_downsample:
|
| 45 |
+
self.downsample = nn.AvgPool2d(4, stride=2, padding=1, count_include_pad=False)
|
| 46 |
+
else:
|
| 47 |
+
self.downsample = nn.Identity()
|
| 48 |
+
|
| 49 |
+
def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, list[torch.Tensor]]:
|
| 50 |
+
"""
|
| 51 |
+
Args:
|
| 52 |
+
x (Tensor): Input spectrogram (B, C, F, T).
|
| 53 |
+
Returns:
|
| 54 |
+
output (Tensor): Discriminator output.
|
| 55 |
+
intermediates (list[Tensor]): List of intermediate feature maps.
|
| 56 |
+
"""
|
| 57 |
+
if x.dim() == 3:
|
| 58 |
+
x = x.unsqueeze(1)
|
| 59 |
+
assert x.dim() == 4, f"Expected 4D input, got {x.dim()}D"
|
| 60 |
+
|
| 61 |
+
# Split into bands
|
| 62 |
+
x = rearrange(x, "b c f t -> b c t f")
|
| 63 |
+
x_bands = [x[..., b[0] : b[1]] for b in self.bands]
|
| 64 |
+
|
| 65 |
+
x = []
|
| 66 |
+
intermediates = []
|
| 67 |
+
for x_band, stack in zip(x_bands, self.stacks):
|
| 68 |
+
for layer in stack:
|
| 69 |
+
x_band = layer(x_band)
|
| 70 |
+
x_band = torch.nn.functional.leaky_relu(x_band, 0.1)
|
| 71 |
+
intermediates.append(x_band)
|
| 72 |
+
x.append(x_band)
|
| 73 |
+
|
| 74 |
+
# Concatenate the outputs from all bands
|
| 75 |
+
x = torch.cat(x, dim=-1)
|
| 76 |
+
x = self.conv_post(x)
|
| 77 |
+
x = self.downsample(x)
|
| 78 |
+
return x, intermediates
|
src/kanade_tokenizer/module/fsq.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Finite Scalar Quantization: https://arxiv.org/abs/2309.15505
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
|
| 6 |
+
from ..util import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger()
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def round_ste(z: torch.Tensor) -> torch.Tensor:
    """Round with straight-through gradients: forward rounds, backward is identity."""
    zhat = z.round()
    return z + (zhat - z).detach()


def get_entropy(prob: torch.Tensor, eps: float = 1e-10) -> torch.Tensor:
    """Shannon entropy along the last dim; `eps` avoids log(0)."""
    return -torch.sum(prob * torch.log(prob + eps), dim=-1)


class FSQ(nn.Module):
    """Finite Scalar Quantization (https://arxiv.org/abs/2309.15505).

    Each latent dimension i is bounded and rounded to one of `levels[i]` values;
    the joint per-dimension codes are flattened into a single integer index
    using a mixed-radix basis.
    """

    def __init__(self, levels: list[int]):
        super().__init__()
        self.levels = levels
        self.dim = len(levels)

        _levels = torch.tensor(levels, dtype=torch.long)
        self.register_buffer("_levels", _levels, persistent=False)
        # Mixed-radix basis: index = sum_i code_i * prod_{j<i} levels_j
        _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=torch.long)
        self.register_buffer("_basis", _basis, persistent=False)

    def bound(self, z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
        """Bound `z`, an array of shape (..., d), into the quantizable range via tanh."""
        half_l = (self._levels - 1) * (1 - eps) / 2
        # Even level counts have no centered zero code; shift by half a step.
        offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
        shift = (offset / half_l).tan()
        return (z + shift).tanh() * half_l - offset

    def quantize(self, z: torch.Tensor) -> torch.Tensor:
        """Quantizes z; returns quantized zhat (normalized to [-1, 1]), same shape as z."""
        quantized = round_ste(self.bound(z))
        half_width = self._levels // 2  # Renormalize to [-1, 1].
        return quantized / half_width

    def _scale_and_shift(self, zhat_normalized: torch.Tensor) -> torch.Tensor:
        half_width = self._levels // 2
        return (zhat_normalized * half_width) + half_width

    def _scale_and_shift_inverse(self, zhat: torch.Tensor) -> torch.Tensor:
        half_width = self._levels // 2
        return (zhat - half_width) / half_width

    def codes_to_indices(self, zhat: torch.Tensor) -> torch.Tensor:
        """Converts a `code` to an index in the codebook. (B, T, C) -> (B, T)."""
        assert zhat.shape[-1] == len(self.levels)
        zhat = self._scale_and_shift(zhat)
        # round() before the long cast: the float renormalize/denormalize round trip
        # can land fractionally below an integer (e.g. 2.9999998), and a bare
        # truncating cast would then corrupt the index.
        return (zhat * self._basis.to(torch.float64)).round().to(torch.long).sum(dim=-1)

    def indices_to_codes(self, indices: torch.Tensor) -> torch.Tensor:
        """Inverse of `codes_to_indices`. (B, T) -> (B, T, C)."""
        indices = indices.unsqueeze(-1)
        codes_non_centered = (indices // self._basis) % self._levels
        return self._scale_and_shift_inverse(codes_non_centered)

    def encode(self, z: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Return (quantized codes in [-1, 1], flat indices of shape (B, T))."""
        z_q = self.quantize(z)
        indices = self.codes_to_indices(z_q)  # (B, T)
        return z_q, indices

    def decode(self, indices: torch.Tensor) -> torch.Tensor:
        """Recover normalized codes (B, T, C) from flat indices."""
        z_q = self.indices_to_codes(indices)
        return z_q

    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        z_q = self.quantize(z)
        indices = self.codes_to_indices(z_q)  # (B, T)
        return z_q, indices
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class FiniteScalarQuantizer(nn.Module):
    """Wraps FSQ with linear projections into and out of the quantized latent space."""

    def __init__(self, input_dim: int, output_dim: int, levels: list[int]) -> None:
        super().__init__()
        self.input_dim_ = input_dim
        self.output_dim_ = output_dim

        self.fsq = FSQ(levels)
        logger.debug(
            f"Finite Scalar Quantizer with levels: {levels}, input_dim: {input_dim}, output_dim: {output_dim}, codebook_size: {self.all_codebook_size}"
        )

        # Projections collapse to identity when the dimensions already match.
        num_levels = len(levels)
        self.proj_in = nn.Identity() if num_levels == input_dim else nn.Linear(input_dim, num_levels)
        self.proj_out = nn.Identity() if num_levels == output_dim else nn.Linear(num_levels, output_dim)

    def build_codebook(self) -> None:
        # FSQ has no explicit codebook to materialize.
        pass

    @property
    def output_dim(self) -> int:
        return self.output_dim_

    @property
    def all_codebook_size(self) -> int:
        total = 1
        for n in self.fsq.levels:
            total *= n
        return total

    def forward(self, z: torch.Tensor) -> tuple[torch.Tensor, dict]:
        """Quantize z; return the projected quantized latent plus an info dict."""
        latent = self.proj_in(z)  # Latent projected by proj_in
        quantized_latent, indices = self.fsq(latent)  # Quantized latent before proj_out
        z_q = self.proj_out(quantized_latent)

        # Perplexity of the empirical code-usage distribution in this batch.
        flat_indices = indices.view(-1)
        unique_indices, counts = torch.unique(flat_indices, return_counts=True)
        used_indices_probs = counts.float() / flat_indices.numel()
        perplexity = torch.exp(get_entropy(used_indices_probs))

        return z_q, {
            "latent": latent,
            "quantized_latent": quantized_latent,
            "indices": indices,
            "perplexity": perplexity,
        }

    def encode(self, z: torch.Tensor, skip_proj: bool = False) -> tuple[torch.Tensor, torch.Tensor]:
        projected = self.proj_in(z)
        z_q, indices = self.fsq.encode(projected)
        return (z_q, indices) if skip_proj else (self.proj_out(z_q), indices)

    def decode(self, indices: torch.Tensor) -> torch.Tensor:
        return self.proj_out(self.fsq.decode(indices))
|
src/kanade_tokenizer/module/global_encoder.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from: https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/ecapa_tdnn.py
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
from .convnext import ConvNextBackbone
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class AttentiveStatsPool(nn.Module):
    """Attentive statistics pooling: attention-weighted mean and std over time,
    concatenated and projected to `output_channels`."""

    def __init__(self, input_channels: int, output_channels: int, attention_channels: int = 128):
        super().__init__()

        # Per-frame attention weights, normalized over the time axis.
        self.attn = nn.Sequential(
            nn.Conv1d(input_channels, attention_channels, kernel_size=1),
            nn.Tanh(),
            nn.Conv1d(attention_channels, input_channels, kernel_size=1),
            nn.Softmax(dim=2),
        )
        self.proj = nn.Linear(input_channels * 2, output_channels)
        self.norm = nn.LayerNorm(output_channels)

    def forward(self, x):
        # x: (B, C, T)
        weights = self.attn(x)

        weighted_mean = (weights * x).sum(dim=2)
        variance = (weights * x.pow(2)).sum(dim=2) - weighted_mean.pow(2)
        # Clamp guards against tiny negative values from floating-point cancellation.
        weighted_std = variance.clamp(min=1e-4, max=1e4).sqrt()

        stats = torch.cat([weighted_mean, weighted_std], dim=1)
        return self.norm(self.proj(stats))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class GlobalEncoder(nn.Module):
    """Encode a feature sequence into one global embedding: a ConvNeXt backbone
    followed by pooling over the time axis."""

    def __init__(
        self,
        input_channels: int,
        output_channels: int,
        dim: int,
        intermediate_dim: int,
        num_layers: int,
        skip_embed: bool = False,
        attention_channels: int = 128,
        use_attn_pool: bool = True,
    ):
        super().__init__()

        self.backbone = ConvNextBackbone(
            input_channels=input_channels,
            dim=dim,
            intermediate_dim=intermediate_dim,
            num_layers=num_layers,
            skip_embed=skip_embed,
        )
        if use_attn_pool:
            # Attention-weighted mean/std pooling.
            self.pooling = AttentiveStatsPool(
                input_channels=dim, output_channels=output_channels, attention_channels=attention_channels
            )
        else:
            # Plain average pooling over time, then projection + norm.
            self.pooling = nn.Sequential(
                nn.AdaptiveAvgPool1d(1),
                nn.Flatten(1),
                nn.Linear(dim, output_channels),
                nn.LayerNorm(output_channels),
            )
        self.output_channels = output_channels

    @property
    def output_dim(self):
        return self.output_channels

    def forward(self, x):
        hidden = self.backbone(x)        # (B, T, dim)
        hidden = hidden.transpose(1, 2)  # (B, T, C) -> (B, C, T) for the pooling layers
        return self.pooling(hidden)      # (B, C_out)
|
src/kanade_tokenizer/module/hift.py
ADDED
|
@@ -0,0 +1,685 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from: https://github.com/yl4579/HiFTNet/blob/main/models.py
|
| 2 |
+
# https://github.com/FunAudioLLM/CosyVoice/blob/main/cosyvoice/hifigan/generator.py
|
| 3 |
+
|
| 4 |
+
from typing import Dict, List, Optional
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from scipy.signal import get_window
|
| 11 |
+
from torch.distributions.uniform import Uniform
|
| 12 |
+
from torch.nn import Conv1d, ConvTranspose1d
|
| 13 |
+
from torch.nn.utils import remove_weight_norm
|
| 14 |
+
from torch.nn.utils.parametrizations import weight_norm
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_padding(kernel_size, dilation=1):
    """'Same'-style padding for a dilated 1D conv: (kernel_size - 1) * dilation / 2."""
    return (kernel_size - 1) * dilation // 2
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def init_weights(m, mean=0.0, std=0.01):
    """Initialize conv-like modules' weights from N(mean, std); others are untouched."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def mel_spec_transform(
    audio: torch.Tensor,
    n_fft: int,
    n_mels: int,
    sample_rate: int,
    hop_size: int,
    win_size: int,
    fmin: int = 0,
    fmax: Optional[int] = None,
):
    """Compute a log-magnitude mel spectrogram of `audio`.

    Args:
        audio: Waveform tensor (..., L); leading dims are flattened before the STFT.
        n_fft: FFT size.
        n_mels: Number of mel bands.
        sample_rate: Sampling rate in Hz.
        hop_size: STFT hop length in samples.
        win_size: Analysis (Hann) window length in samples.
        fmin: Lower edge of the mel filterbank in Hz.
        fmax: Upper edge of the mel filterbank in Hz (None -> librosa's default).

    Returns:
        Log mel spectrogram of shape (B, n_mels, T) with T = L // hop_size,
        where B is the flattened batch dimension.
    """
    # Imported lazily so librosa is only required when this transform is used.
    from librosa.filters import mel as librosa_mel_fn

    # (n_mels, n_fft // 2 + 1)
    mel_basis = librosa_mel_fn(
        sr=sample_rate, n_fft=n_fft, n_mels=n_mels, norm="slaney", htk=False, fmin=fmin, fmax=fmax
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    hann_window = torch.hann_window(win_size)

    # Pad so that the output length T = L // hop_length
    padding = (n_fft - hop_size) // 2
    audio = torch.nn.functional.pad(audio, (padding, padding), mode="reflect")
    audio = audio.reshape(-1, audio.shape[-1])

    # (B, n_fft // 2 + 1, T=1 + (L' - n_fft) // hop_length)
    # L' = L + n_fft - hop_length
    # T = L // hop_length
    spec = torch.stft(
        audio,
        n_fft=n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=False,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # NOTE(review): `audio` was already flattened to 2D above, so this reshape uses
    # the flattened shape and is effectively a no-op — confirm intent for >2D input.
    spec = spec.reshape(audio.shape[:-1] + spec.shape[-2:])

    # Magnitude spectrum with a small epsilon for numerical stability.
    spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
    mel_spec = torch.matmul(mel_basis, spec)
    # Log-compress, clamped to avoid log(0).
    mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))

    return mel_spec
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class Snake(nn.Module):
    """
    Sine-based periodic activation: Snake(x) = x + (1/alpha) * sin^2(alpha * x).

    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable per-channel parameter; higher values = higher frequency
    References:
        - Liu Ziyin, Tilman Hartwig, Masahito Ueda:
          https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    """

    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
        """
        Args:
            in_features: number of channels (one alpha per channel).
            alpha: initial value for linear-scale alphas.
            alpha_trainable: whether alpha is learned with the rest of the model.
            alpha_logscale: store alpha in log space, initialized to 0 (exp(0) == 1).
        """
        super().__init__()
        self.in_features = in_features

        self.alpha_logscale = alpha_logscale
        if self.alpha_logscale:
            # Log-scale alphas start at zero; the original code multiplied zeros by
            # `alpha`, which is a no-op, so `alpha` is intentionally not applied here.
            init = torch.zeros(in_features)
        else:
            # Linear-scale alphas start at `alpha`.
            init = alpha * torch.ones(in_features)
        self.alpha = nn.Parameter(init, requires_grad=alpha_trainable)

        # Small epsilon guarding the 1/alpha division.
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2 (xa)
        """
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
        x = x + (1.0 / (alpha + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2)

        return x
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
class ResBlock(torch.nn.Module):
    """Residual block module in HiFiGAN/BigVGAN: pairs of (dilated, plain) convs
    with Snake activations and a residual connection around each pair."""

    def __init__(
        self,
        channels: int = 512,
        kernel_size: int = 3,
        dilations: List[int] = [1, 3, 5],
    ):
        super().__init__()
        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()

        for dilation in dilations:
            dilated_conv = Conv1d(
                channels,
                channels,
                kernel_size,
                1,
                dilation=dilation,
                padding=get_padding(kernel_size, dilation),
            )
            plain_conv = Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))
            self.convs1.append(weight_norm(dilated_conv))
            self.convs2.append(weight_norm(plain_conv))

        self.convs1.apply(init_weights)
        self.convs2.apply(init_weights)
        self.activations1 = nn.ModuleList(Snake(channels, alpha_logscale=False) for _ in self.convs1)
        self.activations2 = nn.ModuleList(Snake(channels, alpha_logscale=False) for _ in self.convs2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for act1, conv1, act2, conv2 in zip(self.activations1, self.convs1, self.activations2, self.convs2):
            residual = x
            x = conv1(act1(x))
            x = conv2(act2(x))
            x = x + residual
        return x

    def remove_weight_norm(self):
        for conv1, conv2 in zip(self.convs1, self.convs2):
            remove_weight_norm(conv1)
            remove_weight_norm(conv2)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
class ConvRNNF0Predictor(nn.Module):
|
| 179 |
+
def __init__(self, num_class: int = 1, in_channels: int = 80, cond_channels: int = 512):
|
| 180 |
+
super().__init__()
|
| 181 |
+
|
| 182 |
+
self.num_class = num_class
|
| 183 |
+
self.condnet = nn.Sequential(
|
| 184 |
+
weight_norm(nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)),
|
| 185 |
+
nn.ELU(),
|
| 186 |
+
weight_norm(nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)),
|
| 187 |
+
nn.ELU(),
|
| 188 |
+
weight_norm(nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)),
|
| 189 |
+
nn.ELU(),
|
| 190 |
+
weight_norm(nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)),
|
| 191 |
+
nn.ELU(),
|
| 192 |
+
weight_norm(nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)),
|
| 193 |
+
nn.ELU(),
|
| 194 |
+
)
|
| 195 |
+
self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
|
| 196 |
+
|
| 197 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 198 |
+
x = self.condnet(x)
|
| 199 |
+
x = x.transpose(1, 2)
|
| 200 |
+
return torch.abs(self.classifier(x).squeeze(-1))
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp          # amplitude of the sine component
        self.noise_std = noise_std        # additive-noise std inside voiced regions
        self.harmonic_num = harmonic_num  # number of overtones above the fundamental
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal: 1.0 where f0 exceeds the voiced threshold, else 0.0
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    @torch.no_grad()
    def forward(self, f0):
        """
        :param f0: [B, 1, sample_len], Hz
        :return: (sine_waves, uv, noise); sine_waves and noise are
            [B, harmonic_num + 1, sample_len], uv matches f0's shape
        """

        # Normalized frequency per harmonic: f0 * (i + 1) / sample_rate for harmonic i.
        F_mat = torch.zeros((f0.size(0), self.harmonic_num + 1, f0.size(-1))).to(f0.device)
        for i in range(self.harmonic_num + 1):
            F_mat[:, i : i + 1, :] = f0 * (i + 1) / self.sampling_rate

        # Instantaneous phase from the cumulative normalized frequency; the mod-1
        # keeps the cumulative sum numerically small before scaling to radians.
        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
        # Random initial phase per harmonic; the fundamental keeps phase 0.
        u_dist = Uniform(low=-np.pi, high=np.pi)
        phase_vec = u_dist.sample(sample_shape=(f0.size(0), self.harmonic_num + 1, 1)).to(F_mat.device)
        phase_vec[:, 0, :] = 0

        # generate sine waveforms
        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced should be similar to sine_amp
        # std = self.sine_amp/3 -> max value ~ self.sine_amp
        # . for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
class SourceModuleHnNSF(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0
    ):
        # NOTE(review): `upsample_scale` is accepted but never used in this class;
        # it appears to exist only so the signature matches SourceModuleHnNSF2 — confirm.
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch
        with torch.no_grad():
            # SineGen expects (B, 1, samples); transpose in and back out.
            sine_wavs, uv, _ = self.l_sin_gen(x.transpose(1, 2))
            sine_wavs = sine_wavs.transpose(1, 2)
            uv = uv.transpose(1, 2)
        # Learned mix of the harmonic channels into a single excitation signal.
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
class SineGen2(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(
        self,
        samp_rate,
        upsample_scale,
        harmonic_num=0,
        sine_amp=0.1,
        noise_std=0.003,
        voiced_threshold=0,
        flag_for_pulse=False,
    ):
        super(SineGen2, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # Output channel count: fundamental + harmonics.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        # Factor between the F0 frame rate fed to _f02sine and the audio sample rate.
        self.upsample_scale = upsample_scale

    def _f02uv(self, f0):
        """Return a voiced/unvoiced mask: 1.0 where f0 > voiced_threshold, else 0.0."""
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """f0_values: (batchsize, length, dim)
        where dim indicates fundamental tone and overtones
        """
        # convert to F0 in rad. The integer part n can be ignored
        # because 2 * np.pi * n doesn't affect phase
        rad_values = (f0_values / self.sampling_rate) % 1

        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
        if not self.flag_for_pulse:
            # Accumulate phase at the (lower) F0 frame rate, then interpolate the
            # phase back up to sample rate — avoids phase drift from interpolating
            # frequency directly.
            rad_values = torch.nn.functional.interpolate(
                rad_values.transpose(1, 2), scale_factor=1 / self.upsample_scale, mode="linear"
            ).transpose(1, 2)

            phase = torch.cumsum(rad_values, dim=1) * 2 * np.pi
            phase = torch.nn.functional.interpolate(
                phase.transpose(1, 2) * self.upsample_scale, scale_factor=self.upsample_scale, mode="linear"
            ).transpose(1, 2)
            sines = torch.sin(phase)
        else:
            # If necessary, make sure that the first time step of every
            # voiced segments is sin(pi) or cos(0)
            # This is used for pulse-train generation

            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # get the instantaneous phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch needs to be processed differently
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of i.phase within
                # each voiced segments
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # rad_values - tmp_cumsum: remove the accumulation of i.phase
            # within the previous voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            # get the sines
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
            f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        # fundamental component
        # NOTE(review): legacy torch.FloatTensor construction from a nested range —
        # verify this still builds the intended (1, 1, harmonic_num + 1) multiplier
        # on the installed torch version.
        fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))

        # generate sine waveforms
        sine_waves = self._f02sine(fn) * self.sine_amp

        # generate uv signal
        uv = self._f02uv(f0)

        # noise: for unvoiced should be similar to sine_amp
        # std = self.sine_amp/3 -> max value ~ self.sine_amp
        # . for voiced regions is self.noise_std
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)

        # first: set the unvoiced part to 0 by uv
        # then: additive noise
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
class SourceModuleHnNSF2(torch.nn.Module):
    """SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(
        self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0
    ):
        super(SourceModuleHnNSF2, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen2(sampling_rate, upsample_scale, harmonic_num, sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def remove_weight_norm(self):
        """No-op weight-norm removal hook.

        Fix: HiFTGenerator.remove_weight_norm() calls
        ``self.m_source.remove_weight_norm()``; without this method that call
        raises AttributeError. None of this module's layers (Linear, Tanh,
        SineGen2) use weight_norm, so there is nothing to strip here.
        """
        pass

    def forward(self, x):
        """
        Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length 1)
        """
        # source for harmonic branch (no gradients through the sine generator)
        with torch.no_grad():
            sine_wavs, uv, _ = self.l_sin_gen(x)
        # Learned mix of the harmonic channels into a single excitation signal.
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
class HiFTGenerator(nn.Module):
    """
    HiFTNet Generator: Neural Source Filter + ISTFTNet
    https://arxiv.org/abs/2309.09493

    Pipeline: mel features -> predicted F0 -> harmonic+noise source signal ->
    upsampling stack fused with the downsampled source STFT -> magnitude/phase
    heads -> inverse STFT -> waveform.
    """

    def __init__(
        self,
        in_channels: int = 80,
        base_channels: int = 512,
        nb_harmonics: int = 8,
        sampling_rate: int = 24000,
        nsf_alpha: float = 0.1,
        nsf_sigma: float = 0.003,
        nsf_voiced_threshold: float = 10,
        upsample_rates: list[int] = [8, 5, 3],
        upsample_kernel_sizes: list[int] = [16, 11, 7],
        istft_n_fft: int = 16,
        istft_hop_len: int = 4,
        resblock_kernel_sizes: list[int] = [3, 7, 11],
        resblock_dilation_sizes: list[list[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        source_resblock_kernel_sizes: list[int] = [7, 7, 11],
        source_resblock_dilation_sizes: list[list[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        lrelu_slope: float = 0.1,
        audio_limit: float = 0.99,
        f0_predictor_channels: int = 512,
    ):
        # NOTE(review): the list defaults above are mutable default arguments; they
        # are only read here, but confirm no caller mutates them.
        super(HiFTGenerator, self).__init__()

        self.out_channels = 1
        self.nb_harmonics = nb_harmonics
        self.sampling_rate = sampling_rate
        self.istft_n_fft = istft_n_fft
        self.istft_hop_len = istft_hop_len
        self.lrelu_slope = lrelu_slope
        self.audio_limit = audio_limit

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        # NSF source module: turns the sample-rate F0 track into a harmonic excitation.
        self.m_source = SourceModuleHnNSF2(
            sampling_rate=sampling_rate,
            upsample_scale=np.prod(upsample_rates) * istft_hop_len,
            harmonic_num=nb_harmonics,
            sine_amp=nsf_alpha,
            add_noise_std=nsf_sigma,
            voiced_threshod=nsf_voiced_threshold,
        )
        # F0 is per mel frame; total hop per frame is prod(upsample_rates) * istft_hop_len samples.
        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_hop_len)

        self.conv_pre = weight_norm(Conv1d(in_channels, base_channels, 7, 1, padding=3))

        # Up: transposed-conv upsampling stack; channel count halves at each stage.
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(
                        base_channels // (2**i), base_channels // (2 ** (i + 1)), k, u, padding=(k - u) // 2
                    )
                )
            )

        # Down: bring the source STFT (istft_n_fft + 2 channels = real + imag parts)
        # to each upsampling stage's time resolution and channel width.
        self.source_downs = nn.ModuleList()
        self.source_resblocks = nn.ModuleList()
        downsample_rates = [1] + upsample_rates[::-1][:-1]
        downsample_cum_rates = np.cumprod(downsample_rates)
        for i, (u, k, d) in enumerate(
            zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)
        ):
            if u == 1:
                # Same rate as the source STFT: 1x1 channel projection only.
                self.source_downs.append(Conv1d(istft_n_fft + 2, base_channels // (2 ** (i + 1)), 1, 1))
            else:
                # Strided conv downsamples by the cumulative factor u.
                self.source_downs.append(
                    Conv1d(istft_n_fft + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
                )

            self.source_resblocks.append(ResBlock(base_channels // (2 ** (i + 1)), k, d))

        # Multi-receptive-field resblocks: num_kernels parallel blocks per upsampling stage.
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = base_channels // (2 ** (i + 1))
            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(ResBlock(ch, k, d))

        # Final head: istft_n_fft + 2 channels = (n_fft/2 + 1) magnitudes + (n_fft/2 + 1) phases.
        self.conv_post = weight_norm(Conv1d(ch, istft_n_fft + 2, 7, 1, padding=3))

        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.reflection_pad = nn.ReflectionPad1d((1, 0))
        self.stft_window = torch.from_numpy(get_window("hann", istft_n_fft, fftbins=True).astype(np.float32))

        self.f0_predictor = ConvRNNF0Predictor(
            num_class=1, in_channels=in_channels, cond_channels=f0_predictor_channels
        )

    def remove_weight_norm(self):
        """Strip the weight_norm reparameterization from all layers (inference/export)."""
        for layer in self.ups:
            remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
        # NOTE(review): this expects m_source to expose remove_weight_norm();
        # verify SourceModuleHnNSF2 implements it, otherwise this raises AttributeError.
        self.m_source.remove_weight_norm()
        for layer in self.source_downs:
            remove_weight_norm(layer)
        for layer in self.source_resblocks:
            layer.remove_weight_norm()

    def _stft(self, x):
        """STFT of a (B, samples) signal; returns (real, imag), each (B, F, TT)."""
        spec = torch.stft(
            x,
            self.istft_n_fft,
            self.istft_hop_len,
            self.istft_n_fft,
            window=self.stft_window.to(x.device),
            return_complex=True,
        )
        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
        return spec[..., 0], spec[..., 1]

    def _istft(self, magnitude, phase):
        """Inverse STFT from (B, F, TT) magnitude and phase; returns (B, samples)."""
        # Clip magnitude to avoid overflow from the exp() in decode().
        magnitude = torch.clip(magnitude, max=1e2)
        real = magnitude * torch.cos(phase)
        img = magnitude * torch.sin(phase)
        inverse_transform = torch.istft(
            torch.complex(real, img),
            self.istft_n_fft,
            self.istft_hop_len,
            self.istft_n_fft,
            window=self.stft_window.to(magnitude.device),
        )
        return inverse_transform

    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
        """Decode mel features x (B, in_channels, T) and source signal s (B, 1, samples) to a waveform.

        NOTE(review): the tensor default for `s` is evaluated once at import time
        (an empty CPU source); callers normally pass the NSF source explicitly —
        confirm the default path (empty source, CPU device) is ever intended.
        """
        # Source signal in the STFT domain, real and imaginary parts stacked on channels.
        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)

        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, self.lrelu_slope)
            x = self.ups[i](x)

            if i == self.num_upsamples - 1:
                # Pad one frame on the left so lengths line up with the source STFT.
                x = self.reflection_pad(x)

            # fusion: add the downsampled source branch into the main branch.
            si = self.source_downs[i](s_stft)
            si = self.source_resblocks[i](si)
            x = x + si

            # Average of the parallel multi-receptive-field resblocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        # First half of the channels -> log-magnitude, second half -> phase.
        magnitude = torch.exp(x[:, : self.istft_n_fft // 2 + 1, :])
        phase = torch.sin(x[:, self.istft_n_fft // 2 + 1 :, :])  # actually, sin is redundancy

        x = self._istft(magnitude, phase)
        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
        return x

    def forward(self, speech_feat: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Training-mode synthesis.

        Args:
            speech_feat: (B, T, in_channels) mel features.

        Returns:
            (generated_speech (B, samples), f0 (B, T)).
        """
        speech_feat = speech_feat.transpose(1, 2)
        # mel->f0
        f0 = self.f0_predictor(speech_feat)
        # f0->source
        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        s, _, _ = self.m_source(s)
        s = s.transpose(1, 2)
        # mel+source->speech
        generated_speech = self.decode(x=speech_feat, s=s)
        return generated_speech, f0

    @torch.inference_mode()
    def inference(self, speech_feat: torch.Tensor) -> torch.Tensor:
        """Inference-mode synthesis from (B, in_channels, T) mel features; returns the waveform only."""
        # mel->f0
        f0 = self.f0_predictor(speech_feat)
        # f0->source
        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        s, _, _ = self.m_source(s)
        s = s.transpose(1, 2)
        generated_speech = self.decode(x=speech_feat, s=s)
        return generated_speech

    def load_weights(self, weights_path: str):
        """Load a checkpoint, stripping a leading 'generator.' prefix from every key."""
        checkpoint = torch.load(weights_path, map_location="cpu")
        state_dict = {k.replace("generator.", ""): v for k, v in checkpoint.items()}
        self.load_state_dict(state_dict, strict=True)
|
src/kanade_tokenizer/module/postnet.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from: https://github.com/ming024/FastSpeech2
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def get_padding(kernel_size: int, dilation: int = 1):
    """Return the padding that keeps a dilated 1D convolution length-preserving."""
    effective_span = (kernel_size - 1) * dilation
    return effective_span // 2
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Norm(nn.Module):
    """LayerNorm over the channel axis for (batch, channels, sequence_length) tensors."""

    def __init__(self, channels: int):
        super().__init__()
        self.norm = nn.LayerNorm(channels)

    def forward(self, x):
        # nn.LayerNorm normalizes the trailing dimension, so move channels last,
        # normalize, then restore the original layout.
        channels_last = x.transpose(1, 2)
        normalized = self.norm(channels_last)
        return normalized.transpose(1, 2)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class PostNet(nn.Module):
    """Residual convolutional post-filter (FastSpeech2-style).

    Refines an input feature map with a stack of Conv1d + norm layers (tanh on
    all but the last), applies dropout after every layer, and adds the result
    back onto the input.
    """

    def __init__(
        self,
        input_channels: int = 100,
        channels: int = 512,
        kernel_size: int = 5,
        num_layers: int = 5,
        dropout: float = 0.5,
        use_layer_norm: bool = False,
    ):
        super().__init__()

        pad = get_padding(kernel_size)

        def conv_block(in_ch, out_ch):
            # One conv layer followed by the configured normalization.
            return nn.Sequential(
                nn.Conv1d(in_ch, out_ch, kernel_size=kernel_size, padding=pad),
                Norm(out_ch) if use_layer_norm else nn.BatchNorm1d(out_ch),
            )

        blocks = [conv_block(input_channels, channels)]
        blocks.extend(conv_block(channels, channels) for _ in range(1, num_layers - 1))
        blocks.append(conv_block(channels, input_channels))

        self.convolutions = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x

        *hidden, last = self.convolutions
        for block in hidden:
            # tanh activation on every layer except the final projection.
            x = self.dropout(torch.tanh(block(x)))

        x = self.dropout(last(x))

        return x + residual
|
src/kanade_tokenizer/module/ssl_extractor.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torchaudio
|
| 4 |
+
import torchaudio.pipelines as pipelines
|
| 5 |
+
from torchaudio.models.wav2vec2 import Wav2Vec2Model
|
| 6 |
+
from torchaudio.models.wav2vec2.components import ConvLayerBlock
|
| 7 |
+
|
| 8 |
+
from ..util import get_logger
|
| 9 |
+
|
| 10 |
+
logger = get_logger()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Map of friendly names to torchaudio pipeline bundles.
# Keys are the accepted `model_name` values for SSLFeatureExtractor.
MODEL_REGISTRY = {
    "wav2vec2_base": pipelines.WAV2VEC2_BASE,
    "wav2vec2_large": pipelines.WAV2VEC2_LARGE,
    "wav2vec2_large_lv60k": pipelines.WAV2VEC2_LARGE_LV60K,
    "hubert_base": pipelines.HUBERT_BASE,
    "hubert_large": pipelines.HUBERT_LARGE,
    "hubert_xlarge": pipelines.HUBERT_XLARGE,
    "wavlm_base": pipelines.WAVLM_BASE,
    "wavlm_base_plus": pipelines.WAVLM_BASE_PLUS,
    "wavlm_large": pipelines.WAVLM_LARGE,
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class SSLFeatureExtractor(nn.Module):
    """Wrapper around torchaudio SSL bundles (wav2vec2 / HuBERT / WavLM) for feature extraction."""

    def __init__(self, model_name: str = "wavlm_base_plus", output_layer: int | None = None, sample_rate: int = 16000):
        """
        Args:
            model_name: Name of the SSL model to use
            output_layer: Which layer's features to extract (None for last layer), 1-based indexing
            sample_rate: Sample rate of input audio
        """
        super().__init__()
        # -1 is later forwarded as `num_layers` to Wav2Vec2Model.extract_features.
        # NOTE(review): torchaudio documents num_layers=None (not -1) as "all layers";
        # confirm that -1 selects the intended layers here.
        self.output_layer = output_layer if output_layer is not None else -1

        if model_name not in MODEL_REGISTRY:
            raise ValueError(f"Unknown model: {model_name}. Available models: {list(MODEL_REGISTRY.keys())}")
        bundle = MODEL_REGISTRY[model_name]
        self.model: Wav2Vec2Model = bundle.get_model()
        self.model.eval()
        # Encoder embedding dimension = feature dim of every returned layer.
        # NOTE(review): reads the bundle's private `_params` — may break across torchaudio versions.
        self.feature_dim: int = bundle._params["encoder_embed_dim"]

        self.ssl_sample_rate = bundle.sample_rate
        # Create resampler if needed
        if sample_rate != self.ssl_sample_rate:
            logger.debug(f"Resampling from {sample_rate} to {self.ssl_sample_rate} required by {model_name}.")
            self.resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.ssl_sample_rate)
        else:
            self.resampler = None

    @property
    def hop_size(self) -> int:
        """Get the hop size of the model's convolutional layers."""
        # Product of all conv strides = waveform samples per output frame.
        hop_size = 1
        for _, stride in self.conv_config:
            hop_size *= stride
        return hop_size

    @property
    def conv_config(self) -> list[tuple[int, int]]:
        """Get the configuration of the convolutional layers in the model."""
        conv_layers = []
        for layer in self.model.feature_extractor.conv_layers:
            layer: ConvLayerBlock
            conv_layers.append((layer.kernel_size, layer.stride))
        return conv_layers

    def get_minimum_input_length(self, desired_output_length: int) -> int:
        """Calculate the minimum input length required to produce a given output length."""
        # Invert each conv layer's output-length formula, last layer back to the input.
        length = desired_output_length
        for kernel_size, stride in reversed(self.conv_config):
            length = (length - 1) * stride + kernel_size
        return length

    @torch.no_grad()
    def forward(
        self,
        waveform: torch.Tensor,
        lengths: torch.Tensor | None = None,
        num_layers: int | None = None,
        return_lengths: bool = False,
    ) -> "list[torch.Tensor] | tuple[list[torch.Tensor], torch.Tensor]":
        """
        Args:
            waveform: (batch_size, num_samples)
            lengths: Optional tensor of sequence lengths for each batch item (used for attention masking)
            num_layers: Optional override for how many encoder layers to return (falls back to output_layer)
            return_lengths: If True, also return per-item frame lengths

        Returns:
            features: List of feature tensors for each layer (batch_size, frame, dim)
            lengths: Sequence lengths for each batch item (only when return_lengths is True)
        """
        if waveform.dim() == 1:
            # Promote a single unbatched waveform to batch size 1.
            waveform = waveform.unsqueeze(0)
        # Resample if needed
        if self.resampler is not None:
            waveform = self.resampler(waveform)

        features, feature_lengths = self.model.extract_features(
            waveform, lengths, num_layers=num_layers or self.output_layer
        )

        if return_lengths:
            return features, feature_lengths
        return features
|
src/kanade_tokenizer/module/transformer.py
ADDED
|
@@ -0,0 +1,549 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Adapted from https://github.com/meta-llama/llama3/blob/main/llama/model.py
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
# This software may be used and distributed in accordance with the terms of the Llama 3 Community License Agreement.
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from torch import nn
|
| 9 |
+
|
| 10 |
+
from ..util import get_logger
|
| 11 |
+
from .adaln_zero import AdaLNZero
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
logger = get_logger()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
try:
|
| 18 |
+
from flash_attn import flash_attn_func, flash_attn_with_kvcache
|
| 19 |
+
|
| 20 |
+
FLASH_ATTN_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
FLASH_ATTN_AVAILABLE = False
|
| 23 |
+
logger.warning(
|
| 24 |
+
"FlashAttention is not installed. Falling back to PyTorch SDPA implementation. There is no warranty that the model will work correctly."
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Precompute the complex rotary-embedding table, shape (end, dim // 2), dtype complex64."""
    half_dim = dim // 2
    exponents = torch.arange(0, dim, 2)[:half_dim].float() / dim
    inv_freq = 1.0 / (theta**exponents)
    positions = torch.arange(end, device=inv_freq.device, dtype=torch.float32)
    angles = torch.outer(positions, inv_freq)
    # Unit-magnitude complex numbers e^{i*angle}.
    return torch.polar(torch.ones_like(angles), angles)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """Return a view of freqs_cis (seq_len, head_dim) that broadcasts against x.

    x must have its sequence axis at dim 1 and head_dim last; every other axis
    of the returned view is a singleton.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    target_shape = [size if axis in (1, ndim - 1) else 1 for axis, size in enumerate(x.shape)]
    return freqs_cis.view(*target_shape)


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Apply rotary positional embedding: rotate pairs of x by the phases in freqs_cis."""
    # Pair up the last dimension into (real, imag) and view as complex numbers.
    as_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    rotation = reshape_for_broadcast(freqs_cis, as_complex)
    rotated = torch.view_as_real(as_complex * rotation).flatten(3)
    return rotated.type_as(x)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class Attention(nn.Module):
|
| 52 |
+
def __init__(
|
| 53 |
+
self,
|
| 54 |
+
dim: int,
|
| 55 |
+
n_heads: int,
|
| 56 |
+
dropout: float,
|
| 57 |
+
window_size: int | None,
|
| 58 |
+
qkv_bias: bool = False,
|
| 59 |
+
proj_bias: bool = False,
|
| 60 |
+
use_flash_attention: bool = False,
|
| 61 |
+
causal: bool = False,
|
| 62 |
+
):
|
| 63 |
+
super().__init__()
|
| 64 |
+
self.n_heads = n_heads
|
| 65 |
+
self.head_dim = dim // n_heads
|
| 66 |
+
|
| 67 |
+
self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
|
| 68 |
+
self.wk = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
|
| 69 |
+
self.wv = nn.Linear(dim, n_heads * self.head_dim, bias=qkv_bias)
|
| 70 |
+
self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=proj_bias)
|
| 71 |
+
|
| 72 |
+
self.scale = self.head_dim**-0.5
|
| 73 |
+
self.dropout = dropout
|
| 74 |
+
|
| 75 |
+
# Enable local attention if window_size is specified
|
| 76 |
+
self.use_local_attention = window_size is not None
|
| 77 |
+
if self.use_local_attention:
|
| 78 |
+
assert window_size % 2 == 1, "Window size must be odd for local attention."
|
| 79 |
+
self.window_per_side = window_size // 2
|
| 80 |
+
|
| 81 |
+
self.use_flash_attention = use_flash_attention
|
| 82 |
+
|
| 83 |
+
self.causal = causal
|
| 84 |
+
|
| 85 |
+
def create_mask(
|
| 86 |
+
self, bsz: int, seqlen: int, mask: torch.Tensor | None, device: torch.device
|
| 87 |
+
) -> torch.Tensor | None:
|
| 88 |
+
"""Create attention mask combining provided mask and local attention constraints"""
|
| 89 |
+
if not self.use_local_attention and mask is None:
|
| 90 |
+
return None
|
| 91 |
+
|
| 92 |
+
# Start with all positions allowed
|
| 93 |
+
attn_mask = torch.ones((seqlen, seqlen), dtype=torch.bool, device=device)
|
| 94 |
+
|
| 95 |
+
if self.causal:
|
| 96 |
+
# Causal mask: no future positions allowed
|
| 97 |
+
attn_mask = torch.tril(attn_mask)
|
| 98 |
+
|
| 99 |
+
# Apply local attention constraints
|
| 100 |
+
if self.use_local_attention:
|
| 101 |
+
attn_mask = torch.triu(attn_mask, diagonal=-self.window_per_side)
|
| 102 |
+
attn_mask = torch.tril(attn_mask, diagonal=self.window_per_side)
|
| 103 |
+
|
| 104 |
+
# Expand mask to batch size
|
| 105 |
+
attn_mask = attn_mask.unsqueeze(0).expand(bsz, -1, -1)
|
| 106 |
+
|
| 107 |
+
# Apply global mask if provided
|
| 108 |
+
if mask is not None:
|
| 109 |
+
assert mask.shape[-1] == seqlen and mask.shape[-2] == seqlen, (
|
| 110 |
+
"Mask must be square and match sequence length."
|
| 111 |
+
)
|
| 112 |
+
# Ensure mask has correct batch dimensions
|
| 113 |
+
if mask.dim() == 2:
|
| 114 |
+
mask = mask.unsqueeze(0).expand(bsz, -1, -1)
|
| 115 |
+
attn_mask = attn_mask & mask
|
| 116 |
+
|
| 117 |
+
# Expand to head dimension
|
| 118 |
+
attn_mask = attn_mask.unsqueeze(1).expand(-1, self.n_heads, -1, -1)
|
| 119 |
+
return attn_mask
|
| 120 |
+
|
| 121 |
+
def forward(
|
| 122 |
+
self,
|
| 123 |
+
x: torch.Tensor,
|
| 124 |
+
freqs_cis: torch.Tensor | None,
|
| 125 |
+
mask: torch.Tensor | None,
|
| 126 |
+
return_kv: bool = False,
|
| 127 |
+
) -> torch.Tensor | tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
| 128 |
+
"""Forward pass for multi-head attention.
|
| 129 |
+
Args:
|
| 130 |
+
x (torch.Tensor): Input tensor of shape (bsz, seqlen, dim).
|
| 131 |
+
freqs_cis (torch.Tensor, optional): Precomputed rotary frequencies.
|
| 132 |
+
mask (torch.Tensor, optional): Attention mask.
|
| 133 |
+
return_kv (bool): Whether to return KV pairs for caching.
|
| 134 |
+
Returns:
|
| 135 |
+
output (torch.Tensor): Output tensor of shape (bsz, seqlen, dim).
|
| 136 |
+
new_kv (tuple, optional): KV pairs if return_kv is True.
|
| 137 |
+
"""
|
| 138 |
+
bsz, seqlen, _ = x.shape
|
| 139 |
+
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
|
| 140 |
+
|
| 141 |
+
xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 142 |
+
xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 143 |
+
xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 144 |
+
|
| 145 |
+
# Apply rotary embeddings if provided
|
| 146 |
+
if freqs_cis is not None:
|
| 147 |
+
xq = apply_rotary_emb(xq, freqs_cis=freqs_cis[:seqlen])
|
| 148 |
+
xk = apply_rotary_emb(xk, freqs_cis=freqs_cis[:seqlen])
|
| 149 |
+
|
| 150 |
+
if self.use_flash_attention and FLASH_ATTN_AVAILABLE:
|
| 151 |
+
assert mask is None, "Flash attention does not support arbitrary masking."
|
| 152 |
+
|
| 153 |
+
# Flash Attention
|
| 154 |
+
window_size = (self.window_per_side, self.window_per_side) if self.use_local_attention else (-1, -1)
|
| 155 |
+
output = flash_attn_func(
|
| 156 |
+
xq, # (bsz, seqlen, n_heads, head_dim)
|
| 157 |
+
xk, # (bsz, seqlen, n_heads, head_dim)
|
| 158 |
+
xv, # (bsz, seqlen, n_heads, head_dim)
|
| 159 |
+
dropout_p=(self.dropout if self.training else 0.0),
|
| 160 |
+
softmax_scale=self.scale,
|
| 161 |
+
window_size=window_size,
|
| 162 |
+
causal=self.causal,
|
| 163 |
+
) # (bsz, seqlen, n_heads, head_dim)
|
| 164 |
+
|
| 165 |
+
else:
|
| 166 |
+
attn_mask = self.create_mask(bsz, seqlen, mask, x.device)
|
| 167 |
+
|
| 168 |
+
# SDPA Attention
|
| 169 |
+
output = F.scaled_dot_product_attention(
|
| 170 |
+
xq.transpose(1, 2), # (bsz, n_heads, seqlen, head_dim)
|
| 171 |
+
xk.transpose(1, 2), # (bsz, n_heads, seqlen, head_dim)
|
| 172 |
+
xv.transpose(1, 2), # (bsz, n_heads, seqlen, head_dim)
|
| 173 |
+
attn_mask=attn_mask, # (bsz, n_heads, seqlen, seqlen) boolean mask
|
| 174 |
+
dropout_p=self.dropout,
|
| 175 |
+
scale=self.scale,
|
| 176 |
+
).transpose(1, 2) # (bsz, seqlen, n_heads, head_dim)
|
| 177 |
+
|
| 178 |
+
output = output.contiguous().view(bsz, seqlen, -1)
|
| 179 |
+
output = self.wo(output)
|
| 180 |
+
|
| 181 |
+
if return_kv:
|
| 182 |
+
return output, (xk, xv)
|
| 183 |
+
return output
|
| 184 |
+
|
| 185 |
+
def forward_with_cache(
|
| 186 |
+
self,
|
| 187 |
+
x: torch.Tensor,
|
| 188 |
+
kv_cache: tuple[torch.Tensor, torch.Tensor],
|
| 189 |
+
freqs_cis: torch.Tensor,
|
| 190 |
+
start_pos: int,
|
| 191 |
+
) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
|
| 192 |
+
"""
|
| 193 |
+
Forward pass with KV cache for efficient inference. Only used for inference.
|
| 194 |
+
|
| 195 |
+
Args:
|
| 196 |
+
x (torch.Tensor): Input tensor for the current step. Shape: (bsz, 1, dim)
|
| 197 |
+
kv_cache: A tuple of (key_cache, value_cache) from previous steps.
|
| 198 |
+
start_pos (int): The starting position of the new token in the sequence.
|
| 199 |
+
freqs_cis (torch.Tensor): Precomputed rotary frequencies.
|
| 200 |
+
|
| 201 |
+
Returns:
|
| 202 |
+
output (torch.Tensor): Output tensor after attention. Shape: (bsz, 1, dim)
|
| 203 |
+
new_kv (tuple): Updated KV cache including the new key and value.
|
| 204 |
+
"""
|
| 205 |
+
bsz, seqlen, _ = x.shape
|
| 206 |
+
assert seqlen == 1, "KV cache method is designed for single-token generation."
|
| 207 |
+
|
| 208 |
+
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
|
| 209 |
+
|
| 210 |
+
xq = xq.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 211 |
+
xk = xk.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 212 |
+
xv = xv.view(bsz, seqlen, self.n_heads, self.head_dim)
|
| 213 |
+
|
| 214 |
+
# Apply rotary embeddings using the correct positional slice
|
| 215 |
+
xq = apply_rotary_emb(xq, freqs_cis=freqs_cis[start_pos : start_pos + seqlen])
|
| 216 |
+
xk = apply_rotary_emb(xk, freqs_cis=freqs_cis[start_pos : start_pos + seqlen])
|
| 217 |
+
|
| 218 |
+
# Update the KV cache
|
| 219 |
+
k_cache, v_cache = kv_cache
|
| 220 |
+
new_kv = (xk, xv)
|
| 221 |
+
xk = torch.cat([k_cache, xk], dim=1)
|
| 222 |
+
xv = torch.cat([v_cache, xv], dim=1)
|
| 223 |
+
|
| 224 |
+
# For single token generation, causal mask is implicitly handled.
|
| 225 |
+
# We attend to all keys (prefix + previous tokens).
|
| 226 |
+
if self.use_flash_attention and FLASH_ATTN_AVAILABLE:
|
| 227 |
+
# Flash Attention
|
| 228 |
+
output = flash_attn_with_kvcache(
|
| 229 |
+
xq, # (bsz, 1, n_heads, head_dim)
|
| 230 |
+
xk, # (bsz, 1+kv_len, n_heads, head_dim)
|
| 231 |
+
xv, # (bsz, 1+kv_len, n_heads, head_dim)
|
| 232 |
+
softmax_scale=self.scale,
|
| 233 |
+
) # (bsz, 1, n_heads, head_dim)
|
| 234 |
+
else:
|
| 235 |
+
# SDPA Attention
|
| 236 |
+
output = F.scaled_dot_product_attention(
|
| 237 |
+
xq.transpose(1, 2), # (bsz, n_heads, 1, head_dim)
|
| 238 |
+
xk.transpose(1, 2), # (bsz, n_heads, 1+kv_len, head_dim)
|
| 239 |
+
xv.transpose(1, 2), # (bsz, n_heads, 1+kv_len, head_dim)
|
| 240 |
+
scale=self.scale,
|
| 241 |
+
).transpose(1, 2) # (bsz, 1, n_heads, head_dim)
|
| 242 |
+
|
| 243 |
+
output = output.contiguous().view(bsz, seqlen, -1)
|
| 244 |
+
return self.wo(output), new_kv
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class FeedForward(nn.Module):
|
| 248 |
+
def __init__(
|
| 249 |
+
self,
|
| 250 |
+
dim: int,
|
| 251 |
+
hidden_dim: int,
|
| 252 |
+
multiple_of: int,
|
| 253 |
+
ffn_dim_multiplier: float | None,
|
| 254 |
+
):
|
| 255 |
+
super().__init__()
|
| 256 |
+
hidden_dim = int(2 * hidden_dim / 3)
|
| 257 |
+
# custom dim factor multiplier
|
| 258 |
+
if ffn_dim_multiplier is not None:
|
| 259 |
+
hidden_dim = int(ffn_dim_multiplier * hidden_dim)
|
| 260 |
+
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
| 261 |
+
|
| 262 |
+
self.w1 = nn.Linear(dim, hidden_dim, bias=False)
|
| 263 |
+
self.w2 = nn.Linear(hidden_dim, dim, bias=False)
|
| 264 |
+
self.w3 = nn.Linear(dim, hidden_dim, bias=False)
|
| 265 |
+
|
| 266 |
+
def forward(self, x):
|
| 267 |
+
return self.w2(F.silu(self.w1(x)) * self.w3(x))
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
class TransformerBlock(nn.Module):
    """A single pre-norm Transformer block: attention + SwiGLU feed-forward.

    Normalization is either plain LayerNorm or AdaLNZero (adaptive layer norm
    conditioned on an external embedding, with a learned output gate), selected
    by `use_adaln_zero`. Supports KV-cached single-token decoding via the
    underlying Attention module.
    """

    def __init__(
        self,
        dim: int,
        n_heads: int,
        qkv_bias: bool,
        proj_bias: bool,
        window_size: int | None,
        multiple_of: int,
        ffn_dim_multiplier: float | None,
        dropout: float,
        norm_eps: float,
        adanorm_condition_dim: int | None = None,
        use_flash_attention: bool = False,
        use_adaln_zero: bool = False,
        causal: bool = False,
    ):
        """Build the attention, feed-forward, and normalization submodules.

        Args:
            dim: Model dimension.
            n_heads: Number of attention heads.
            qkv_bias: Bias on Q/K/V projections.
            proj_bias: Bias on attention output projection.
            window_size: Sliding-window width for local attention (odd), or None.
            multiple_of: Round the FFN hidden size up to a multiple of this.
            ffn_dim_multiplier: Optional scale factor for the FFN hidden size.
            dropout: Attention dropout probability.
            norm_eps: Epsilon for normalization layers.
            adanorm_condition_dim: Condition embedding dim; required when use_adaln_zero.
            use_flash_attention: Use flash_attn kernels when available.
            use_adaln_zero: Use AdaLNZero (conditioned norm + gate) instead of LayerNorm.
            causal: Causal attention masking.
        """
        super().__init__()
        self.attention = Attention(
            dim=dim,
            n_heads=n_heads,
            dropout=dropout,
            window_size=window_size,
            use_flash_attention=use_flash_attention,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            causal=causal,
        )

        self.feed_forward = FeedForward(
            dim=dim,
            hidden_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        # Choose between AdaLNZero and regular LayerNorm
        self.use_adaln_zero = use_adaln_zero
        if self.use_adaln_zero:
            assert adanorm_condition_dim is not None, "condition_dim must be provided when using AdaLNZero"
            self.attention_norm = AdaLNZero(dim, adanorm_condition_dim, eps=norm_eps, return_gate=True)
            self.ffn_norm = AdaLNZero(dim, adanorm_condition_dim, eps=norm_eps, return_gate=True)
        else:
            self.attention_norm = nn.LayerNorm(dim, eps=norm_eps)
            self.ffn_norm = nn.LayerNorm(dim, eps=norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        freqs_cis: torch.Tensor | None,
        mask: torch.Tensor | None,
        condition: torch.Tensor | None = None,
        return_kv: bool = False,
        kv_cache: tuple[torch.Tensor, torch.Tensor] | None = None,
        start_pos: int | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        """
        Forward pass for a single Transformer block.
        Args:
            x (torch.Tensor): Input tensor of shape (bsz, seqlen, dim).
            freqs_cis (torch.Tensor, optional): Precomputed rotary frequencies.
            mask (torch.Tensor, optional): Attention mask.
            condition (torch.Tensor, optional): Conditioning tensor for AdaLNZero.
            return_kv (bool): Whether to return KV pairs for caching.
            kv_cache (tuple, optional): KV cache for efficient inference.
            start_pos (int, optional): Starting position for KV cache.
        Returns:
            out (torch.Tensor): Output tensor of shape (bsz, seqlen, dim).
            new_kv (tuple, optional): New KV pairs if return_kv is True or kv_cache is provided.
        """
        # Apply normalization (pre-norm); AdaLNZero also returns a residual gate
        if self.use_adaln_zero:
            assert condition is not None, "condition must be provided when using AdaLNZero"
            attn_normed, attn_gate = self.attention_norm(x, condition=condition)
        else:
            attn_normed = self.attention_norm(x)

        # Forward attention with KV cache if provided
        new_kv = None
        if kv_cache is not None and start_pos is not None:
            # Use KV cache for efficient inference
            attn_out, new_kv = self.attention.forward_with_cache(attn_normed, kv_cache, freqs_cis, start_pos)
        elif return_kv:
            # Return KV pairs for caching
            attn_out, new_kv = self.attention(attn_normed, freqs_cis, mask, return_kv=True)
        else:
            attn_out = self.attention(attn_normed, freqs_cis, mask)

        # Apply gating for attention if using AdaLNZero
        if self.use_adaln_zero:
            h = x + attn_gate * attn_out  # residual + gate * x
        else:
            h = x + attn_out

        # Apply normalization for feedforward
        if self.use_adaln_zero:
            ffn_normed, ffn_gate = self.ffn_norm(h, condition=condition)
        else:
            ffn_normed = self.ffn_norm(h)

        ffn_out = self.feed_forward(ffn_normed)

        # Apply gating for feedforward if using AdaLNZero
        if self.use_adaln_zero:
            out = h + ffn_gate * ffn_out  # residual + gate * x
        else:
            out = h + ffn_out

        # If using KV cache, return the new KV pairs
        if new_kv is not None:
            return out, new_kv
        return out
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
class Transformer(nn.Module):
    """Stack of TransformerBlocks with optional RoPE, I/O projections,
    AdaLNZero conditioning, and a per-layer KV-cache path for decoding.

    NOTE(review): `freqs_cis` is a plain attribute (not a registered buffer),
    is moved to the input device lazily in `forward`, and is re-allocated when
    the requested length exceeds the precomputed table — so `forward` mutates
    module state. Presumably intentional to avoid checkpointing the table;
    confirm before relying on state_dict round-trips including it.
    """

    def __init__(
        self,
        dim: int = 4096,
        n_layers: int = 32,
        n_heads: int = 32,
        qkv_bias: bool = False,
        proj_bias: bool = False,
        window_size: int | None = None,
        multiple_of: int = 256,
        ffn_dim_multiplier: float | None = None,
        dropout: float = 0.1,
        norm_eps: float = 1e-5,
        use_rope: bool = True,
        rope_theta: float = 500000.0,
        max_seq_len: int = 2048,
        input_dim: int | None = None,
        output_dim: int | None = None,
        adanorm_condition_dim: int | None = None,
        use_flash_attention: bool = False,
        use_adaln_zero: bool = False,
        use_xavier_init: bool = True,
        causal: bool = False,
    ):
        """Build the layer stack, final norm, I/O projections, and RoPE table.

        Args:
            dim: Model dimension.
            n_layers: Number of TransformerBlocks.
            n_heads: Attention heads per block.
            qkv_bias / proj_bias: Bias flags for attention projections.
            window_size: Local-attention window (odd) or None for full attention.
            multiple_of / ffn_dim_multiplier: FFN hidden-size shaping (see FeedForward).
            dropout: Attention dropout probability.
            norm_eps: Epsilon for normalization layers.
            use_rope: Precompute rotary embeddings for 2 * max_seq_len positions.
            rope_theta: RoPE base frequency.
            max_seq_len: Expected maximum sequence length for RoPE precomputation.
            input_dim / output_dim: Optional linear projections at the boundaries
                (Identity when None).
            adanorm_condition_dim: Condition dim; required when use_adaln_zero.
            use_flash_attention: Use flash_attn kernels when available.
            use_adaln_zero: Use AdaLNZero conditioning in every block and final norm.
            use_xavier_init: Xavier-init all Linears, then zero AdaLNZero projections.
            causal: Causal attention masking.
        """
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.rope_theta = rope_theta
        self.use_adaln_zero = use_adaln_zero

        self.layers = nn.ModuleList()
        for layer_id in range(n_layers):
            self.layers.append(
                TransformerBlock(
                    dim=dim,
                    n_heads=n_heads,
                    window_size=window_size,
                    multiple_of=multiple_of,
                    ffn_dim_multiplier=ffn_dim_multiplier,
                    dropout=dropout,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    norm_eps=norm_eps,
                    adanorm_condition_dim=adanorm_condition_dim,
                    use_flash_attention=use_flash_attention,
                    use_adaln_zero=use_adaln_zero,
                    causal=causal,
                )
            )

        # Choose between AdaLNZero (without gate) and regular LayerNorm for final norm
        if self.use_adaln_zero:
            assert adanorm_condition_dim is not None, "condition_dim must be provided when using AdaLNZero"
            self.norm = AdaLNZero(dim, adanorm_condition_dim, eps=norm_eps, return_gate=False)
        else:
            self.norm = nn.LayerNorm(dim, eps=norm_eps)
        self.input_proj = nn.Linear(input_dim, dim) if input_dim is not None else nn.Identity()
        self.output_proj = nn.Linear(dim, output_dim) if output_dim is not None else nn.Identity()
        self.output_dim_ = output_dim if output_dim is not None else dim

        if use_rope:
            self.freqs_cis = precompute_freqs_cis(dim // n_heads, max_seq_len * 2, rope_theta)
            logger.debug(
                f"Using RoPE with theta={rope_theta}, max_seq_len={max_seq_len}, "
                f"dim={dim}, n_heads={n_heads}, freqs_cis shape={self.freqs_cis.shape}"
            )
        else:
            self.freqs_cis = None

        if window_size is not None:
            logger.debug(f"Using local attention with window size {window_size}")

        if self.use_adaln_zero:
            logger.debug(f"Using AdaLNZero conditioning with condition_dim={adanorm_condition_dim}")

        if use_flash_attention:
            logger.debug("Using Flash Attention for memory-efficient attention computation")

        if use_xavier_init:
            logger.debug("Using Xavier initialization for linear layers")
            self.apply(self._init_weights)
            # Runs after Xavier init so AdaLNZero projections end up zeroed
            self.apply(self._init_adaln_zero)

    @property
    def output_dim(self) -> int:
        # Effective output width: `output_dim` if a projection exists, else `dim`.
        return self.output_dim_

    def _init_weights(self, module: nn.Module):
        # Xavier-normal for all Linear weights; zero biases.
        if isinstance(module, nn.Linear):
            nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def _init_adaln_zero(self, module: nn.Module):
        # Zero the AdaLNZero condition projection so conditioning starts as a no-op
        # (the "zero" in AdaLN-Zero).
        if isinstance(module, AdaLNZero):
            # Initialize condition projection weights to zero
            nn.init.zeros_(module.condition_proj[1].weight)
            nn.init.zeros_(module.condition_proj[1].bias)

    def forward(
        self,
        x: torch.Tensor,
        mask: torch.Tensor | None = None,
        condition: torch.Tensor | None = None,
        return_kv: bool = False,
        kv_cache: list[tuple[torch.Tensor, torch.Tensor]] | None = None,
        start_pos: int | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, list[tuple[torch.Tensor, torch.Tensor]]]:
        """
        Forward pass for the Transformer model.
        Args:
            x (torch.Tensor): Input tensor of shape (bsz, seqlen, input_dim).
            mask (torch.Tensor, optional): Attention mask.
            condition (torch.Tensor, optional): Conditioning tensor for AdaLNZero.
            return_kv (bool): Whether to return KV pairs for caching.
            kv_cache (list, optional): List of KV caches for each layer for efficient inference.
            start_pos (int, optional): Starting position for KV cache.
        Returns:
            output (torch.Tensor): Output tensor of shape (bsz, seqlen, output_dim).
            new_kv_list (list, optional): List of new KV pairs for each layer if return_kv is True or kv_cache is provided.
        """
        bsz, seqlen, _dim = x.shape

        if self.use_adaln_zero:
            assert condition is not None, "condition must be provided when using AdaLNZero"

        # Rotary embeddings
        if self.freqs_cis is not None:
            # Recompute freqs_cis if the sequence length or starting position exceeds the precomputed length
            expected_len = (start_pos + 1) if start_pos is not None else seqlen
            if expected_len > self.freqs_cis.shape[0]:
                logger.warning(
                    f"Input sequence length {expected_len} exceeds precomputed RoPE length {self.freqs_cis.shape[0]}. Recomputing freqs_cis."
                )
                # Over-allocate (4x) to avoid recomputing on every subsequent step
                self.freqs_cis = precompute_freqs_cis(self.dim // self.n_heads, expected_len * 4, self.rope_theta)

            self.freqs_cis = self.freqs_cis.to(x.device)
            freqs_cis = self.freqs_cis
        else:
            freqs_cis = None

        x = self.input_proj(x)
        new_kv_list = []
        for i, layer in enumerate(self.layers):
            # Collect KV cache if provided
            if kv_cache is not None and start_pos is not None:
                x, new_kv = layer(x, freqs_cis, mask, condition, kv_cache=kv_cache[i], start_pos=start_pos)
                new_kv_list.append(new_kv)
            elif return_kv:
                x, new_kv = layer(x, freqs_cis, mask, condition, return_kv=True)
                new_kv_list.append(new_kv)
            else:
                x = layer(x, freqs_cis, mask, condition)

        # Apply final normalization
        if self.use_adaln_zero:
            x, _ = self.norm(x, condition=condition)  # Final norm doesn't use gate
        else:
            x = self.norm(x)

        output = self.output_proj(x)

        # If using KV cache, return the new KV pairs
        if new_kv_list:
            return output, new_kv_list
        return output
|
src/kanade_tokenizer/pipeline.py
ADDED
|
@@ -0,0 +1,760 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Literal
|
| 3 |
+
|
| 4 |
+
import jsonargparse
|
| 5 |
+
import lightning as L
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
import yaml
|
| 10 |
+
from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
|
| 11 |
+
from torch.optim import AdamW
|
| 12 |
+
from torch.optim.lr_scheduler import OneCycleLR
|
| 13 |
+
|
| 14 |
+
from .data.datamodule import AudioBatch
|
| 15 |
+
from .model import KanadeModel, KanadeModelConfig
|
| 16 |
+
from .module.audio_feature import MelSpectrogramFeature
|
| 17 |
+
from .module.discriminator import SpectrogramDiscriminator
|
| 18 |
+
from .module.fsq import FiniteScalarQuantizer
|
| 19 |
+
from .module.global_encoder import GlobalEncoder
|
| 20 |
+
from .module.postnet import PostNet
|
| 21 |
+
from .module.ssl_extractor import SSLFeatureExtractor
|
| 22 |
+
from .module.transformer import Transformer
|
| 23 |
+
from .util import freeze_modules, get_logger, load_vocoder, vocode
|
| 24 |
+
|
| 25 |
+
logger = get_logger()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class KanadePipelineConfig:
    """Training-time configuration for KanadePipeline: branch toggles,
    optimizer/scheduler settings, loss weights, GAN options, checkpoint
    loading, and logging knobs.
    """

    # Training control
    train_feature: bool = True  # Whether to train the feature reconstruction branch
    train_mel: bool = True  # Whether to train the mel spectrogram generation branch

    # Audio settings
    audio_length: int = 138240  # Length of audio input in samples

    # Optimization settings
    lr: float = 2e-4  # peak learning rate (pipeline imports AdamW + OneCycleLR)
    weight_decay: float = 1e-4  # AdamW weight decay
    betas: tuple[float, float] = (0.9, 0.99)  # AdamW (beta1, beta2)
    gradient_clip_val: float | None = 1.0  # gradient clipping value; None disables

    # LR scheduling parameters
    warmup_percent: float = 0.1  # fraction of training spent warming up (OneCycleLR pct_start — confirm)
    lr_div_factor: float = 10.0  # presumably OneCycleLR div_factor (initial_lr = lr / div) — confirm
    lr_final_div_factor: float = 1.0  # presumably OneCycleLR final_div_factor — confirm
    anneal_mode: str = "cos"  # annealing strategy name passed to the scheduler

    # Loss weights
    feature_l1_weight: float = 30.0  # L1 loss weight on the feature branch
    feature_l2_weight: float = 0.0  # L2 loss weight on the feature branch
    mel_l1_weight: float = 30.0  # L1 loss weight on predicted mels
    mel_l2_weight: float = 0.0  # L2 loss weight on predicted mels
    adv_weight: float = 1.0  # adversarial (generator) loss weight
    feature_matching_weight: float = 10.0  # discriminator feature-matching loss weight

    # GAN settings
    use_discriminator: bool = False  # enable GAN training (needs a discriminator and train_mel)
    adv_loss_type: Literal["hinge", "least_square"] = "hinge"  # Type of adversarial loss
    discriminator_lr: float | None = None  # Learning rate for discriminator
    discriminator_start_step: int = 0  # Step to start training discriminator
    discriminator_update_prob: float = 1.0  # Probability of updating discriminator at each step

    # Checkpoint loading
    ckpt_path: str | None = None  # Path to checkpoint to load from
    skip_loading_modules: tuple[str, ...] = ()  # Modules to skip when loading checkpoint

    # Other settings
    log_mel_samples: int = 10  # number of validation mel examples kept for logging
    use_torch_compile: bool = True  # presumably gates torch.compile of the model — confirm in pipeline
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class KanadePipeline(L.LightningModule):
|
| 74 |
+
"""LightningModule wrapper for KanadeModel, handling training (including GAN)."""
|
| 75 |
+
|
| 76 |
+
def __init__(
|
| 77 |
+
self,
|
| 78 |
+
model_config: KanadeModelConfig,
|
| 79 |
+
pipeline_config: KanadePipelineConfig,
|
| 80 |
+
ssl_feature_extractor: SSLFeatureExtractor,
|
| 81 |
+
local_encoder: Transformer,
|
| 82 |
+
local_quantizer: FiniteScalarQuantizer,
|
| 83 |
+
feature_decoder: Transformer | None,
|
| 84 |
+
global_encoder: GlobalEncoder,
|
| 85 |
+
mel_prenet: Transformer,
|
| 86 |
+
mel_decoder: Transformer,
|
| 87 |
+
mel_postnet: PostNet,
|
| 88 |
+
discriminator: SpectrogramDiscriminator | None = None,
|
| 89 |
+
):
|
| 90 |
+
super().__init__()
|
| 91 |
+
self.config = pipeline_config
|
| 92 |
+
self.save_hyperparameters("model_config", "pipeline_config")
|
| 93 |
+
self.strict_loading = False
|
| 94 |
+
self.automatic_optimization = False
|
| 95 |
+
self.torch_compiled = False
|
| 96 |
+
|
| 97 |
+
# Validate components required for training
|
| 98 |
+
assert not pipeline_config.train_feature or feature_decoder is not None, (
|
| 99 |
+
"Feature decoder must be provided if training feature reconstruction"
|
| 100 |
+
)
|
| 101 |
+
logger.info(
|
| 102 |
+
f"Training configuration: train_feature={pipeline_config.train_feature}, train_mel={pipeline_config.train_mel}"
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
# 1. Kanade model
|
| 106 |
+
self.model = KanadeModel(
|
| 107 |
+
config=model_config,
|
| 108 |
+
ssl_feature_extractor=ssl_feature_extractor,
|
| 109 |
+
local_encoder=local_encoder,
|
| 110 |
+
local_quantizer=local_quantizer,
|
| 111 |
+
feature_decoder=feature_decoder,
|
| 112 |
+
global_encoder=global_encoder,
|
| 113 |
+
mel_decoder=mel_decoder,
|
| 114 |
+
mel_prenet=mel_prenet,
|
| 115 |
+
mel_postnet=mel_postnet,
|
| 116 |
+
)
|
| 117 |
+
self._freeze_unused_modules(pipeline_config.train_feature, pipeline_config.train_mel)
|
| 118 |
+
|
| 119 |
+
# Calculate padding for expected SSL output length
|
| 120 |
+
self.padding = self.model._calculate_waveform_padding(pipeline_config.audio_length)
|
| 121 |
+
logger.info(f"Input waveform padding for SSL feature extractor: {self.padding} samples")
|
| 122 |
+
|
| 123 |
+
# Calculate target mel spectrogram length
|
| 124 |
+
self.target_mel_length = self.model._calculate_target_mel_length(pipeline_config.audio_length)
|
| 125 |
+
logger.info(f"Target mel spectrogram length: {self.target_mel_length} frames")
|
| 126 |
+
|
| 127 |
+
# 2. Discriminator
|
| 128 |
+
self._init_discriminator(pipeline_config, discriminator)
|
| 129 |
+
|
| 130 |
+
# 3. Mel spectrogram feature extractor for loss computation
|
| 131 |
+
if pipeline_config.train_mel:
|
| 132 |
+
self.mel_spec = MelSpectrogramFeature(
|
| 133 |
+
sample_rate=model_config.sample_rate,
|
| 134 |
+
n_fft=model_config.n_fft,
|
| 135 |
+
hop_length=model_config.hop_length,
|
| 136 |
+
n_mels=model_config.n_mels,
|
| 137 |
+
padding=model_config.padding,
|
| 138 |
+
fmin=model_config.mel_fmin,
|
| 139 |
+
fmax=model_config.mel_fmax,
|
| 140 |
+
bigvgan_style_mel=model_config.bigvgan_style_mel,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
# Mel sample storage for logging
|
| 144 |
+
self.vocoder = None
|
| 145 |
+
self.validation_examples = []
|
| 146 |
+
self.log_mel_samples = pipeline_config.log_mel_samples
|
| 147 |
+
|
| 148 |
+
def _freeze_unused_modules(self, train_feature: bool, train_mel: bool):
    """Freeze the parameters of whichever branch is not being trained.

    Args:
        train_feature: Whether the SSL-feature reconstruction branch is trained.
        train_mel: Whether the mel-generation branch is trained.
    """
    m = self.model

    if not train_feature:
        # Local (content) branch is inactive: freeze its encoder/quantizer/decoder.
        freeze_modules([m.local_encoder, m.local_quantizer, m.feature_decoder])
        if m.conv_downsample is not None:
            freeze_modules([m.conv_downsample, m.conv_upsample])
        logger.info("Feature reconstruction branch frozen: local_encoder, local_quantizer, feature_decoder")

    if not train_mel:
        # Global branch and mel synthesis stack are inactive: freeze them all.
        mel_modules = [m.global_encoder, m.mel_prenet, m.mel_conv_upsample, m.mel_decoder, m.mel_postnet]
        freeze_modules(mel_modules)
        logger.info(
            "Mel generation branch frozen: global_encoder, mel_prenet, mel_conv_upsample, mel_decoder, mel_postnet"
        )
def _init_discriminator(self, config: KanadePipelineConfig, discriminator: SpectrogramDiscriminator | None):
    """Store the discriminator and decide whether adversarial (GAN) training is active.

    GAN training requires all three of: the config flag, an actual discriminator
    instance, and mel training being enabled.
    """
    self.discriminator = discriminator
    has_disc = discriminator is not None
    self.use_discriminator = config.use_discriminator and has_disc and config.train_mel

    # Surface configuration mismatches loudly rather than failing silently.
    if config.use_discriminator and not has_disc:
        logger.error(
            "Discriminator is enabled in config but no discriminator model provided. Disabling GAN training."
        )
    if config.use_discriminator and has_disc and not config.train_mel:
        logger.warning(
            "Discriminator is enabled but train_mel=False. Discriminator will not be effective without mel training."
        )

    self.discriminator_start_step = config.discriminator_start_step
    self.discriminator_update_prob = config.discriminator_update_prob

    if self.use_discriminator:
        logger.info("Discriminator initialized for GAN training")
        logger.info(f"Discriminator start step: {self.discriminator_start_step}")
        logger.info(f"Discriminator update probability: {self.discriminator_update_prob}")
def setup(self, stage: str):
    """Lightning setup hook: optionally compile the models and load checkpoint weights.

    Args:
        stage: Lightning stage name ("fit", "validate", "test", "predict"); unused here.
    """
    # Torch compile model if enabled.
    # FIX: the original compared version strings lexicographically
    # (torch.__version__ >= "2.0"), which misorders multi-digit majors
    # (e.g. "10.0" < "2.0"). Compare the numeric major version instead.
    torch_major = int(torch.__version__.split(".")[0])
    if torch_major >= 2 and self.config.use_torch_compile:
        self.model = torch.compile(self.model)
        if self.discriminator is not None:
            self.discriminator = torch.compile(self.discriminator)
        self.torch_compiled = True

    # Load checkpoint if provided
    if self.config.ckpt_path:
        ckpt_path = self.config.ckpt_path

        # "hf:" scheme: download weights from the HuggingFace Hub first.
        if ckpt_path.startswith("hf:"):
            from huggingface_hub import hf_hub_download

            repo_id = ckpt_path[len("hf:") :]
            # An optional "@revision" suffix selects a branch/tag/commit.
            revision = None
            if "@" in repo_id:
                repo_id, revision = repo_id.split("@", 1)

            ckpt_path = hf_hub_download(repo_id, filename="model.safetensors", revision=revision)

        self._load_weights(ckpt_path)
def forward(self, waveform: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict]:
    """Run the full tokenizer pipeline on a raw waveform.

    Returns:
        ssl_real: Extracted SSL features for local branch (B, T, C)
        ssl_recon: Reconstructed SSL features (B, T, C) - only if train_feature=True
        mel_recon: Generated mel spectrogram (B, n_mels, T) - only if train_mel=True
        loss_dict: Dictionary with auxiliary information (codes, losses, etc.)
    """
    aux = {}

    # 1. SSL feature extraction (shared front-end for both branches).
    local_ssl_features, global_ssl_features = self.model.forward_ssl_features(waveform, padding=self.padding)

    # 2. Local/content branch: quantize and reconstruct SSL features.
    content_embeddings, _, ssl_recon, perplexity = self.model.forward_content(local_ssl_features)
    aux["local/perplexity"] = perplexity

    # 3. Global branch + mel decoder, only when mel training is enabled.
    mel_recon = None
    if self.config.train_mel:
        global_embeddings = self.model.forward_global(global_ssl_features)
        mel_recon = self.model.forward_mel(content_embeddings, global_embeddings, mel_length=self.target_mel_length)

    return local_ssl_features, ssl_recon, mel_recon, aux
def _get_reconstruction_loss(
|
| 239 |
+
self, audio_real: torch.Tensor, ssl_real: torch.Tensor, ssl_recon: torch.Tensor, mel_recon: torch.Tensor
|
| 240 |
+
) -> tuple[torch.Tensor, dict, torch.Tensor]:
|
| 241 |
+
"""Compute L1 + L2 loss for SSL feature and mel spectrogram reconstruction.
|
| 242 |
+
Returns:
|
| 243 |
+
total_loss: Combined reconstruction loss
|
| 244 |
+
loss_dict: Dictionary with individual loss components
|
| 245 |
+
mel_real: Real mel spectrogram for reference
|
| 246 |
+
"""
|
| 247 |
+
if audio_real.dim() == 3:
|
| 248 |
+
audio_real = audio_real.squeeze(1)
|
| 249 |
+
|
| 250 |
+
loss_dict = {}
|
| 251 |
+
feature_loss, mel_loss = 0, 0
|
| 252 |
+
|
| 253 |
+
# Compute SSL feature reconstruction losses if training features
|
| 254 |
+
if self.config.train_feature and self.model.feature_decoder is not None:
|
| 255 |
+
assert ssl_real is not None and ssl_recon is not None, (
|
| 256 |
+
"SSL features must be provided for training feature reconstruction"
|
| 257 |
+
)
|
| 258 |
+
ssl_l1 = F.l1_loss(ssl_recon, ssl_real)
|
| 259 |
+
ssl_l2 = F.mse_loss(ssl_recon, ssl_real)
|
| 260 |
+
|
| 261 |
+
feature_loss = self.config.feature_l1_weight * ssl_l1 + self.config.feature_l2_weight * ssl_l2
|
| 262 |
+
loss_dict.update({"ssl_l1": ssl_l1, "ssl_l2": ssl_l2, "feature_loss": feature_loss})
|
| 263 |
+
|
| 264 |
+
# Compute mel spectrogram reconstruction losses if training mel
|
| 265 |
+
mel_real = None
|
| 266 |
+
if self.config.train_mel:
|
| 267 |
+
assert mel_recon is not None, "Mel reconstruction must be provided for training mel generation"
|
| 268 |
+
# Extract reference mel spectrogram from audio
|
| 269 |
+
mel_real = self.mel_spec(audio_real)
|
| 270 |
+
|
| 271 |
+
mel_l1 = F.l1_loss(mel_recon, mel_real)
|
| 272 |
+
mel_l2 = F.mse_loss(mel_recon, mel_real)
|
| 273 |
+
mel_loss = self.config.mel_l1_weight * mel_l1 + self.config.mel_l2_weight * mel_l2
|
| 274 |
+
loss_dict.update({"mel_l1": mel_l1, "mel_l2": mel_l2, "mel_loss": mel_loss})
|
| 275 |
+
|
| 276 |
+
total_loss = feature_loss + mel_loss
|
| 277 |
+
return total_loss, loss_dict, mel_real
|
| 278 |
+
|
| 279 |
+
def _get_discriminator_loss(
|
| 280 |
+
self, real_outputs: torch.Tensor, fake_outputs: torch.Tensor
|
| 281 |
+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
| 282 |
+
"""Compute the adversarial loss for discriminator.
|
| 283 |
+
Returns:
|
| 284 |
+
disc_loss: Total discriminator loss
|
| 285 |
+
real_loss: Loss component from real samples
|
| 286 |
+
fake_loss: Loss component from fake samples
|
| 287 |
+
"""
|
| 288 |
+
if self.config.adv_loss_type == "hinge":
|
| 289 |
+
real_loss = torch.mean(torch.clamp(1 - real_outputs, min=0))
|
| 290 |
+
fake_loss = torch.mean(torch.clamp(1 + fake_outputs, min=0))
|
| 291 |
+
elif self.config.adv_loss_type == "least_square":
|
| 292 |
+
real_loss = torch.mean((real_outputs - 1) ** 2)
|
| 293 |
+
fake_loss = torch.mean(fake_outputs**2)
|
| 294 |
+
else:
|
| 295 |
+
raise ValueError(f"Unknown adversarial loss type: {self.config.adv_loss_type}")
|
| 296 |
+
|
| 297 |
+
disc_loss = real_loss + fake_loss
|
| 298 |
+
return disc_loss, real_loss, fake_loss
|
| 299 |
+
|
| 300 |
+
def _get_generator_loss(self, fake_outputs: torch.Tensor) -> torch.Tensor:
|
| 301 |
+
"""Compute the adversarial loss for generator."""
|
| 302 |
+
if self.config.adv_loss_type == "hinge":
|
| 303 |
+
return torch.mean(torch.clamp(1 - fake_outputs, min=0))
|
| 304 |
+
elif self.config.adv_loss_type == "least_square":
|
| 305 |
+
return torch.mean((fake_outputs - 1) ** 2)
|
| 306 |
+
else:
|
| 307 |
+
raise ValueError(f"Unknown adversarial loss type: {self.config.adv_loss_type}")
|
| 308 |
+
|
| 309 |
+
def _get_feature_matching_loss(
|
| 310 |
+
self, real_intermediates: list[torch.Tensor], fake_intermediates: list[torch.Tensor]
|
| 311 |
+
) -> torch.Tensor:
|
| 312 |
+
losses = []
|
| 313 |
+
for real_feat, fake_feat in zip(real_intermediates, fake_intermediates):
|
| 314 |
+
losses.append(torch.mean(torch.abs(real_feat.detach() - fake_feat)))
|
| 315 |
+
fm_loss = torch.mean(torch.stack(losses))
|
| 316 |
+
return fm_loss
|
| 317 |
+
|
| 318 |
+
def _discriminator_step(
    self, batch: AudioBatch, optimizer_disc: torch.optim.Optimizer
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict[str, torch.Tensor], list[torch.Tensor]]:
    """Run one manual optimization step of the discriminator.

    Returns:
        ssl_real: Real SSL features
        ssl_recon: Reconstructed SSL features from generator
        mel_recon: Generated mel spectrogram
        loss_dict: Dictionary with auxiliary information
        real_intermediates: Intermediate feature maps from discriminator for real mel
    """
    assert self.use_discriminator, "Discriminator step called but discriminator is not enabled"

    ssl_real, ssl_recon, mel_recon, loss_dict = self(batch.waveform)
    assert mel_recon is not None, "Mel reconstruction must be available for discriminator step"

    # Reference mel is always derived from the original waveform.
    mel_real = self.mel_spec(batch.waveform)

    # Score both mels; detach the fake one so no gradient reaches the generator.
    real_outputs, real_intermediates = self.discriminator(mel_real)
    fake_outputs, _ = self.discriminator(mel_recon.detach())

    disc_loss, real_loss, fake_loss = self._get_discriminator_loss(real_outputs, fake_outputs)

    n = batch.waveform.size(0)
    self.log("train/disc/real", real_loss, batch_size=n)
    self.log("train/disc/fake", fake_loss, batch_size=n)
    self.log("train/disc/loss", disc_loss, batch_size=n, prog_bar=True)
    for name, value in loss_dict.items():
        self.log(f"train/{name}", value, batch_size=n)

    # Manual optimization (Lightning automatic optimization is off).
    optimizer_disc.zero_grad()
    self.manual_backward(disc_loss)

    # Clip and log the gradient norm (torch.inf disables clipping in effect).
    grad_norm = torch.nn.utils.clip_grad_norm_(
        self.discriminator.parameters(), max_norm=self.config.gradient_clip_val or torch.inf
    )
    self.log("train/disc/grad_norm", grad_norm, batch_size=n)

    optimizer_disc.step()

    return ssl_real, ssl_recon, mel_recon, loss_dict, real_intermediates
def _generator_step(
    self,
    batch: AudioBatch,
    optimizer_gen: torch.optim.Optimizer,
    ssl_real: torch.Tensor | None = None,
    ssl_recon: torch.Tensor | None = None,
    mel_recon: torch.Tensor | None = None,
    loss_dict: dict | None = None,
    real_intermediates: list[torch.Tensor] | None = None,
    training_disc: bool = False,
) -> torch.Tensor:
    """Run one manual optimization step of the generator.

    Args:
        batch: Audio batch with waveform and augmented_waveform
        optimizer_gen: Generator optimizer
        ssl_real: Real SSL features (optional)
        ssl_recon: Reconstructed SSL features (optional)
        mel_recon: Generated mel spectrogram (optional)
        loss_dict: Dictionary with auxiliary information (optional)
        real_intermediates: Intermediate feature maps from discriminator for real mel (optional)
        training_disc: Whether discriminator is being trained in this step

    Returns:
        gen_loss: Total generator loss
    """
    # Reuse the forward pass already done in the discriminator step when available.
    if loss_dict is None:
        ssl_real, ssl_recon, mel_recon, loss_dict = self(batch.waveform)

    # Reconstruction losses (mel target always comes from the original waveform).
    recon_loss, recon_dict, mel_real = self._get_reconstruction_loss(batch.waveform, ssl_real, ssl_recon, mel_recon)
    gen_loss = recon_loss

    n = batch.waveform.size(0)
    if training_disc:
        assert mel_real is not None and mel_recon is not None, "Mel spectrograms must be provided for GAN training"

        if real_intermediates is None:
            _, real_intermediates = self.discriminator(mel_real)

        fake_outputs, fake_intermediates = self.discriminator(mel_recon)

        # Adversarial term.
        adv_loss = self._get_generator_loss(fake_outputs)
        gen_loss = gen_loss + self.config.adv_weight * adv_loss
        self.log("train/gen/adv_loss", adv_loss, batch_size=n)

        # Feature-matching term.
        fm_loss = self._get_feature_matching_loss(real_intermediates, fake_intermediates)
        gen_loss = gen_loss + self.config.feature_matching_weight * fm_loss
        self.log("train/gen/feature_matching_loss", fm_loss, batch_size=n)

    # Log auxiliary and reconstruction losses.
    for name, value in loss_dict.items():
        self.log(f"train/{name}", value, batch_size=n)
    for name, value in recon_dict.items():
        self.log(f"train/gen/{name}", value, batch_size=n)
    self.log("train/loss", gen_loss, batch_size=n, prog_bar=True)

    # Manual optimization of the generator.
    optimizer_gen.zero_grad()
    self.manual_backward(gen_loss)

    grad_norm = torch.nn.utils.clip_grad_norm_(
        self.model.parameters(), max_norm=self.config.gradient_clip_val or torch.inf
    )
    self.log("train/gen/grad_norm", grad_norm, batch_size=n)

    optimizer_gen.step()

    return gen_loss
def training_step(self, batch: AudioBatch, batch_idx: int):
    """Manual-optimization training step: optionally update the discriminator, then the generator."""
    if self.use_discriminator:
        optimizer_disc, optimizer_gen = self.optimizers()
        scheduler_disc, scheduler_gen = self.lr_schedulers()
    else:
        optimizer_gen = self.optimizers()
        scheduler_gen = self.lr_schedulers()

    # Discriminator updates begin after a warmup step count and then fire stochastically.
    training_disc = (
        self.use_discriminator
        and self.global_step >= self.discriminator_start_step
        and torch.rand(1).item() < self.discriminator_update_prob
    )
    if self.use_discriminator and self.global_step == self.discriminator_start_step:
        logger.info(f"Discriminator training starts at step {self.global_step}")

    ssl_real = ssl_recon = mel_recon = loss_dict = real_intermediates = None

    if training_disc:
        ssl_real, ssl_recon, mel_recon, loss_dict, real_intermediates = self._discriminator_step(
            batch, optimizer_disc
        )
        scheduler_disc.step()
    elif self.use_discriminator:
        # Keep the discriminator LR schedule advancing even on skipped updates.
        scheduler_disc.step()

    # Generator update, reusing forward-pass outputs when the discriminator ran.
    self._generator_step(
        batch, optimizer_gen, ssl_real, ssl_recon, mel_recon, loss_dict, real_intermediates, training_disc
    )
    scheduler_gen.step()
def validation_step(self, batch: AudioBatch, batch_idx: int):
    """Validation: forward pass, loss logging, and sample capture for epoch-end media logging."""
    waveform = batch.waveform
    ssl_real, ssl_recon, mel_recon, loss_dict = self(waveform)

    n = waveform.size(0)

    # Reconstruction losses (also yields the reference mel when mel training is on).
    recon_loss, recon_dict, mel_real = self._get_reconstruction_loss(waveform, ssl_real, ssl_recon, mel_recon)
    total_loss = recon_loss

    for name, value in loss_dict.items():
        self.log(f"val/{name}", value, batch_size=n)
    for name, value in recon_dict.items():
        self.log(f"val/gen/{name}", value, batch_size=n)
    self.log("val/loss", total_loss, batch_size=n)

    # Keep the first few samples of the epoch for spectrogram/audio logging.
    if self.config.train_mel and len(self.validation_examples) < self.log_mel_samples:
        assert mel_real is not None and mel_recon is not None, (
            "Mel spectrograms must be provided for validation logging"
        )
        sample_real = waveform[0].cpu()
        sample_gen = None
        if self.vocoder is not None:
            sample_gen = self.vocode(mel_recon[0:1])[0].cpu()

        self.validation_examples.append((mel_real[0].cpu(), mel_recon[0].detach().cpu(), sample_real, sample_gen))
def predict_step(self, batch: AudioBatch, batch_idx: int):
    """Round-trip a batch of audio through the tokenizer and vocoder.

    Returns a dict with the batch ids, the input audio, and the resynthesized audio
    (shaped (B, 1, T)).
    """
    source_audio = batch.waveform
    _, _, mel_gen, _ = self(source_audio)

    generated = self.vocode(mel_gen)

    # Ensure a channel dimension: (B, T) -> (B, 1, T).
    if generated.dim() == 2:
        generated = generated.unsqueeze(1)
    return {"audio_ids": batch.audio_ids, "audio_real": source_audio, "audio_gen": generated}
def configure_optimizers(self):
    """Build AdamW optimizers and OneCycleLR schedulers for the generator (and discriminator).

    Returns a Lightning (optimizers, schedulers) pair; when GAN training is active the
    discriminator entries come first, matching `self.optimizers()` unpacking in
    `training_step`.
    """

    def make_scheduler(optimizer: torch.optim.Optimizer, max_lr: float) -> OneCycleLR:
        # Both schedulers share identical OneCycleLR settings apart from max_lr;
        # factored out to keep the two configurations in sync.
        return OneCycleLR(
            optimizer,
            max_lr=max_lr,
            div_factor=self.config.lr_div_factor,
            final_div_factor=self.config.lr_final_div_factor,
            pct_start=self.config.warmup_percent,
            anneal_strategy=self.config.anneal_mode,
            total_steps=self.trainer.estimated_stepping_batches,
        )

    # Generator optimizer + scheduler.
    optimizer_gen = AdamW(
        self.model.parameters(), lr=self.config.lr, betas=self.config.betas, weight_decay=self.config.weight_decay
    )
    scheduler_gen = make_scheduler(optimizer_gen, self.config.lr)

    if not self.use_discriminator:
        return ([optimizer_gen], [{"scheduler": scheduler_gen, "interval": "step"}])

    # Discriminator optimizer + scheduler (falls back to the generator LR).
    disc_lr = self.config.discriminator_lr or self.config.lr
    optimizer_disc = AdamW(
        self.discriminator.parameters(),
        lr=disc_lr,
        betas=self.config.betas,
        weight_decay=self.config.weight_decay,
    )
    scheduler_disc = make_scheduler(optimizer_disc, disc_lr)

    # Restore optimizer states when resuming from a Lightning checkpoint.
    if self.config.ckpt_path:
        if self.config.ckpt_path.endswith(".ckpt"):
            checkpoint = torch.load(self.config.ckpt_path)
            optimizer_states = checkpoint["optimizer_states"]
            if len(optimizer_states) > 1 and self.use_discriminator:
                optimizer_disc.load_state_dict(optimizer_states[0])
                optimizer_gen.load_state_dict(optimizer_states[1])
                logger.info("Loaded discriminator and generator's optimizer states from checkpoint")
            elif len(optimizer_states) == 1 and not self.use_discriminator:
                # NOTE: unreachable here (use_discriminator is True past the early
                # return above); kept for parity with the single-optimizer path.
                optimizer_gen.load_state_dict(optimizer_states[0])
                logger.info("Loaded generator's optimizer state from checkpoint")
        else:
            logger.info("No optimizer state loaded since checkpoint is not a .ckpt file")

    return (
        [optimizer_disc, optimizer_gen],
        [{"scheduler": scheduler_disc, "interval": "step"}, {"scheduler": scheduler_gen, "interval": "step"}],
    )
def _setup_vocoder(self):
    """Try to load the configured vocoder; return None when its dependencies are missing."""
    vocoder = None
    try:
        vocoder = load_vocoder(name=self.model.config.vocoder_name)
    except ImportError:
        logger.error("Vocoder could not be loaded. Please install the required dependencies.")
    return vocoder
def vocode(self, mel: torch.Tensor) -> torch.Tensor:
    """Convert a mel spectrogram to a waveform with the loaded vocoder.

    The vocoder is moved to the mel's device; the output is returned on CPU as float32.
    """
    device = mel.device
    self.vocoder = self.vocoder.to(device)
    # Delegates to the module-level vocode() helper.
    audio = vocode(self.vocoder, mel)
    return audio.cpu().float()
def on_validation_start(self):
    # Lazily load the vocoder so validation can synthesize and log audio samples.
    self.vocoder = self._setup_vocoder()
def on_predict_start(self):
    # Lazily load the vocoder so predict_step can resynthesize waveforms.
    self.vocoder = self._setup_vocoder()
def on_validation_end(self):
    """Log collected validation spectrograms/audio, then release buffers and the vocoder."""
    if len(self.validation_examples) > 0:
        from matplotlib import pyplot as plt

        for i, (mel_real, mel_recon, audio_real, audio_gen) in enumerate(self.validation_examples):
            # Log spectrograms
            fig_real = self._get_spectrogram_plot(mel_real)
            fig_gen = self._get_spectrogram_plot(mel_recon)
            self._log_figure(f"val/{i}_mel_real", fig_real)
            self._log_figure(f"val/{i}_mel_gen", fig_gen)
            # FIX: close figures once logged; matplotlib keeps figures alive
            # otherwise, leaking memory across validation epochs.
            plt.close(fig_real)
            plt.close(fig_gen)

            # Log audio samples
            if audio_gen is not None:
                audio_real = audio_real.cpu().numpy()
                audio_gen = audio_gen.cpu().numpy()
                self._log_audio(f"val/{i}_audio_real", audio_real)
                self._log_audio(f"val/{i}_audio_gen", audio_gen)

        self.validation_examples = []

    # Clear vocoder to free memory
    self.vocoder = None
def _log_figure(self, tag: str, fig):
    """Log a matplotlib figure to the active experiment logger (TensorBoard or W&B)."""
    if isinstance(self.logger, TensorBoardLogger):
        self.logger.experiment.add_figure(tag, fig, self.global_step)
    elif isinstance(self.logger, WandbLogger):
        import PIL.Image as Image

        fig.canvas.draw()
        # FIX: canvas.buffer_rgba() yields straight (non-premultiplied) RGBA bytes,
        # so the matching PIL mode is "RGBA". The original "RGBa" (premultiplied)
        # would misinterpret any pixel with partial alpha.
        image = Image.frombytes("RGBA", fig.canvas.get_width_height(), fig.canvas.buffer_rgba())
        image = image.convert("RGB")
        self.logger.log_image(tag, [image], step=self.global_step)
def _log_audio(self, tag: str, audio: np.ndarray):
    """Log an audio sample to the active experiment logger (TensorBoard or W&B)."""
    sr = self.model.config.sample_rate
    if isinstance(self.logger, TensorBoardLogger):
        self.logger.experiment.add_audio(tag, audio, self.global_step, sample_rate=sr)
    elif isinstance(self.logger, WandbLogger):
        # W&B expects 1-D audio arrays and per-sample sample rates.
        self.logger.log_audio(tag, [audio.flatten()], sample_rate=[sr], step=self.global_step)
def _get_spectrogram_plot(self, mel: torch.Tensor):
    """Render a mel spectrogram tensor as a matplotlib figure (caller owns the figure)."""
    from matplotlib import pyplot as plt

    data = mel.detach().cpu().numpy()
    fig, ax = plt.subplots(figsize=(10, 4))
    # Fixed color range keeps figures comparable across steps.
    image = ax.imshow(data, aspect="auto", origin="lower", cmap="magma", vmin=-8.0, vmax=5.0)
    fig.colorbar(image, ax=ax)
    ax.set_ylabel("Mel bins")
    ax.set_xlabel("Time steps")
    fig.tight_layout()
    return fig
def _load_weights(self, ckpt_path: str | None, model_state_dict: dict[str, torch.Tensor] | None = None):
    """Load model and discriminator weights from checkpoint. Supports .ckpt (Lightning), .safetensors, .pt/.pth formats.
    If model_state_dict is provided, load weights from it instead of ckpt_path."""

    def select_keys(state_dict: dict, prefix: str) -> dict:
        """Select keys from state_dict that start with the given prefix. Remove the prefix from keys."""
        return {k[len(prefix) :]: v for k, v in state_dict.items() if k.startswith(prefix)}

    def remove_prefix(state_dict: dict, prefix: str) -> dict:
        """Remove a prefix from keys that start with that prefix."""
        return {k[len(prefix) :] if k.startswith(prefix) else k: v for k, v in state_dict.items()}

    def add_prefix(state_dict: dict, prefix: str) -> dict:
        """Add a prefix to keys that do not start with that prefix."""
        return {f"{prefix}{k}" if not k.startswith(prefix) else k: v for k, v in state_dict.items()}

    # Load state dict
    if model_state_dict is not None:
        # Load from provided state dict
        # ckpt_path is ignored in this case; no discriminator weights are available.
        disc_state_dict = {}
    elif ckpt_path.endswith(".ckpt"):
        # Lightning checkpoint
        # Lightning stores both submodules under one "state_dict"; split by attribute prefix.
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        model_state_dict = select_keys(checkpoint["state_dict"], "model.")
        disc_state_dict = select_keys(checkpoint["state_dict"], "discriminator.")
    elif ckpt_path.endswith(".safetensors"):
        # Safetensors checkpoint
        from safetensors.torch import load_file

        checkpoint = load_file(ckpt_path, device="cpu")
        model_state_dict = checkpoint
        disc_state_dict = {}
    elif ckpt_path.endswith(".pt") or ckpt_path.endswith(".pth"):
        # Standard PyTorch checkpoint
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        model_state_dict = checkpoint
        disc_state_dict = {}
    else:
        raise ValueError(f"Unsupported checkpoint format: {ckpt_path}")

    # Load model weights
    # Strip the torch.compile wrapper prefix so keys match an uncompiled model...
    model_state_dict = remove_prefix(model_state_dict, "_orig_mod.")
    # ...and drop any modules the config asks to skip (e.g. branches being retrained).
    model_state_dict = {
        k: v
        for k, v in model_state_dict.items()
        if not any(k.startswith(module) for module in self.config.skip_loading_modules)
    }
    if self.torch_compiled:
        # Re-add the prefix when this instance is itself wrapped by torch.compile.
        model_state_dict = add_prefix(model_state_dict, "_orig_mod.")

    if len(model_state_dict) > 0:
        # strict=False: missing/unexpected keys are tolerated and only logged.
        result = self.model.load_state_dict(model_state_dict, strict=False)
        logger.info(f"Loaded model weights from {ckpt_path or 'provided state_dict'}.")
        if result.missing_keys:
            logger.debug(f"Missing keys in model state_dict: {result.missing_keys}")
        if result.unexpected_keys:
            logger.debug(f"Unexpected keys in model state_dict: {result.unexpected_keys}")

    # Load discriminator weights if available
    if self.use_discriminator:
        disc_state_dict = remove_prefix(disc_state_dict, "_orig_mod.")
        if self.torch_compiled:
            disc_state_dict = add_prefix(disc_state_dict, "_orig_mod.")

        if len(disc_state_dict) > 0:
            result = self.discriminator.load_state_dict(disc_state_dict, strict=False)
            logger.info(f"Loaded discriminator weights from {ckpt_path}.")
            if result.missing_keys:
                logger.debug(f"Missing keys in discriminator state_dict: {result.missing_keys}")
            if result.unexpected_keys:
                logger.debug(f"Unexpected keys in discriminator state_dict: {result.unexpected_keys}")
@classmethod
def from_hparams(cls, config_path: str) -> "KanadePipeline":
    """Instantiate KanadePipeline from config file.

    Args:
        config_path (str): Path to model configuration file (.yaml).
    Returns:
        KanadePipeline: Instantiated KanadePipeline.
    """
    with open(config_path, "r") as f:
        raw_config = yaml.safe_load(f)

    # Keep only the model section, and strip checkpoint-related fields so no
    # weights are loaded during instantiation.
    trimmed = {"model": raw_config["model"]}
    pipe_cfg = trimmed["model"]["init_args"]["pipeline_config"]
    for key in ("ckpt_path", "skip_loading_modules"):
        if key in pipe_cfg:
            del pipe_cfg[key]

    # Let jsonargparse build the full object graph from the config dict.
    parser = jsonargparse.ArgumentParser(exit_on_error=False)
    parser.add_argument("--model", type=KanadePipeline)
    parsed = parser.parse_object(trimmed)
    parsed = parser.instantiate_classes(parsed)
    return parsed.model
@staticmethod
def from_pretrained(config_path: str, ckpt_path: str) -> "KanadePipeline":
    """Load KanadePipeline from training configuration and checkpoint files.

    Args:
        config_path: Path to pipeline configuration file (YAML).
        ckpt_path: Path to checkpoint file (.ckpt) or model weights (.safetensors).
    Returns:
        KanadePipeline: Instantiated KanadePipeline with loaded weights.
    """
    # Build the pipeline skeleton from the config, then pull in the weights.
    pipeline = KanadePipeline.from_hparams(config_path)
    pipeline._load_weights(ckpt_path)
    return pipeline
src/kanade_tokenizer/util.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from typing import Literal
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
|
| 7 |
+
# Package-wide logger, configured once at import time: INFO level with a
# timestamped stream handler.
logger = logging.getLogger("kanade_tokenizer")
logger.setLevel(logging.INFO)

_formatter = logging.Formatter("[%(asctime)s] %(levelname)s %(name)s: %(message)s")
handler = logging.StreamHandler()
handler.setFormatter(_formatter)
handler.setLevel(logging.INFO)
logger.addHandler(handler)
def get_logger() -> logging.Logger:
    """Return the package-level "kanade_tokenizer" logger configured at import time."""
    return logger
def freeze_modules(modules: list[nn.Module] | None):
|
| 21 |
+
for module in modules:
|
| 22 |
+
if module is not None:
|
| 23 |
+
for param in module.parameters():
|
| 24 |
+
param.requires_grad = False
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _load_audio_internal(
    path: str, frame_offset: int | None = None, num_frames: int | None = None
) -> tuple[torch.Tensor, int]:
    """Decode an audio file and return ``(waveform, sample_rate)``.

    The waveform is a float32 tensor of shape (channels, frames).
    """
    # TorchAudio >= 2.9.0 removed decoding and encoding capabilities to TorchCodec.
    # See: https://github.com/pytorch/audio/issues/3902
    # waveform, sample_rate = torchaudio.load(path, frame_offset=frame_offset or 0, num_frames=num_frames or -1)
    import soundfile as sf

    with sf.SoundFile(path) as audio_file:
        if frame_offset is not None:
            audio_file.seek(frame_offset)
        # always_2d keeps a (frames, channels) layout even for mono files.
        data = audio_file.read(frames=num_frames or -1, dtype="float32", always_2d=True)
        rate = audio_file.samplerate
    return torch.from_numpy(data.T), rate
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_audio(audio_path: str, sample_rate: int = 24000) -> torch.Tensor:
    """Load and preprocess audio file.

    Loads the file, downmixes to mono, resamples to ``sample_rate``
    when needed, and peak-normalizes the result to [-1, 1].

    Args:
        audio_path: Path to an audio file readable by soundfile.
        sample_rate: Target sampling rate in Hz.

    Returns:
        torch.Tensor: 1-D waveform tensor of shape (samples,).
    """
    # NOTE: the docstring above was previously placed *after* this import,
    # turning it into a dead string statement rather than a docstring.
    # Local import: torchaudio is only needed here for resampling.
    import torchaudio

    waveform, sr = _load_audio_internal(audio_path)

    # Convert to mono if stereo/multi-channel by averaging channels.
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample if the source rate differs from the requested rate.
    if sr != sample_rate:
        resampler = torchaudio.transforms.Resample(sr, sample_rate)
        waveform = resampler(waveform)

    # Peak-normalize to [-1, 1]; the epsilon avoids division by zero on silence.
    max_val = torch.max(torch.abs(waveform)) + 1e-8
    waveform = waveform / max_val

    return waveform.squeeze(0)  # Remove channel dimension
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def load_vocoder(name: Literal["vocos", "hift"] = "vocos") -> torch.nn.Module:
    """Load a pretrained neural vocoder and set it to eval mode.

    Args:
        name: Which vocoder to load — "vocos" (charactr/vocos-mel-24khz from
            the Hugging Face Hub) or "hift" (HiFT generator weights from
            FunAudioLLM/CosyVoice2-0.5B).

    Returns:
        torch.nn.Module: The vocoder in eval mode.

    Raises:
        ValueError: If ``name`` is not a supported vocoder.
    """
    if name == "vocos":
        from vocos import Vocos

        model = Vocos.from_pretrained("charactr/vocos-mel-24khz")
        model = model.eval()
        return model
    elif name == "hift":
        from huggingface_hub import hf_hub_download
        from .module.hift import HiFTGenerator

        # Download the HiFT model weights from FunAudioLLM/CosyVoice2-0.5B
        model_path = hf_hub_download(repo_id="FunAudioLLM/CosyVoice2-0.5B", filename="hift.pt")
        model = HiFTGenerator()
        model.load_weights(model_path)
        model = model.eval()
        return model
    else:
        raise ValueError(f"Unsupported vocoder name: {name}")
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def vocode(vocoder, mel_spectrogram: torch.Tensor) -> torch.Tensor:
    """Synthesize an audio waveform from a mel spectrogram.

    Dispatches on the vocoder's class name: Vocos-style models expose
    ``decode``, HiFT-style models expose ``inference``.

    Args:
        vocoder: Pretrained vocoder model.
        mel_spectrogram (torch.Tensor): Input mel spectrogram tensor (..., n_mels, frame).

    Returns:
        torch.Tensor: Generated audio waveform tensor (..., samples).

    Raises:
        ValueError: If the vocoder's class is not recognized.
    """
    # Vocoders expect float32 input regardless of the caller's dtype.
    mel_spectrogram = mel_spectrogram.to(torch.float32)

    vocoder_class_name = type(vocoder).__name__
    if "Vocos" in vocoder_class_name:
        return vocoder.decode(mel_spectrogram)
    if "HiFT" in vocoder_class_name:
        return vocoder.inference(mel_spectrogram)
    raise ValueError(f"Unsupported vocoder class: {vocoder_class_name}")
|