cosrigel committed on
Commit 6d342f3 · verified · 1 Parent(s): 64b4d26

chore: upload folder dia for inference

dia/__init__.py ADDED
File without changes
dia/audio.py ADDED
@@ -0,0 +1,286 @@
+import typing as tp
+
+import torch
+
+from .config import DataConfig
+
+
+def build_delay_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Precompute (t_idx_BxTxC, indices_BTCx3) so that out[t, c] = in[t - delay[c], c].
+    Negative t_idx => BOS; t_idx >= T => PAD.
+    """
+    delay_arr = torch.tensor(delay_pattern, dtype=torch.int32)
+
+    t_idx_BxT = torch.broadcast_to(
+        torch.arange(T, dtype=torch.int32)[None, :],
+        [B, T],
+    )
+    t_idx_BxTx1 = t_idx_BxT[..., None]
+    t_idx_BxTxC = t_idx_BxTx1 - delay_arr.view(1, 1, C)
+
+    b_idx_BxTxC = torch.broadcast_to(
+        torch.arange(B, dtype=torch.int32).view(B, 1, 1),
+        [B, T, C],
+    )
+    c_idx_BxTxC = torch.broadcast_to(
+        torch.arange(C, dtype=torch.int32).view(1, 1, C),
+        [B, T, C],
+    )
+
+    # We must clamp time indices to [0..T-1] so the gather_nd equivalent won't fail
+    t_clamped_BxTxC = torch.clamp(t_idx_BxTxC, 0, T - 1)
+
+    indices_BTCx3 = torch.stack(
+        [
+            b_idx_BxTxC.reshape(-1),
+            t_clamped_BxTxC.reshape(-1),
+            c_idx_BxTxC.reshape(-1),
+        ],
+        dim=1,
+    ).long()  # Ensure indices are long type for indexing
+
+    return t_idx_BxTxC, indices_BTCx3
+
+
+def apply_audio_delay(
+    audio_BxTxC: torch.Tensor,
+    pad_value: int,
+    bos_value: int,
+    precomp: tp.Tuple[torch.Tensor, torch.Tensor],
+) -> torch.Tensor:
+    """
+    Applies the delay pattern to batched audio tokens using precomputed indices,
+    inserting BOS where t_idx < 0 and PAD where t_idx >= T.
+
+    Args:
+        audio_BxTxC: [B, T, C] int16 audio tokens (or int32/float)
+        pad_value: the padding token
+        bos_value: the BOS token
+        precomp: (t_idx_BxTxC, indices_BTCx3) from build_delay_indices
+
+    Returns:
+        result_BxTxC: [B, T, C] delayed audio tokens
+    """
+    device = audio_BxTxC.device  # Get device from input tensor
+    t_idx_BxTxC, indices_BTCx3 = precomp
+    t_idx_BxTxC = t_idx_BxTxC.to(device)  # Move precomputed indices to device
+    indices_BTCx3 = indices_BTCx3.to(device)
+
+    # Equivalent of tf.gather_nd using advanced indexing
+    # Ensure indices are long type if not already (build_delay_indices should handle this)
+    gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
+    gathered_BxTxC = gathered_flat.view(audio_BxTxC.shape)
+
+    # Create masks on the correct device
+    mask_bos = t_idx_BxTxC < 0  # => place bos_value
+    mask_pad = t_idx_BxTxC >= audio_BxTxC.shape[1]  # => place pad_value
+
+    # Create scalar tensors on the correct device
+    bos_tensor = torch.tensor(bos_value, dtype=audio_BxTxC.dtype, device=device)
+    pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
+
+    # If mask_bos, BOS; else if mask_pad, PAD; else original gather
+    # All tensors should now be on the same device
+    result_BxTxC = torch.where(mask_bos, bos_tensor, torch.where(mask_pad, pad_tensor, gathered_BxTxC))
+
+    return result_BxTxC
+
+
+@torch.no_grad()
+@torch.inference_mode()
+def audio_to_codebook(
+    model,
+    input_values,
+    data_config: DataConfig,
+    padding_mask=None,
+    sample_rate=44100,
+):
+    """
+    Encodes the input audio waveform into discrete codes.
+
+    Args:
+        model: The model to use for encoding.
+        input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+            Float values of the input audio waveform.
+        padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
+            Padding mask used to pad the `input_values`.
+        sample_rate (`int`, *optional*):
+            Signal sampling rate.
+
+    Returns:
+        A list of frames containing the discrete encoded codes for the input audio waveform, along with rescaling
+        factors for each chunk when `normalize` is True. Each frame is a tuple `(codebook, scale)`, with
+        `codebook` of shape `[batch_size, num_codebooks, frames]`.
+        Scale is not used here.
+    """
+    audio_data = model.preprocess(input_values, sample_rate)
+
+    if padding_mask is None:
+        padding_mask = torch.ones_like(input_values).bool()
+
+    _, encoded_frame, _, _, _ = model.encode(audio_data, n_quantizers=None)  # 1, C, T
+    seq_length = encoded_frame.shape[2]
+
+    t_idx_BxTxC, indices_BTCx3 = build_delay_indices(
+        B=1,
+        T=seq_length,
+        C=data_config.channels,
+        delay_pattern=data_config.delay_pattern,
+    )
+
+    encoded_frame = apply_audio_delay(
+        audio_BxTxC=encoded_frame.transpose(1, 2),  # 1, T, C
+        pad_value=data_config.audio_pad_value,
+        bos_value=data_config.audio_bos_value,
+        precomp=(t_idx_BxTxC, indices_BTCx3),
+    )
+
+    return encoded_frame
+
+
+def build_revert_indices(B: int, T: int, C: int, delay_pattern: tp.List[int]) -> tp.Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Precompute indices for the revert operation using PyTorch.
+
+    Returns:
+        A tuple (t_idx_BxTxC, indices_BTCx3) where:
+            - t_idx_BxTxC is a tensor of shape [B, T, C] computed as time indices plus the delay.
+            - indices_BTCx3 is a tensor of shape [B*T*C, 3] used for gathering, computed from:
+              batch indices, clamped time indices, and channel indices.
+    """
+    # Use default device unless specified otherwise; assumes inputs might define device later
+    device = None  # Or determine dynamically if needed, e.g., from a model parameter
+
+    delay_arr = torch.tensor(delay_pattern, dtype=torch.int32, device=device)
+
+    t_idx_BT1 = torch.broadcast_to(torch.arange(T, device=device).unsqueeze(0), [B, T])
+    t_idx_BT1 = t_idx_BT1.unsqueeze(-1)
+
+    t_idx_BxTxC = torch.minimum(
+        t_idx_BT1 + delay_arr.view(1, 1, C),
+        torch.tensor(T - 1, device=device),
+    )
+    b_idx_BxTxC = torch.broadcast_to(torch.arange(B, device=device).view(B, 1, 1), [B, T, C])
+    c_idx_BxTxC = torch.broadcast_to(torch.arange(C, device=device).view(1, 1, C), [B, T, C])
+
+    indices_BTCx3 = torch.stack(
+        [
+            b_idx_BxTxC.reshape(-1),
+            t_idx_BxTxC.reshape(-1),
+            c_idx_BxTxC.reshape(-1),
+        ],
+        dim=1,
+    ).long()  # Ensure indices are long type
+
+    return t_idx_BxTxC, indices_BTCx3
+
+
+def revert_audio_delay(
+    audio_BxTxC: torch.Tensor,
+    pad_value: int,
+    precomp: tp.Tuple[torch.Tensor, torch.Tensor],
+    T: int,
+) -> torch.Tensor:
+    """
+    Reverts a delay pattern from batched audio tokens using precomputed indices (PyTorch version).
+
+    Args:
+        audio_BxTxC: Input delayed audio tensor
+        pad_value: Padding value for out-of-bounds indices
+        precomp: Precomputed revert indices tuple containing:
+            - t_idx_BxTxC: Time offset indices tensor
+            - indices_BTCx3: Gather indices tensor for original audio
+        T: Original sequence length before padding
+
+    Returns:
+        Reverted audio tensor with same shape as input
+    """
+    t_idx_BxTxC, indices_BTCx3 = precomp
+    device = audio_BxTxC.device  # Get device from input tensor
+
+    # Move precomputed indices to the same device as audio_BxTxC if they aren't already
+    t_idx_BxTxC = t_idx_BxTxC.to(device)
+    indices_BTCx3 = indices_BTCx3.to(device)
+
+    # PyTorch advanced indexing (equivalent to tf.gather_nd or the NumPy equivalent)
+    gathered_flat = audio_BxTxC[indices_BTCx3[:, 0], indices_BTCx3[:, 1], indices_BTCx3[:, 2]]
+    gathered_BxTxC = gathered_flat.view(audio_BxTxC.size())  # Use .size() for robust reshaping
+
+    # Create pad_tensor on the correct device
+    pad_tensor = torch.tensor(pad_value, dtype=audio_BxTxC.dtype, device=device)
+    # Create T tensor on the correct device for comparison
+    T_tensor = torch.tensor(T, device=device)
+
+    result_BxTxC = torch.where(t_idx_BxTxC >= T_tensor, pad_tensor, gathered_BxTxC)
+
+    return result_BxTxC
+
+
+@torch.no_grad()
+@torch.inference_mode()
+def decode(
+    model,
+    audio_codes,
+):
+    """
+    Decodes the given frames into an output audio waveform.
+    """
+    if len(audio_codes) != 1:
+        raise ValueError(f"Expected one frame, got {len(audio_codes)}")
+
+    try:
+        audio_values = model.quantizer.from_codes(audio_codes)
+        audio_values = model.decode(audio_values[0])
+
+        return audio_values
+    except Exception as e:
+        print(f"Error in decode method: {str(e)}")
+        raise
+
+
+def codebook_to_audio(generated_codes: torch.Tensor, model, delay_pattern, B=1, T=2600, C=9):
+    """Process a single codebook file to generate audio."""
+    # Remove BOS token
+    generated_codes = generated_codes[:, 1:]
+
+    if generated_codes.shape[1] > T:
+        generated_codes = generated_codes[:, :T]
+
+    seq_length = generated_codes.shape[1]
+
+    # Build revert indices
+    t_idx_BxTxC, indices_BTCx3 = build_revert_indices(B=B, T=seq_length, C=C, delay_pattern=delay_pattern)
+
+    # Transpose and add batch dimension
+    audio_BxTxC = generated_codes.transpose(1, 0).unsqueeze(0)
+    reverted_codebook = revert_audio_delay(
+        audio_BxTxC=audio_BxTxC,
+        pad_value=0,
+        precomp=(t_idx_BxTxC, indices_BTCx3),
+        T=seq_length,
+    )
+    # Only trim the trailing 'delay' frames when the total frame count exceeds the delay
+    delay = 30
+    num_frames = reverted_codebook.size(1)  # original T dimension
+
+    if num_frames > delay:
+        reverted_codebook = reverted_codebook[:, :-delay, :]
+    # If num_frames <= delay, skip trimming to avoid producing a T = 0 dimension
+
+    codebook = reverted_codebook.transpose(1, 2)  # (B x T x C) -> (B x C x T)
+
+    min_valid_index = 0
+    max_valid_index = 1023
+    invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
+
+    num_invalid = torch.sum(invalid_mask).item()
+    if num_invalid > 0:
+        print(f"Warning: Clamping {num_invalid} indices outside range [{min_valid_index}, {max_valid_index}] to 0.")
+
+    # Set invalid values to 0 (modify the tensor in-place)
+    codebook[invalid_mask] = 0
+    audio_array = decode(model, codebook)
+
+    return audio_array
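
The two index builders are inverses of each other: apply_audio_delay shifts channel c back by delay[c] steps (filling with BOS/PAD), and revert_audio_delay shifts it forward again. A minimal round-trip sketch, assuming the package layout above and the default delay pattern from config.json:

    import torch
    from dia.audio import (
        apply_audio_delay, build_delay_indices,
        build_revert_indices, revert_audio_delay,
    )

    B, T, C = 1, 32, 9
    delay_pattern = [0, 8, 9, 10, 11, 12, 13, 14, 15]
    codes = torch.randint(0, 1024, (B, T, C))

    delayed = apply_audio_delay(
        codes, pad_value=1025, bos_value=1026,
        precomp=build_delay_indices(B, T, C, delay_pattern),
    )
    restored = revert_audio_delay(
        delayed, pad_value=1025,
        precomp=build_revert_indices(B, T, C, delay_pattern),
        T=T,
    )

    # Positions whose delayed source stayed inside the sequence round-trip exactly
    keep = T - max(delay_pattern)
    assert torch.equal(restored[:, :keep, :], codes[:, :keep, :])
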
dia/config.json ADDED
@@ -0,0 +1,49 @@
+{
+  "version": "0.1",
+  "model": {
+    "encoder": {
+      "n_layer": 12,
+      "n_embd": 1024,
+      "n_hidden": 4096,
+      "n_head": 16,
+      "head_dim": 128
+    },
+    "decoder": {
+      "n_layer": 18,
+      "n_embd": 2048,
+      "n_hidden": 8192,
+      "gqa_query_heads": 16,
+      "cross_query_heads": 16,
+      "kv_heads": 4,
+      "gqa_head_dim": 128,
+      "cross_head_dim": 128,
+      "d_model": 256
+    },
+    "src_vocab_size": 256,
+    "tgt_vocab_size": 1028,
+    "dropout": 0.0
+  },
+  "training": {
+    "dtype": "bfloat16"
+  },
+  "data": {
+    "text_length": 512,
+    "audio_length": 1536,
+    "channels": 9,
+    "text_pad_value": 0,
+    "audio_eos_value": 1024,
+    "audio_pad_value": 1025,
+    "audio_bos_value": 1026,
+    "delay_pattern": [
+      0,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15
+    ]
+  }
+}
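
Note how the vocabulary and the special tokens line up: DAC codebook indices occupy 0-1023, while 1024/1025/1026 serve as EOS/PAD/BOS, so a tgt_vocab_size of 1028 covers all of them with one spare id. A quick sanity-check sketch over this file:

    import json

    with open("dia/config.json") as f:
        cfg = json.load(f)

    data = cfg["data"]
    specials = [data["audio_eos_value"], data["audio_pad_value"], data["audio_bos_value"]]
    assert max(specials) < cfg["model"]["tgt_vocab_size"]  # 1026 < 1028
    assert len(data["delay_pattern"]) == data["channels"]  # one delay per channel
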
dia/config.py ADDED
@@ -0,0 +1,206 @@
+"""Configuration management module for the Dia model.
+
+This module provides comprehensive configuration management for the Dia model,
+utilizing Pydantic for validation. It defines configurations for data processing,
+model architecture (encoder and decoder), and training settings.
+
+Key components:
+- DataConfig: Parameters for data loading and preprocessing.
+- EncoderConfig: Architecture details for the encoder module.
+- DecoderConfig: Architecture details for the decoder module.
+- ModelConfig: Combined model architecture settings.
+- TrainingConfig: Training hyperparameters and settings.
+- DiaConfig: Master configuration combining all components.
+"""
+
+import os
+from typing import Annotated
+
+from pydantic import BaseModel, BeforeValidator, Field
+
+
+class DataConfig(BaseModel, frozen=True):
+    """Configuration for data loading and preprocessing.
+
+    Attributes:
+        text_length: Maximum length of text sequences (must be multiple of 128).
+        audio_length: Maximum length of audio sequences (must be multiple of 128).
+        channels: Number of audio channels.
+        text_pad_value: Value used for padding text sequences.
+        audio_eos_value: Value representing the end of audio sequences.
+        audio_bos_value: Value representing the beginning of audio sequences.
+        audio_pad_value: Value used for padding audio sequences.
+        delay_pattern: List of delay values for each audio channel.
+    """
+
+    text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
+    audio_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = Field(gt=0, multiple_of=128)
+    channels: int = Field(default=9, gt=0, multiple_of=1)
+    text_pad_value: int = Field(default=0)
+    audio_eos_value: int = Field(default=1024)
+    audio_pad_value: int = Field(default=1025)
+    audio_bos_value: int = Field(default=1026)
+    delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15])
+
+    def __hash__(self) -> int:
+        """Generate a hash based on all fields of the config."""
+        return hash(
+            (
+                self.text_length,
+                self.audio_length,
+                self.channels,
+                self.text_pad_value,
+                self.audio_pad_value,
+                self.audio_bos_value,
+                self.audio_eos_value,
+                tuple(self.delay_pattern),
+            )
+        )
+
+
+class EncoderConfig(BaseModel, frozen=True):
+    """Configuration for the encoder component of the Dia model.
+
+    Attributes:
+        n_layer: Number of transformer layers.
+        n_embd: Embedding dimension.
+        n_hidden: Hidden dimension size in the MLP layers.
+        n_head: Number of attention heads.
+        head_dim: Dimension per attention head.
+        mlp_activations: List of activation functions for the MLP layers.
+        use_pre_norm: Whether to use pre-normalization (LayerNorm before attention/MLP).
+    """
+
+    n_layer: int = Field(gt=0)
+    n_embd: int = Field(gt=0)
+    n_hidden: int = Field(gt=0)
+    n_head: int = Field(gt=0)
+    head_dim: int = Field(gt=0)
+    mlp_activations: list[str] = Field(default=["silu", "linear"])
+    use_pre_norm: bool = Field(default=False)
+
+
+class DecoderConfig(BaseModel, frozen=True):
+    """Configuration for the decoder component of the Dia model.
+
+    Attributes:
+        n_layer: Number of transformer layers.
+        n_embd: Embedding dimension.
+        n_hidden: Hidden dimension size in the MLP layers.
+        gqa_query_heads: Number of query heads for grouped-query self-attention.
+        kv_heads: Number of key/value heads for grouped-query self-attention.
+        gqa_head_dim: Dimension per query head for grouped-query self-attention.
+        cross_query_heads: Number of query heads for cross-attention.
+        cross_head_dim: Dimension per cross-attention head.
+        mlp_activations: List of activation functions for the MLP layers.
+        use_pre_norm: Whether to use pre-normalization.
+    """
+
+    n_layer: int = Field(gt=0)
+    n_embd: int = Field(gt=0)
+    n_hidden: int = Field(gt=0)
+    gqa_query_heads: int = Field(gt=0)
+    kv_heads: int = Field(gt=0)
+    gqa_head_dim: int = Field(gt=0)
+    cross_query_heads: int = Field(gt=0)
+    cross_head_dim: int = Field(gt=0)
+    mlp_activations: list[str] = Field(default=["silu", "linear"])
+    use_pre_norm: bool = Field(default=False)
+
+
+class ModelConfig(BaseModel, frozen=True):
+    """Main configuration container for the Dia model architecture.
+
+    Attributes:
+        encoder: Configuration for the encoder component.
+        decoder: Configuration for the decoder component.
+        src_vocab_size: Size of the source (text) vocabulary.
+        tgt_vocab_size: Size of the target (audio code) vocabulary.
+        dropout: Dropout probability applied within the model.
+        normalization_layer_epsilon: Epsilon value for normalization layers (e.g., LayerNorm).
+        weight_dtype: Data type for model weights (e.g., "float32", "bfloat16").
+        rope_min_timescale: Minimum timescale for Rotary Positional Embeddings (RoPE).
+        rope_max_timescale: Maximum timescale for Rotary Positional Embeddings (RoPE).
+    """
+
+    encoder: EncoderConfig
+    decoder: DecoderConfig
+    src_vocab_size: int = Field(default=128, gt=0)
+    tgt_vocab_size: int = Field(default=1028, gt=0)
+    dropout: float = Field(default=0.0, ge=0.0, lt=1.0)
+    normalization_layer_epsilon: float = Field(default=1.0e-5, ge=0.0)
+    weight_dtype: str = Field(default="float32", description="Weight precision")
+    rope_min_timescale: int = Field(default=1, description="Timescale for global attention")
+    rope_max_timescale: int = Field(default=10_000, description="Timescale for global attention")
+
+
+class TrainingConfig(BaseModel, frozen=True):
+    """Training process configuration and hyperparameters.
+
+    Note: This configuration currently only includes precision settings.
+    Other training parameters (like batch size, learning rate, optimizer settings)
+    are assumed to be handled externally.
+
+    Attributes:
+        dtype: Data type for activations during training (e.g., "bfloat16", "float32").
+        logits_dot_in_fp32: Whether to compute the final logits dot product in fp32 for stability.
+    """
+
+    dtype: str = Field(default="bfloat16", description="Activation precision")
+    logits_dot_in_fp32: bool = Field(default=False)
+
+
+class DiaConfig(BaseModel, frozen=True):
+    """Master configuration for the Dia model.
+
+    Combines all sub-configurations into a single validated object.
+
+    Attributes:
+        version: Configuration version string.
+        model: Model architecture configuration.
+        training: Training process configuration (precision settings).
+        data: Data loading and processing configuration.
+    """
+
+    version: str = Field(default="1.0")
+    model: ModelConfig
+    training: TrainingConfig
+    data: DataConfig
+
+    def save(self, path: str) -> None:
+        """Save the current configuration instance to a JSON file.
+
+        Ensures the parent directory exists and the file has a .json extension.
+
+        Args:
+            path: The target file path to save the configuration.
+
+        Raises:
+            ValueError: If the path is not a file with a .json extension.
+        """
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        config_json = self.model_dump_json(indent=2)
+        with open(path, "w") as f:
+            f.write(config_json)
+
+    @classmethod
+    def load(cls, path: str) -> "DiaConfig | None":
+        """Load and validate a Dia configuration from a JSON file.
+
+        Args:
+            path: The path to the configuration file.
+
+        Returns:
+            A validated DiaConfig instance if the file exists and is valid,
+            otherwise None if the file is not found.
+
+        Raises:
+            ValueError: If the path does not point to an existing .json file.
+            pydantic.ValidationError: If the JSON content fails validation against the DiaConfig schema.
+        """
+        try:
+            with open(path, "r") as f:
+                content = f.read()
+            return cls.model_validate_json(content)
+        except FileNotFoundError:
+            return None
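
A minimal usage sketch of the config round-trip (the target path is a placeholder; note that save() expects a path that has a parent directory):

    from dia.config import DiaConfig

    cfg = DiaConfig.load("dia/config.json")   # returns None if the file is missing
    if cfg is not None:
        print(cfg.model.decoder.n_layer, cfg.data.delay_pattern)
        cfg.save("/tmp/dia/config_copy.json")  # re-serialized, validated JSON
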
dia/config_inference.json ADDED
@@ -0,0 +1,49 @@
+{
+  "version": "0.1",
+  "model": {
+    "encoder": {
+      "n_layer": 12,
+      "n_embd": 1024,
+      "n_hidden": 4096,
+      "n_head": 16,
+      "head_dim": 128
+    },
+    "decoder": {
+      "n_layer": 18,
+      "n_embd": 2048,
+      "n_hidden": 8192,
+      "gqa_query_heads": 16,
+      "cross_query_heads": 16,
+      "kv_heads": 4,
+      "gqa_head_dim": 128,
+      "cross_head_dim": 128,
+      "d_model": 256
+    },
+    "src_vocab_size": 256,
+    "tgt_vocab_size": 1028,
+    "dropout": 0.0
+  },
+  "training": {
+    "dtype": "float32"
+  },
+  "data": {
+    "text_length": 512,
+    "audio_length": 1536,
+    "channels": 9,
+    "text_pad_value": 0,
+    "audio_eos_value": 1024,
+    "audio_pad_value": 1025,
+    "audio_bos_value": 1026,
+    "delay_pattern": [
+      0,
+      8,
+      9,
+      10,
+      11,
+      12,
+      13,
+      14,
+      15
+    ]
+  }
+}
dia/convert_ckpt.py ADDED
@@ -0,0 +1,54 @@
+import argparse
+
+import torch
+
+from dia.layers import DiaModel  # adjust your import if needed
+from dia.config import DiaConfig
+
+
+def convert_checkpoint(input_ckpt: str, output_ckpt: str, config_path: str):
+    # select device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # 1) Reconstruct exactly the same compiled model you saved
+    dia_cfg = DiaConfig.load(config_path)
+    model = DiaModel(dia_cfg).to(device)
+    model = model.half()
+    model = torch.compile(model, backend="inductor")
+
+    # 2) Load your compiled/half checkpoint
+    state = torch.load(input_ckpt, map_location=device)
+    model.load_state_dict(state)
+
+    # 3) Unwrap to the original nn.Module
+    orig = getattr(model, "_orig_mod", None) or getattr(model, "__wrapped__", None) or model
+
+    # 4) Cast all params & buffers back to float32
+    orig.float()
+
+    # 5) Save its clean, float32 state_dict
+    torch.save(orig.state_dict(), output_ckpt)
+    print(f"Saved normal FP32 checkpoint to {output_ckpt}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert a compiled/half-precision checkpoint back to a standard FP32 state_dict."
+    )
+    parser.add_argument(
+        "--input-ckpt", "-i",
+        required=True,
+        help="Path to the half-precision compiled checkpoint (.pth) to load",
+    )
+    parser.add_argument(
+        "--output-ckpt", "-o",
+        required=True,
+        help="Path where the FP32 state_dict will be saved",
+    )
+    parser.add_argument(
+        "--config", "-c",
+        required=True,
+        help="Path to your DiaConfig JSON file",
+    )
+
+    args = parser.parse_args()
+    convert_checkpoint(args.input_ckpt, args.output_ckpt, args.config)
+
+
+if __name__ == "__main__":
+    main()
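
An illustrative programmatic invocation (file names are placeholders; the input must be a checkpoint saved from the same half-precision, torch.compile-wrapped model this script reconstructs):

    from dia.convert_ckpt import convert_checkpoint

    convert_checkpoint(
        input_ckpt="ckpt_step2000.pth",  # compiled/fp16 checkpoint from fine-tuning
        output_ckpt="dia_fp32.pth",      # plain FP32 state_dict, loadable without torch.compile
        config_path="dia/config.json",
    )
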
dia/dataset.py ADDED
@@ -0,0 +1,162 @@
+from pathlib import Path
+
+import torch
+import torchaudio
+import pandas as pd
+from torch.utils.data import Dataset
+
+import dac
+from .config import DiaConfig
+
+
+class LocalDiaDataset(Dataset):
+    """Dataset loader from a local CSV (pipe-separated) and audio folder."""
+
+    def __init__(self, csv_path: Path, audio_root: Path, config: DiaConfig, dac_model: dac.DAC):
+        self.df = pd.read_csv(csv_path, sep=r"\s*\|\s*", engine="python", names=["audio", "text", "channel"])
+        self.audio_root = audio_root
+        self.config = config
+        self.dac_model = dac_model
+
+    def __len__(self) -> int:
+        return len(self.df)
+
+    def __getitem__(self, idx: int):
+        row = self.df.iloc[idx]
+        text = row["text"]
+        channel = row.get("channel", None)
+        if channel and pd.notna(channel):
+            text = f"[{channel}]{text}"
+
+        audio_path = self.audio_root / row["audio"]
+        waveform, sr = torchaudio.load(audio_path)
+
+        if sr != 44100:
+            waveform = torchaudio.functional.resample(waveform, sr, 44100)
+
+        if waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0)
+        elif waveform.ndim == 2:
+            waveform = waveform[:1]  # only take 1 channel if stereo
+
+        waveform = waveform.unsqueeze(0)  # (1, 1, T)
+        with torch.no_grad():
+            audio_tensor = self.dac_model.preprocess(waveform, 44100).to(
+                next(self.dac_model.parameters()).device
+            )
+            _, encoded, *_ = self.dac_model.encode(audio_tensor, n_quantizers=None)
+            encoded = encoded.squeeze(0).transpose(0, 1)  # (T, C)
+
+        return text, encoded, waveform
+
+
+class HFDiaDataset(Dataset):
+    """Dataset loader from a Hugging Face Datasets object."""
+
+    def __init__(self, hf_dataset, config: DiaConfig, dac_model: dac.DAC):
+        self.dataset = hf_dataset
+        self.config = config
+        self.dac_model = dac_model
+
+    def __len__(self) -> int:
+        return len(self.dataset)
+
+    def __getitem__(self, idx: int):
+        sample = self.dataset[idx]
+
+        # Handle the text tag
+        text = sample["text"]
+        channel = sample.get("channel", None)
+        lang = sample.get("language", None)
+
+        if channel and isinstance(channel, str) and channel.strip():
+            text = f"[{channel}]{text}"
+        elif lang and isinstance(lang, str):
+            text = f"[{lang}]{text}"
+
+        # Handle the audio
+        audio_info = sample["audio"]
+        waveform = torch.tensor(audio_info["array"], dtype=torch.float32)
+
+        # Ensure waveform shape (1, 1, T)
+        if waveform.ndim == 1:
+            waveform = waveform.unsqueeze(0).unsqueeze(0)
+        elif waveform.ndim == 2:
+            waveform = waveform[:1].unsqueeze(0)  # take the first channel only
+
+        # Resample if not 44100 Hz
+        sr = audio_info.get("sampling_rate", 44100)
+        if sr != 44100:
+            waveform = torchaudio.functional.resample(waveform, sr, 44100)
+
+        with torch.no_grad():
+            audio_tensor = self.dac_model.preprocess(waveform, 44100).to(next(self.dac_model.parameters()).device)
+            _, encoded, *_ = self.dac_model.encode(audio_tensor, n_quantizers=None)
+            encoded = encoded.squeeze(0).transpose(0, 1)  # (T, C)
+
+        return text, encoded, waveform
+
+
+class HFDiaIterDataset(torch.utils.data.IterableDataset):
+    """Iterable wrapper for a HF streaming Dataset that has `audio.array` & `text`."""
+
+    def __init__(self, hf_iterable, config: DiaConfig, dac_model: dac.DAC):
+        super().__init__()
+        self.dataset = hf_iterable
+        self.config = config
+        self.dac_model = dac_model
+
+    def __iter__(self):
+        for sample in self.dataset:
+            lang = sample.get("language", None)
+            # Get the channel info and normalize it
+            channel = sample.get("channel", "").replace("@", "").lower()
+            speaker_tag = f"[{channel}]" if channel else "[unk]"
+            # Prepend the speaker tag to the text
+            text = speaker_tag + sample["text"]
+            audio_info = sample["audio"]
+            waveform = torch.tensor(audio_info["array"], dtype=torch.float32)
+            if waveform.ndim == 1:
+                waveform = waveform.unsqueeze(0).unsqueeze(0)
+            elif waveform.ndim == 2:
+                waveform = waveform.unsqueeze(0)
+            sr = audio_info.get("sampling_rate", 44100)
+            if sr != 44100:
+                waveform = torchaudio.functional.resample(waveform, sr, 44100)
+            with torch.no_grad():
+                audio_tensor = (
+                    self.dac_model.preprocess(waveform, 44100)
+                    .to(next(self.dac_model.parameters()).device)
+                )
+                _, encoded, *_ = self.dac_model.encode(audio_tensor, n_quantizers=None)
+                encoded = encoded.squeeze(0).transpose(0, 1)
+            yield text, encoded, waveform
+
+
+class VietnameseDiaDataset(HFDiaIterDataset):
+    def __init__(self, dataset, dia_cfg, dac_model):
+        super().__init__(dataset, dia_cfg, dac_model)
+
+    def __getitem__(self, idx):
+        item = self.dataset[idx]
+        # 1) Add the [vi] language tag
+        text = item["text"]
+        if not text.startswith("[vi]"):
+            text = f"[vi]{text}"
+
+        # 2) Resample the audio to 44.1 kHz
+        audio_array = item["audio"]["array"]
+        sr = item["audio"]["sampling_rate"]
+        if sr != 44100:
+            audio_array = torchaudio.functional.resample(
+                torch.tensor(audio_array),
+                orig_freq=sr,
+                new_freq=44100,
+            ).numpy()
+
+        # 3) DAC-encode the audio segment, following the same pattern as the
+        #    classes above (the original called a missing `get_dac_encoding` helper)
+        waveform = torch.tensor(audio_array, dtype=torch.float32).view(1, 1, -1)
+        with torch.no_grad():
+            audio_tensor = self.dac_model.preprocess(waveform, 44100).to(
+                next(self.dac_model.parameters()).device
+            )
+            _, encoded, *_ = self.dac_model.encode(audio_tensor, n_quantizers=None)
+        encoding = encoded.squeeze(0).transpose(0, 1)  # (T, C)
+
+        return text, encoding, audio_array
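
A minimal sketch of wiring up the local CSV loader (paths are placeholders; the CSV holds pipe-separated `audio|text|channel` rows with audio paths relative to the audio root):

    from pathlib import Path

    import dac
    from dia.config import DiaConfig
    from dia.dataset import LocalDiaDataset

    dia_cfg = DiaConfig.load("dia/config.json")
    dac_model = dac.DAC.load(dac.utils.download())
    ds = LocalDiaDataset(Path("metadata.csv"), Path("audio/"), dia_cfg, dac_model)
    text, codes, waveform = ds[0]  # codes: (T, C) DAC codebook indices
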
dia/finetune.py ADDED
@@ -0,0 +1,787 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import random
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ import torch
10
+ import torchaudio
11
+ import pandas as pd
12
+ from torch.utils.data import Dataset, DataLoader, random_split
13
+ from torch.cuda.amp import autocast
14
+ from torch.utils.tensorboard import SummaryWriter
15
+ from torch.nn.utils import clip_grad_norm_
16
+ from transformers import get_scheduler
17
+ import torch.nn.functional as F
18
+ import bitsandbytes as bnb
19
+ from tqdm import tqdm
20
+ from datasets import load_dataset, interleave_datasets, get_dataset_config_names, DatasetDict
21
+ from huggingface_hub import hf_hub_download
22
+ import math
23
+ import gc
24
+ from torch.cuda.amp import GradScaler
25
+
26
+ import dac
27
+ from .config import DiaConfig
28
+ from .layers import DiaModel
29
+ from .model import Dia
30
+ from .audio import build_delay_indices, apply_audio_delay
31
+ from .dataset import *
32
+ from .interleaved_datasets import load_cml_tts_streamed, load_common_voice17_streamed
33
+ from datasets import load_from_disk
34
+ from .dataset import HFDiaDataset
35
+ from tqdm import tqdm
36
+
37
+ # Configure logging
38
+ logging.basicConfig(
39
+ level=logging.INFO,
40
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
41
+ )
42
+ logger = logging.getLogger(__name__)
43
+
44
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
+ torch.backends.cudnn.benchmark = True
46
+
47
+ #bytes for language tag replacement
48
+ LANG2BYTE = {
49
+ "en": 3,
50
+ "vi": 19,
51
+ }
52
+
53
+ CHANNELS = [
54
+ "5phutcrypto",
55
+ "anhbanthan",
56
+ "anhthamtu",
57
+ "animerewind.official",
58
+ "bibitv8888",
59
+ "btvgo",
60
+ "baclieutv",
61
+ "bachhoaxanhcom",
62
+ "baodientuvov",
63
+ "blvckvines",
64
+ "boringppl",
65
+ "bronub",
66
+ "cdteam-why",
67
+ "cobabinhduong",
68
+ "cosmicwriter",
69
+ "cuthongthai",
70
+ "daiphatthanhtruyenhinhsonla",
71
+ "day-be-thong-minh-tv",
72
+ "danangtv",
73
+ "daihanoi-htv",
74
+ "daiptththainguyentntv",
75
+ "dongmauviet",
76
+ "dongthaptv",
77
+ "fptbongdaofficial",
78
+ "fonosvietnam",
79
+ "hieurotrong5phut-ntkt",
80
+ "htvtintuc",
81
+ "happyhidari",
82
+ "hoabinhtvgo",
83
+ "hocenglishonline",
84
+ "hocvienbovagau",
85
+ "hungyentvvngo",
86
+ "huynhduykhuongofficial",
87
+ "huynhlapofficial",
88
+ "jvevermind",
89
+ "kenhvtc16",
90
+ "kiengiangtv",
91
+ "khanhvyofficial",
92
+ "kienthucquansu",
93
+ "lamdongtv1",
94
+ "lamvlog",
95
+ "longantv-la34",
96
+ "mangovid",
97
+ "mensbay",
98
+ "meovatcuocsonglnv",
99
+ "meuchannel",
100
+ "ntnvlogsnguyenthanhnam",
101
+ "ngamradio",
102
+ "nhanhac555",
103
+ "nhantaidaiviet",
104
+ "ptth-trt",
105
+ "ptvtruyenhinhphutho",
106
+ "phantichgame",
107
+ "phephim",
108
+ "phimhottk-l",
109
+ "riwaylegal",
110
+ "ruangao",
111
+ "suckhoetamsinh",
112
+ "sachbiquyethanhcong",
113
+ "soisangbrightsidevietnamese",
114
+ "spiderum",
115
+ "spiderumbooks",
116
+ "sukieskitchen",
117
+ "tin3phut",
118
+ "tranthanhtown",
119
+ "tulemientay",
120
+ "tayninhtv",
121
+ "thainhitv",
122
+ "thanhpahm",
123
+ "thegioilaptop",
124
+ "thepresentwriter",
125
+ "tiengiangtivi",
126
+ "tieubaobaothom",
127
+ "tintucbitcoin247",
128
+ "truyenhinhbinhphuoc-bptv",
129
+ "truyenhinhyenbaiytv",
130
+ "truyenhinhcaobang",
131
+ "truyenhinhdaklakdrt",
132
+ "truyenhinhdaknong1",
133
+ "truyenhinhdienbien23.9",
134
+ "truyenhinhkhanhhoa",
135
+ "truyenhinhkontumkrt",
136
+ "truyenhinhnaminhntv",
137
+ "truyenhinhninhthuan",
138
+ "truyenhinhquangngai",
139
+ "tuantienti2911",
140
+ "tuyenquangttv",
141
+ "vovlivedoctruyen",
142
+ "vietcetera",
143
+ "vinhlongtv",
144
+ "voizfm",
145
+ "vutrunguyenthuy",
146
+ "vuive",
147
+ "w2wanime",
148
+ "w2wcartoon",
149
+ "w2whorror",
150
+ "w2wmovie",
151
+ "web5ngay",
152
+ "xanh24h",
153
+ "aiphatthanhtruyenhinhquangtri",
154
+ "aiphatthanhvatruyenhinhhai1908",
155
+ "altonghop",
156
+ "antvtruyenhinhcongannhandan",
157
+ "baihoc10phut",
158
+ "battlecry.khampha",
159
+ "betterversionvn",
160
+ "blogkhoinghiep",
161
+ "bumcn",
162
+ "caikinhdi_vn",
163
+ "canthitg",
164
+ "chanthienmybachnien",
165
+ "chauanhchao",
166
+ "cosu",
167
+ "cungmaivaobep-monan-amthuc",
168
+ "daiptthphuyen",
169
+ "daiptthtv",
170
+ "daitruyenhinhangiang",
171
+ "daitruyenhinhbacgiang",
172
+ "dannytran2375",
173
+ "daybehoc5489",
174
+ "daylaphegame",
175
+ "dienmay",
176
+ "ducisreal",
177
+ "duongfg",
178
+ "duyluandethuong",
179
+ "duythanhish",
180
+ "elroydevops",
181
+ "gc.gamelab",
182
+ "hacthaybachthay",
183
+ "hagiangtv475",
184
+ "haiduongtv247",
185
+ "hanamtv8831",
186
+ "hangphimtailieudienanhnd",
187
+ "haugiangtv",
188
+ "haunauday",
189
+ "hieu-tv",
190
+ "hoshiphan",
191
+ "jakinatsumi2915",
192
+ "kechuyentieuhoc1719",
193
+ "kenhcovan",
194
+ "khalid_dinh",
195
+ "kiaralah",
196
+ "laichautv",
197
+ "langsontvtube",
198
+ "megame_official",
199
+ "minvestvn",
200
+ "nguoithanhcong1991",
201
+ "nhatkycuocsong.",
202
+ "ntcanima",
203
+ "ptthbentre",
204
+ "ptthquangbinh",
205
+ "qrt",
206
+ "quangninhtv",
207
+ "snewsvn",
208
+ "soctrangtv",
209
+ "sunhuynpodcast",
210
+ "tamhonanuong",
211
+ "tgddreview",
212
+ "thaibinhtv",
213
+ "thanhnamedu",
214
+ "thanhnientvnews",
215
+ "thbrt",
216
+ "thieunhitv3630",
217
+ "thtpct",
218
+ "tinnhanh3phut868",
219
+ "toansam",
220
+ "toidicodedaoblog",
221
+ "tranquochuywecommit",
222
+ "tranvyvy",
223
+ "truyenhinh4k",
224
+ "truyenhinhbinhthuan",
225
+ "truyenhinhcamau69",
226
+ "truyenhinhdongnai_dnrtv",
227
+ "truyenhinhgialai",
228
+ "truyenhinhlaocai",
229
+ "truyenhinhnghean",
230
+ "truyenhinhvinhphuc",
231
+ "txtofficial8798",
232
+ "vanhkhuyenle",
233
+ "vietnh1009",
234
+ "visaothenhipodcast",
235
+ "vtc14",
236
+ "vtcnow",
237
+ "vtv24",
238
+ "vuive123",
239
+ "zombiev4",
240
+ ]
241
+
242
+ # Tự động ánh xạ channel → token (bắt đầu từ 30)
243
+ for i, ch in enumerate(CHANNELS):
244
+ LANG2BYTE[ch] = 30 + i
245
+
246
+ test_sentences = {
247
+ "en": "In order to fully assess performance and the accuracy of language tags, this test sentence contains multiple subordinate clauses, varied punctuation, and a sufficient word count.",
248
+ "vi": "Để đánh giá toàn diện hiệu suất và độ chính xác của các thẻ ngôn ngữ, câu kiểm tra này chứa nhiều mệnh đề phụ, dấu câu đa dạng và số lượng từ đầy đủ."
249
+ }
250
+
251
+ @dataclass
252
+ class TrainConfig:
253
+ epochs: int = 1
254
+ batch_size: int = 2
255
+ grad_accum_steps: int = 2
256
+ learning_rate: float = 1e-5
257
+ warmup_steps: int = 500
258
+ unconditional_frac: float = 0.15
259
+ eval_step: int = 200
260
+ save_step: int = 2000
261
+ split_ratio: float = 0.997
262
+ shuffle_buffer_size: int = None # for streaming shuffle
263
+ seed: int = 42 # seed for reproducibility
264
+ runs_dir: Path = Path("runs")
265
+ run_name: str = "dia_finetune_cv"
266
+ output_dir: Path = Path(".cpkts/dia_finetune_cv ")
267
+ resume_from: Path = None
268
+ total_steps: int = 290007
269
+
270
+
271
+
272
+ def get_args() -> argparse.Namespace:
273
+ parser = argparse.ArgumentParser(description="Train the Dia audio model")
274
+ parser.add_argument("--config", type=Path, default=Path("dia/config.json"))
275
+ parser.add_argument("--dataset", type=str, default="Paradoxia/opendata-iisys-hui",
276
+ help="HuggingFace dataset name (if not using --csv_path).")
277
+ parser.add_argument("--dataset2", type=str, default=None,
278
+ help="(Optional) second HF dataset to interleave (streaming)")
279
+ parser.add_argument("--streaming",action="store_true",
280
+ help="Enable HuggingFace dataset streaming")
281
+ parser.add_argument("--hub_model", type=str, default="nari-labs/Dia-1.6B")
282
+ parser.add_argument("--local_ckpt", type=str, default=None)
283
+ parser.add_argument("--csv_path", type=Path, default=None,
284
+ help="Path to local CSV/TSV file with `audio|text` (if you want to train locally).")
285
+ parser.add_argument("--audio_root",type=Path, default=None,
286
+ help="Root directory for local audio files (required if --csv_path is set).")
287
+ parser.add_argument("--run_name", type=str, default=None)
288
+ parser.add_argument("--output_dir",type=Path, default=None)
289
+ parser.add_argument("--shuffle_buffer_size", type=int, default=None,
290
+ help="Buffer size for streaming dataset shuffle.")
291
+ parser.add_argument("--seed", type=int, default=42,
292
+ help="Random seed for reproducibility.")
293
+ parser.add_argument("--half", action="store_true", help="load model in fp16")
294
+ parser.add_argument("--compile", action="store_true", help="torch compile model")
295
+ parser.add_argument('--use_amp', action='store_true', help='Enable mixed precision')
296
+ parser.add_argument("--resume_from", type=str, default=None)
297
+ return parser.parse_args()
298
+
299
+
300
+
301
+ def collate_fn(batch, config: DiaConfig, device: torch.device):
302
+ from torch.nn.functional import pad
303
+
304
+ texts, encodings, waveforms = zip(*batch)
305
+
306
+ # -- Text inputs ---------------------------------------------------------
307
+
308
+ max_text = config.data.text_length
309
+ pad_tok = config.data.text_pad_value
310
+ text_ids = []
311
+ for txt in texts:
312
+ b_full = txt.encode('utf-8')
313
+ # replace leading "[lang]" prefix
314
+ for code, val in LANG2BYTE.items():
315
+ prefix = f"[{code}]".encode('utf-8')
316
+ if b_full.startswith(prefix):
317
+ b_full = bytes([val]) + b_full[len(prefix):]
318
+ break
319
+ bts = b_full[:max_text]
320
+ arr = list(bts) + [pad_tok] * (max_text - len(bts))
321
+ text_ids.append(torch.tensor(arr, dtype=torch.long))
322
+ src = torch.stack(text_ids).to(device)
323
+ src_pos = torch.arange(max_text, device=device).unsqueeze(0).expand(src.size(0), -1)
324
+ src_pad = src.ne(pad_tok)
325
+ enc_self_attn_mask = (src_pad.unsqueeze(2) & src_pad.unsqueeze(1)).unsqueeze(1)
326
+
327
+ # -- Audio codes --------------------------------------------------------
328
+
329
+ max_audio = config.data.audio_length
330
+ # per-sample lengths (clipped to max_audio)
331
+ seq_lens = [min(e.size(0), max_audio) for e in encodings]
332
+ batch_max = max(seq_lens)
333
+
334
+ # pad or trim each encoding to the batch max length
335
+ padded = [pad(e, (0, 0, 0, batch_max - e.size(0))) if e.size(0) < batch_max else e[:batch_max]
336
+ for e in encodings]
337
+ codes = torch.stack(padded).to(device) # (B, T=batch_max, C)
338
+
339
+ B, T, C = codes.shape
340
+ t_idx, idxs = build_delay_indices(B, T, C, config.data.delay_pattern)
341
+ delayed = apply_audio_delay(
342
+ codes,
343
+ config.data.audio_pad_value,
344
+ config.data.audio_bos_value,
345
+ (t_idx, idxs)
346
+ )
347
+ # ensure no longer than max_audio
348
+ delayed = delayed[:, :max_audio, :]
349
+
350
+ # -- Targets with per-sample EOS ----------------------------------------
351
+
352
+ max_tgt_len = max_audio + 2
353
+ pad_val = config.data.audio_pad_value
354
+ bos_val = config.data.audio_bos_value
355
+ eos_val = config.data.audio_eos_value
356
+
357
+ tgt = torch.full((B, max_tgt_len, C), pad_val, dtype=torch.long, device=device)
358
+ tgt[:, 0, :] = bos_val
359
+ tgt_lens = []
360
+ for i, L in enumerate(seq_lens):
361
+ tgt[i, 1:1 + L, :] = delayed[i, :L, :]
362
+ tgt[i, 1 + L, :] = eos_val
363
+ tgt_lens.append(1 + L + 1)
364
+
365
+ tgt_pos = torch.arange(max_tgt_len, device=device).unsqueeze(0).expand(B, -1)
366
+ tgt_pad = tgt.ne(pad_val).any(-1)
367
+
368
+ causal = torch.tril(torch.ones((max_tgt_len, max_tgt_len),
369
+ dtype=torch.bool,
370
+ device=device))
371
+ dec_self_attn_mask = (tgt_pad.unsqueeze(2) & tgt_pad.unsqueeze(1) & causal).unsqueeze(1)
372
+ dec_cross_attn_mask = (tgt_pad.unsqueeze(2) & src_pad.unsqueeze(1)).unsqueeze(1)
373
+
374
+ return {
375
+ 'src_tokens': src,
376
+ 'src_positions': src_pos,
377
+ 'enc_self_attn_mask': enc_self_attn_mask,
378
+ 'tgt_tokens': tgt,
379
+ 'tgt_positions': tgt_pos,
380
+ 'dec_self_attn_mask': dec_self_attn_mask,
381
+ 'dec_cross_attn_mask': dec_cross_attn_mask,
382
+ 'waveforms': waveforms,
383
+ 'raw_text': texts[0],
384
+ 'tgt_lens': torch.tensor(tgt_lens, dtype=torch.long, device=device),
385
+ }
386
+
387
+ def setup_loaders(dataset, dia_cfg: DiaConfig, train_cfg: TrainConfig, device):
388
+ collate = lambda b: collate_fn(b, dia_cfg, device)
389
+ if isinstance(dataset, HFDiaIterDataset):
390
+ total = getattr(dataset, "total_examples", None)
391
+ if total is None:
392
+ total = dataset.dataset.info.splits["train"].num_examples
393
+ n_train = int(train_cfg.split_ratio * total)
394
+ n_val = total - n_train
395
+ if n_val <= 0:
396
+ raise RuntimeError(f"No validation samples: total={total}, split_ratio={train_cfg.split_ratio}")
397
+ base = dataset.dataset.shuffle(buffer_size=train_cfg.shuffle_buffer_size, seed=train_cfg.seed) if train_cfg.shuffle_buffer_size else dataset.dataset
398
+ val_stream = base.take(n_val)
399
+ train_stream = base.skip(n_val)
400
+ train_ds = HFDiaIterDataset(train_stream, dia_cfg, dataset.dac_model)
401
+ val_ds = HFDiaIterDataset(val_stream, dia_cfg, dataset.dac_model)
402
+ train_loader = DataLoader(train_ds, batch_size=train_cfg.batch_size, shuffle=False, collate_fn=collate)
403
+ train_loader.steps_per_epoch = math.ceil(n_train / train_cfg.batch_size)
404
+ val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, collate_fn=collate)
405
+ return train_loader, val_loader
406
+ ds_len = len(dataset)
407
+ n_train = int(train_cfg.split_ratio * ds_len)
408
+ train_ds, val_ds = random_split(dataset, [n_train, ds_len - n_train])
409
+ train_loader = DataLoader(train_ds, batch_size=train_cfg.batch_size, shuffle=True, collate_fn=collate)
410
+ val_loader = DataLoader(val_ds, batch_size=1, shuffle=False, collate_fn=collate)
411
+ return train_loader, val_loader
412
+
413
+
414
+
415
+ def setup_optimizer_and_scheduler(model, train_loader, train_cfg):
416
+ opt = bnb.optim.AdamW8bit(model.parameters(), lr=train_cfg.learning_rate)
417
+ # Determine steps per epoch: prefer len(), else use attached attribute
418
+ try:
419
+ steps_per_epoch = len(train_loader)
420
+ except TypeError:
421
+ if hasattr(train_loader, 'steps_per_epoch'):
422
+ steps_per_epoch = train_loader.steps_per_epoch
423
+ else:
424
+ raise RuntimeError("Cannot determine steps_per_epoch for streaming loader")
425
+ total_training_steps = steps_per_epoch * train_cfg.epochs
426
+ sched = get_scheduler(
427
+ 'cosine', opt,
428
+ num_warmup_steps=train_cfg.warmup_steps / train_cfg.grad_accum_steps,
429
+ num_training_steps=total_training_steps / train_cfg.grad_accum_steps
430
+ )
431
+ return opt, sched
432
+
433
+
434
+
435
+ def train_step(model, batch, dia_cfg, train_cfg, opt, sched, writer, step_in_epoch, global_step,scaler):
436
+ """
437
+ Perform a single training step: forward, loss, backward, update, log.
438
+ Now uses per‑sample tgt_lens to mask out padding after each EOS,
439
+ and applies 4× loss weight on the first channel.
440
+ """
441
+ # (optional) unconditional conditioning
442
+ if random.random() < train_cfg.unconditional_frac:
443
+ pad_tok = dia_cfg.data.text_pad_value
444
+ batch['src_tokens'] = torch.zeros_like(batch['src_tokens'])
445
+ batch['enc_self_attn_mask'] = torch.zeros_like(batch['enc_self_attn_mask'])
446
+ batch['dec_cross_attn_mask'] = torch.zeros_like(batch['dec_cross_attn_mask'])
447
+
448
+ with autocast(dtype=torch.float16):
449
+ # forward pass
450
+ logits = model(
451
+ src_BxS=batch['src_tokens'],
452
+ tgt_BxTxC=batch['tgt_tokens'],
453
+ src_positions=batch['src_positions'],
454
+ tgt_positions=batch['tgt_positions'],
455
+ enc_self_attn_mask=batch['enc_self_attn_mask'],
456
+ dec_self_attn_mask=batch['dec_self_attn_mask'],
457
+ dec_cross_attn_mask=batch['dec_cross_attn_mask'],
458
+ enable_dropout=True,
459
+ )
460
+ # fetch per-sample target‑lengths (including BOS+frames+EOS)
461
+ lens = batch['tgt_lens'] # shape: (B,)
462
+ max_L = int(lens.max().item()) # maximum over batch
463
+
464
+ # keep only up through the last possible EOS slot
465
+ # logits: (B, T, C, V) -> (B, max_L-1, C, V)
466
+ logits = logits[:, : max_L - 1]
467
+
468
+ # targets: shift off the BOS so 0..<max_L-1> align with logits
469
+ # target: (B, T, C) -> (B, max_L-1, C)
470
+ target = batch['tgt_tokens'][:, 1:max_L, :]
471
+
472
+ B, Tm1, C = target.shape
473
+ pad_val = dia_cfg.data.audio_pad_value
474
+
475
+ # build a mask [B x (max_L-1)] that is True for t < (lens[i]-1)
476
+ time_idx = torch.arange(Tm1, device=lens.device).unsqueeze(0) # (1, Tm1)
477
+ valid_time = time_idx < (lens.unsqueeze(1) - 1) # (B, Tm1)
478
+ mask = valid_time.unsqueeze(-1).expand(-1, -1, C) # (B, Tm1, C)
479
+
480
+ # apply 4× weight on first channel, 1× on others
481
+ channel_weights = [4.0] + [1.0] * (C - 1)
482
+ loss_c = 0.0
483
+ _, _, _, V = logits.size()
484
+
485
+ for c, w in enumerate(channel_weights):
486
+ # flatten this channel
487
+ lc = logits[:, :, c, :].reshape(-1, V) # (B*Tm1, V)
488
+ tc = target[:, :, c].reshape(-1) # (B*Tm1,)
489
+ mc = mask[:, :, c].reshape(-1) # (B*Tm1,)
490
+
491
+ # mask out padding and compute cross-entropy
492
+ lc_valid = lc[mc]
493
+ tc_valid = tc[mc]
494
+ loss_c += w * F.cross_entropy(
495
+ lc_valid, tc_valid,
496
+ ignore_index=pad_val
497
+ )
498
+
499
+ # normalize by sum of weights
500
+ loss = loss_c / sum(channel_weights)
501
+
502
+ # scale + backward
503
+ loss = loss / train_cfg.grad_accum_steps
504
+ scaler.scale(loss).backward()
505
+
506
+
507
+ # step & log
508
+
509
+ if (step_in_epoch + 1) % train_cfg.grad_accum_steps == 0:
510
+ # Unscale before clipping
511
+ scaler.unscale_(opt)
512
+ grad_norm = clip_grad_norm_(model.parameters(), max_norm=1e9)
513
+
514
+ scaler.step(opt)
515
+ scaler.update()
516
+ opt.zero_grad()
517
+ sched.step()
518
+
519
+ true_loss = loss.item() * train_cfg.grad_accum_steps
520
+ current_lr = sched.get_last_lr()[0]
521
+
522
+ writer.add_scalar('GradNorm/global', grad_norm, global_step)
523
+ writer.add_scalar('LR', current_lr, global_step)
524
+ writer.add_scalar('Loss/train', true_loss, global_step)
525
+
526
+ return true_loss
527
+ else:
528
+ return loss.item() * train_cfg.grad_accum_steps
529
+
530
+
531
+
532
+
533
+ def eval_step(model, val_loader, dia_cfg, dac_model, writer, global_step):
534
+ """
535
+ Run evaluation: compute average loss on validation set and log audio samples.
536
+ """
537
+ import gc
538
+ eval_losses = []
539
+ last_batch = None
540
+ with torch.inference_mode():
541
+ for eb in tqdm(val_loader, desc="eval"):
542
+ last_batch = eb
543
+
544
+ with autocast(dtype=torch.float16):
545
+ logits16 = model(
546
+ src_BxS=eb['src_tokens'],
547
+ tgt_BxTxC=eb['tgt_tokens'],
548
+ src_positions=eb['src_positions'],
549
+ tgt_positions=eb['tgt_positions'],
550
+ enc_self_attn_mask=eb['enc_self_attn_mask'],
551
+ dec_self_attn_mask=eb['dec_self_attn_mask'],
552
+ dec_cross_attn_mask=eb['dec_cross_attn_mask'],
553
+ enable_dropout=False,
554
+ )[:, :-1]
555
+
556
+ logits = logits16.float()
557
+ target = eb['tgt_tokens'][:, 1:]
558
+ B_e, T_e, C_e = target.shape
559
+ V_e = logits.size(-1)
560
+
561
+ loss_e = 0.0
562
+ weights_e = [4.0] + [1.0] * (C_e - 1)
563
+ for c, w in enumerate(weights_e):
564
+ lc = logits[:, :, c, :].reshape(-1, V_e)
565
+ tc = target[:, :, c].reshape(-1)
566
+ loss_e += w * F.cross_entropy(
567
+ lc, tc, ignore_index=dia_cfg.data.audio_pad_value
568
+ )
569
+ loss_e = loss_e / sum(weights_e)
570
+
571
+ eval_losses.append(loss_e)
572
+
573
+ avg_eval_loss = sum(eval_losses) / len(eval_losses)
574
+ writer.add_scalar('Loss/eval', avg_eval_loss.item(), global_step)
575
+
576
+ # --- Inference test sentence ---
577
+ try:
578
+ orig_dtype = next(model.parameters()).dtype
579
+ model = model.float()
580
+ dia_gen = Dia(dia_cfg, device)
581
+ dia_gen.model, dia_gen.dac_model = model, dac_model
582
+
583
+ # ✅ Test câu hội thoại đa giọng
584
+ test_dialogue = "[vtv24] Em vừa đi học về, anh ạ. [duongfg] Ừ, em ăn cơm chưa? [vtv24] Em ăn rồi!"
585
+
586
+ if len(test_dialogue) > 10:
587
+ try:
588
+ audio = dia_gen.generate(text=test_dialogue)
589
+ writer.add_audio("Eval/test_dialogue", audio, global_step, 44100)
590
+ except Exception:
591
+ logger.exception("Eval error during test_dialogue")
592
+ finally:
593
+ if 'audio' in locals():
594
+ del audio
595
+
596
+
597
+ except Exception:
598
+ logger.exception("Eval error")
599
+
600
+ finally:
601
+ if 'audio' in locals():
602
+ del audio
603
+ gc.collect()
604
+ torch.cuda.empty_cache()
605
+ if orig_dtype == torch.float16:
606
+ model = model.half()
607
+
608
+ def train(model, dia_cfg: DiaConfig, dac_model: dac.DAC, dataset, train_cfg: TrainConfig):
609
+ """
610
+ Run the full training loop over epochs.
611
+ """
612
+ # prepare directories
613
+ train_cfg.output_dir.mkdir(parents=True, exist_ok=True)
614
+ (train_cfg.runs_dir / train_cfg.run_name).mkdir(parents=True, exist_ok=True)
615
+ model = model.to(device)
616
+
617
+ train_loader, val_loader = setup_loaders(dataset, dia_cfg, train_cfg, device)
618
+ opt, sched = setup_optimizer_and_scheduler(model, train_loader, train_cfg)
619
+
620
+ writer = SummaryWriter(train_cfg.runs_dir / train_cfg.run_name)
621
+ model.train()
622
+ scaler = GradScaler()
623
+ start_epoch = 0
624
+ global_step = 0
625
+ resume_ckpt = getattr(train_cfg, "resume_from", None)
626
+ if resume_ckpt and resume_ckpt.exists():
627
+ logger.info(f"Resuming from checkpoint: {resume_ckpt}")
628
+ checkpoint = torch.load(resume_ckpt, map_location=device)
629
+ model.load_state_dict(checkpoint["model"])
630
+ opt.load_state_dict(checkpoint["optimizer"])
631
+ sched.load_state_dict(checkpoint["scheduler"])
632
+ scaler.load_state_dict(checkpoint["scaler"])
633
+ start_epoch = checkpoint.get("epoch", 0)
634
+ global_step = checkpoint.get("global_step", 0)
635
+
636
+
637
+ steps_per_epoch = getattr(train_loader, 'steps_per_epoch', None)
638
+ if steps_per_epoch is None:
639
+ try:
640
+ steps_per_epoch = len(train_loader)
641
+ except Exception:
642
+ steps_per_epoch = None
643
+
644
+ for epoch in range(start_epoch, train_cfg.epochs):
645
+ # iterate with progress bar, using total if known
646
+ loader_iter = tqdm(
647
+ train_loader,
648
+ desc=f"E{epoch+1}",
649
+ total=steps_per_epoch
650
+ )
651
+ pbar = tqdm(loader_iter, total=train_cfg.total_steps, initial=global_step, desc=f"E{epoch}")
652
+ for step_in_epoch, batch in enumerate(pbar):
653
+ global_step += 1
654
+ # training step
655
+ loss = train_step(model, batch, dia_cfg, train_cfg, opt, sched, writer, step_in_epoch, global_step, scaler)
656
+
657
+ cur_alloc = torch.cuda.memory_allocated() # bytes currently allocated by tensors
658
+ peak_alloc = torch.cuda.max_memory_allocated() # bytes peak during program
659
+ # optionally convert to GB
660
+ cur_gb = cur_alloc / 1024**3
661
+ peak_gb = peak_alloc / 1024**3
662
+
663
+ # update the tqdm postfix
664
+ loader_iter.set_postfix({
665
+ 'loss': f"{loss:.4f}",
666
+ 'VRAM (GB)': f"{cur_gb:.2f}/{peak_gb:.2f}"
667
+ })
668
+
669
+ # remember to zero the peak if you want rolling peaks per step
670
+ if torch.cuda.is_available():
671
+ torch.cuda.reset_peak_memory_stats()
672
+
673
+
674
+ # evaluation
675
+ if step_in_epoch % train_cfg.eval_step == 0:
676
+ model.eval()
677
+ with torch.no_grad():
678
+ eval_step(model, val_loader, dia_cfg, dac_model, writer, global_step)
679
+ model.train()
680
+ scaler = GradScaler()
681
+
682
+ # checkpoint
683
+ if step_in_epoch and step_in_epoch % train_cfg.save_step == 0:
684
+ ckpt = train_cfg.output_dir / f"ckpt_step{global_step}.pth"
685
+ torch.save({
686
+ "model": model.state_dict(),
687
+ "optimizer": opt.state_dict(),
688
+ "scheduler": sched.state_dict(),
689
+ "scaler": scaler.state_dict(),
690
+ "epoch": epoch,
691
+ "global_step": global_step
692
+ }, ckpt)
693
+ logger.info(f"Saved checkpoint: {ckpt}")
694
+
695
+ # end of epoch checkpoint
696
+ ckpt_e = train_cfg.output_dir / f"ckpt_epoch{epoch+1}.pth"
697
+ torch.save({
698
+ "model": model.state_dict(),
699
+ "optimizer": opt.state_dict(),
700
+ "scheduler": sched.state_dict(),
701
+ "scaler": scaler.state_dict(),
702
+ "epoch": epoch + 1,
703
+ "global_step": global_step
704
+ }, ckpt_e)
705
+ logger.info(f"Saved end-of-epoch checkpoint: {ckpt_e}")
706
+
707
+ from datasets import disable_caching
708
+
709
+ def main():
710
+ args = get_args()
711
+ import os
712
+ os.environ["HF_DATASETS_CACHE"] = "/tmp/force_streaming" # ép cache mới
713
+ disable_caching()
714
+ # tắt toàn bộ cache local HuggingFace
715
+ import json
716
+ with open(args.config, "r", encoding="utf-8") as f:
717
+ config_dict = json.load(f)
718
+
719
+ dia_cfg = DiaConfig(**config_dict)
720
+ dac_model = dac.DAC.load(dac.utils.download()).to(device)
721
+ dataset = None
722
+
723
+ if not dataset:
724
+ if args.csv_path:
725
+ if not args.audio_root:
726
+ raise ValueError("`--audio_root` must be set when using `--csv_path`")
727
+ dataset = LocalDiaDataset(args.csv_path, args.audio_root, dia_cfg, dac_model)
728
+ else:
729
+ # ✅ Check nếu dataset là đường dẫn local
730
+ if Path(args.dataset).exists():
731
+ print(f"Loading dataset from local path: {args.dataset}")
732
+ ds1 = load_from_disk(args.dataset)
733
+ if isinstance(ds1, DatasetDict):
734
+ ds1 = ds1["train"]
735
+ dataset = HFDiaDataset(ds1, dia_cfg, dac_model)
736
+ else:
737
+ print(f"Loading HuggingFace dataset: {args.dataset} (streaming)")
738
+ ds1 = load_dataset(args.dataset, split="train", streaming=True)
739
+
740
+ if args.dataset2:
741
+ ds2 = load_dataset(args.dataset2, split="train", streaming=True)
742
+ hf_ds = interleave_datasets([ds1, ds2])
743
+ dataset = HFDiaIterDataset(hf_ds, dia_cfg, dac_model)
744
+ else:
745
+ hf_ds = ds1
746
+ dataset = HFDiaIterDataset(hf_ds, dia_cfg, dac_model)
747
+
748
+
749
+
750
+ train_cfg = TrainConfig(
751
+ run_name = args.run_name or TrainConfig.run_name,
752
+ output_dir = args.output_dir or TrainConfig.output_dir,
753
+ shuffle_buffer_size = args.shuffle_buffer_size,
754
+ seed = args.seed,
755
+ )
756
+ if args.resume_from:
757
+ train_cfg.resume_from = Path(args.resume_from)
758
+ # load model checkpoint
759
+ if args.local_ckpt:
760
+ ckpt_file = args.local_ckpt
761
+ else:
762
+ ckpt_file = hf_hub_download(args.hub_model, filename="dia-v0_1.pth")
763
+ model = DiaModel(dia_cfg)
764
+ if args.half:
765
+ model = model.half()
766
+ if args.compile:
767
+ model = torch.compile(model, backend="inductor")
768
+ ckpt = torch.load(ckpt_file, map_location="cpu")
769
+ state_dict = ckpt["model"] if "model" in ckpt else ckpt
770
+ new_state_dict = {}
771
+
772
+ for k, v in state_dict.items():
773
+ if "encoder.embedding.weight" in k:
774
+ if v.shape != model.state_dict()[k].shape:
775
+ print(f"⚠️ Bỏ qua {k} do shape không khớp: {v.shape} vs {model.state_dict()[k].shape}")
776
+ continue
777
+ new_state_dict[k] = v
778
+
779
+ model.load_state_dict(new_state_dict, strict=False)
780
+
781
+
782
+ # start training
783
+ train(model, dia_cfg, dac_model, dataset, train_cfg)
784
+
785
+
786
+ if __name__ == "__main__":
787
+ main()
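
Note: the checkpoints written above store everything needed to resume. Below is a minimal sketch of the matching load side, assuming only the key layout used by the `torch.save` calls in the loop (`model`, `optimizer`, `scheduler`, `scaler`, `epoch`, `global_step`); the helper name is hypothetical, not part of this repo.

```python
import torch

def load_training_checkpoint(path, model, opt, sched, scaler, device="cpu"):
    """Restore the state dicts saved by the training loop's torch.save calls."""
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt["model"])
    opt.load_state_dict(ckpt["optimizer"])
    sched.load_state_dict(ckpt["scheduler"])
    scaler.load_state_dict(ckpt["scaler"])
    # Resume bookkeeping exactly where the save left off.
    return ckpt["epoch"], ckpt["global_step"]

# Usage with the objects constructed in main()/train():
# start_epoch, global_step = load_training_checkpoint(train_cfg.resume_from, model, opt, sched, scaler)
```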
dia/interleaved_datasets.py ADDED
@@ -0,0 +1,144 @@
1
+ from datasets import load_dataset, get_dataset_config_names, interleave_datasets, load_dataset_builder
2
+ from .dataset import HFDiaIterDataset
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download
5
+
6
+
7
+ LANG_NAME_TO_CODE = {
8
+ "dutch": "nl",
9
+ "french": "fr",
10
+ "german": "de",
11
+ "italian": "it",
12
+ "polish": "pl",
13
+ "portuguese": "pt",
14
+ "spanish": "es",
15
+ # add more if other configs appear...
16
+ }
17
+
18
+
19
+
20
+
21
+
22
+
23
+ def load_cml_tts_streamed(dia_cfg, dac_model):
24
+ """
25
+ Stream all language subsets of the CML-TTS dataset in train split,
26
+ add a `language` field, drop all except `text`, `audio`, `language`,
27
+ and interleave them into one streaming Dataset.
28
+
29
+ Returns:
30
+ datasets.IterableDataset: interleaved streaming dataset
31
+ """
32
+ # 1) Discover all language subsets
33
+ lang_configs = get_dataset_config_names("ylacombe/cml-tts")
34
+
35
+ # 2) Build one streaming subset per language, with only desired columns
36
+ streams = []
37
+ num_ex = 0
38
+ for lang in lang_configs:
39
+
40
+ iso_code = LANG_NAME_TO_CODE.get(lang, lang)
41
+ ds_stream = load_dataset(
42
+ "ylacombe/cml-tts",
43
+ name=lang,
44
+ split="train",
45
+ streaming=True
46
+ )
47
+
48
+ num_ex += ds_stream.info.splits['train'].num_examples
49
+ # keep only text, audio, and add language
50
+ def _add_lang(ex, iso=iso_code):
51
+ return {
52
+ "text": ex["text"],
53
+ "audio": ex["audio"],
54
+ "language": iso
55
+ }
56
+ ds_stream = ds_stream.map(
57
+ _add_lang,
58
+ remove_columns=[c for c in ds_stream.column_names if c not in ["text", "audio", "language"]]
59
+ )
60
+ streams.append(ds_stream)
61
+
62
+ # 3) Interleave all streams into one unified stream
63
+ interleaved = interleave_datasets(streams, stopping_strategy="all_exhausted")
64
+ ds = HFDiaIterDataset(interleaved, dia_cfg, dac_model)
65
+ ds.total_examples = num_ex
66
+ return ds
67
+
68
+
69
+
70
+
71
+
72
+
73
+ def count_tsv_rows(
74
+ repo_id: str,
75
+ subset: str,
76
+ split: str = "train",
77
+ revision: str = "main"
78
+ ) -> int:
79
+ """Download the TSV for a given subset/split and return its number of rows."""
80
+ file_path = f"transcript/{subset}/{split}.tsv"
81
+ try:
82
+ local_file = hf_hub_download(
83
+ repo_id=repo_id,
84
+ filename=file_path,
85
+ repo_type="dataset",
86
+ revision=revision
87
+ )
88
+ except Exception as e:
+ raise RuntimeError(f"Error fetching TSV metadata for {subset}/{split}") from e
90
+
91
+ df = pd.read_csv(local_file, sep="\t", low_memory=False)
92
+ return len(df)
93
+
94
+ def load_common_voice17_streamed(dia_cfg, dac_model, revision="main"):
95
+ """
96
+ Stream the train split of Common Voice 17 for the given language codes,
97
+ rename `sentence`→`text`, keep only `text`, `audio`, and `language`,
98
+ then interleave into a single streaming Dataset.
99
+
100
+ Languages loaded: en, de, fr, es, it, nl, pl, pt, tr, hu
101
+ """
102
+ repo_id = "mozilla-foundation/common_voice_17_0"
103
+ langs = ["en", "de", "fr", "es", "it", "nl", "pl", "pt", "tr", "hu"]
104
+
105
+ streams = []
106
+ row_counts = []
107
+
108
+ for lang in langs:
109
+ # 1) figure out how many rows in the TSV
110
+ n_rows = count_tsv_rows(repo_id, lang, split="train", revision=revision)
111
+ row_counts.append(n_rows)
112
+
113
+ # 2) load in streaming mode
114
+ ds_stream = load_dataset(
115
+ repo_id,
116
+ name=lang,
117
+ split="train",
118
+ streaming=True,
119
+ revision=revision
120
+ )
121
+
122
+ # 3) map to desired schema
123
+ def _prep(ex, iso=lang):
124
+ return {
125
+ "text": ex["sentence"],
126
+ "audio": ex["audio"],
127
+ "language": iso
128
+ }
129
+
130
+ ds_stream = ds_stream.map(
131
+ _prep,
132
+ remove_columns=[c for c in ds_stream.column_names if c not in ("text", "audio", "language")]
133
+ )
134
+ streams.append(ds_stream)
135
+
136
+ # 4) interleave: all_exhausted ⇒ max_length * num_streams
137
+ interleaved = interleave_datasets(streams, stopping_strategy="all_exhausted")
138
+
139
+ # 5) wrap and attach total_examples
140
+ ds = HFDiaIterDataset(interleaved, dia_cfg, dac_model)
141
+ ds.total_examples = max(row_counts) * len(langs)
142
+
143
+ return ds
144
+
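
Note: both loaders above lean on `stopping_strategy="all_exhausted"`, which restarts shorter streams until the longest one ends; that is why `total_examples` is estimated as `max(row_counts) * len(langs)`. A minimal sketch of that semantics on toy in-memory datasets (streaming datasets interleave the same way):

```python
from datasets import Dataset, interleave_datasets

short_ds = Dataset.from_dict({"text": ["a", "b"]})
long_ds = Dataset.from_dict({"text": ["1", "2", "3", "4", "5"]})

# "all_exhausted" oversamples the short stream until the long one is spent,
# so the result has max(length) * num_streams = 5 * 2 = 10 rows.
mixed = interleave_datasets([short_ds, long_ds], stopping_strategy="all_exhausted")
print([ex["text"] for ex in mixed])
# ['a', '1', 'b', '2', 'a', '3', 'b', '4', 'a', '5']
```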
dia/layers.py ADDED
@@ -0,0 +1,909 @@
1
+ from typing import Any
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torch import Tensor
7
+ from torch.nn import RMSNorm
8
+
9
+ from .config import DiaConfig
10
+
11
+
12
+ def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
13
+ return tuple(ax if ax >= 0 else ndim + ax for ax in axes)
14
+
15
+
16
+ def _str_to_dtype(dtype_str: str | None) -> torch.dtype | None:
17
+ # Allow None for default behavior
18
+ if dtype_str is None or dtype_str.lower() == "none":
19
+ return None
20
+ if dtype_str == "float32":
21
+ return torch.float32
22
+ elif dtype_str == "float16":
23
+ return torch.float16
24
+ elif dtype_str == "bfloat16":
25
+ return torch.bfloat16
26
+ else:
27
+ raise ValueError(f"Unsupported dtype string: {dtype_str}")
28
+
29
+
30
+ class DenseGeneral(nn.Module):
31
+ """
32
+ PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
33
+
34
+ Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
35
+ for the generalized matrix multiplication. The kernel shape is calculated
+ and the parameter created during initialization.
+
+ Attributes:
+ axis (Tuple[int, ...]): Input axis or axes to contract.
+ in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
+ out_features (Tuple[int, ...]): Shape of the output features (non-contracted dims).
+ weight (nn.Parameter): The kernel parameter.
46
+ """
47
+
48
+ def __init__(
49
+ self,
50
+ in_shapes: tuple[int, ...],
51
+ out_features: tuple[int, ...],
52
+ axis: tuple[int, ...] = (-1,),
53
+ dtype: torch.dtype | None = None,
54
+ weight_dtype: torch.dtype | None = None,
55
+ device: torch.device | None = None,
56
+ ):
57
+ super().__init__()
58
+ self.in_shapes = in_shapes
59
+ self.out_features = out_features
60
+ self.axis = axis
61
+ self.dtype = dtype
62
+ self.kernel_shape = self.in_shapes + self.out_features
63
+
64
+ factory_kwargs = {"device": device, "dtype": weight_dtype}
65
+ self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
66
+ self.register_parameter("bias", None)
67
+
68
+ def forward(self, inputs: Tensor) -> Tensor:
69
+ norm_axis = _normalize_axes(self.axis, inputs.ndim)
70
+ kernel_contract_axes = tuple(range(len(norm_axis)))
71
+
72
+ output = torch.tensordot(
73
+ inputs.float(),
74
+ self.weight.float(),
75
+ dims=(norm_axis, kernel_contract_axes),
76
+ ).to(inputs.dtype)
77
+ return output
78
+
79
+
80
+ def get_activation_fn(activation_string: str) -> nn.Module: # Return Module instance
81
+ """Maps activation string to PyTorch activation function module."""
82
+ if activation_string == "gelu":
83
+ return nn.GELU()
84
+ elif activation_string == "relu":
85
+ return nn.ReLU()
86
+ elif activation_string == "silu" or activation_string == "swish":
87
+ return nn.SiLU()
88
+ elif activation_string == "linear":
89
+ return nn.Identity()
90
+ else:
91
+ raise ValueError(f"Unsupported activation function: {activation_string}")
92
+
93
+
94
+ class MlpBlock(nn.Module):
95
+ """MLP block using DenseGeneral."""
96
+
97
+ def __init__(
98
+ self,
99
+ config: DiaConfig,
100
+ embed_dim: int,
101
+ intermediate_dim: int,
102
+ dropout_rate: float,
103
+ activations: list[str] = ["silu", "linear"],
104
+ use_pre_norm: bool = False,
105
+ ):
106
+ super().__init__()
107
+ self.use_pre_norm = use_pre_norm
108
+ num_activations = len(activations)
109
+ compute_dtype = _str_to_dtype(config.training.dtype)
110
+ weight_dtype = _str_to_dtype(config.model.weight_dtype)
111
+ self.dtype = compute_dtype
112
+ # Assume default device for now, could be passed in config
113
+
114
+ if use_pre_norm:
115
+ self.pre_norm = RMSNorm(
116
+ embed_dim,
117
+ eps=config.model.normalization_layer_epsilon,
118
+ dtype=torch.float32,
119
+ )
120
+
121
+ self.wi_fused = DenseGeneral(
122
+ in_shapes=(embed_dim,),
123
+ out_features=(
124
+ num_activations,
125
+ intermediate_dim,
126
+ ),
127
+ axis=(-1,),
128
+ dtype=compute_dtype,
129
+ weight_dtype=weight_dtype,
130
+ )
131
+
132
+ self.activation_fn_0 = get_activation_fn(activations[0]) # silu
133
+ self.activation_fn_1 = get_activation_fn(activations[1]) # linear
134
+
135
+ self.dropout = nn.Dropout(dropout_rate)
136
+
137
+ # Output layer using DenseGeneral
138
+ self.wo = DenseGeneral(
139
+ in_shapes=(intermediate_dim,),
140
+ out_features=(embed_dim,),
141
+ axis=(-1,),
142
+ dtype=compute_dtype,
143
+ weight_dtype=weight_dtype,
144
+ )
145
+
146
+ def forward(self, x: torch.Tensor, deterministic: bool) -> torch.Tensor:
147
+ """Forward pass."""
148
+ if self.use_pre_norm and hasattr(self, "pre_norm"):
149
+ x = self.pre_norm(x)
150
+
151
+ fused_x = self.wi_fused(x)
152
+
153
+ gate_input = fused_x[..., 0, :]
154
+ up_input = fused_x[..., 1, :]
155
+
156
+ gate = self.activation_fn_0(gate_input)
157
+ up = self.activation_fn_1(up_input)
158
+ hidden = torch.mul(gate, up).to(self.dtype)
159
+
160
+ if not deterministic:
161
+ hidden = self.dropout(hidden)
162
+
163
+ output = self.wo(hidden)
164
+ return output
165
+
166
+
167
+ class RotaryEmbedding(nn.Module):
168
+ """Rotary Position Embedding (RoPE) implementation in PyTorch."""
169
+
170
+ def __init__(
171
+ self,
172
+ embedding_dims: int,
173
+ min_timescale: int = 1,
174
+ max_timescale: int = 10000,
175
+ dtype: torch.dtype = torch.float32,
176
+ ):
177
+ super().__init__()
178
+ if embedding_dims % 2 != 0:
179
+ raise ValueError("Embedding dim must be even for RoPE.")
180
+ self.embedding_dims = embedding_dims
181
+ self.min_timescale = min_timescale
182
+ self.max_timescale = max_timescale
183
+ self.dtype = dtype
184
+
185
+ half_embedding_dim = embedding_dims // 2
186
+ fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
187
+ self.register_buffer(
188
+ "timescale",
189
+ self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction,
190
+ persistent=False,
191
+ )
192
+
193
+ def extra_repr(self) -> str:
194
+ s = f"{self.timescale.shape}"
195
+ return s
196
+
197
+ def forward(self, inputs: torch.Tensor, position: torch.Tensor):
198
+ """Applies RoPE."""
199
+ position = position.unsqueeze(-1).unsqueeze(-1)
200
+ timescale = self.timescale.to(inputs.device)
201
+ sinusoid_inp = position / timescale
202
+ sin = torch.sin(sinusoid_inp).to(inputs.dtype)
203
+ cos = torch.cos(sinusoid_inp).to(inputs.dtype)
204
+ first_half, second_half = torch.chunk(inputs, 2, dim=-1)
205
+ first_part = first_half * cos - second_half * sin
206
+ second_part = second_half * cos + first_half * sin
207
+ return torch.cat((first_part, second_part), dim=-1)
208
+
209
+
210
+ class KVCache:
211
+ def __init__(self, num_heads, max_len, head_dim, device, k=None, v=None):
212
+ self.k = torch.zeros((2, num_heads, max_len, head_dim), device=device) if k is None else k
213
+ self.v = torch.zeros((2, num_heads, max_len, head_dim), device=device) if v is None else v
214
+ self.current_idx = 0
215
+ self.max_len = max_len
216
+
217
+ def get_kv_for_attention(self, current_k, current_v):
218
+ if self.current_idx == 0:
219
+ return current_k, current_v
220
+ else:
221
+ past_k = self.k[:, :, : self.current_idx, :]
222
+ past_v = self.v[:, :, : self.current_idx, :]
223
+ attn_k = torch.cat((past_k, current_k), dim=2)
224
+ attn_v = torch.cat((past_v, current_v), dim=2)
225
+ return attn_k, attn_v
226
+
227
+ def update_cache(self, k, v):
228
+ assert self.current_idx < self.max_len
229
+ self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
230
+ self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
231
+ self.current_idx += 1
232
+
233
+ def prefill_kv(self, k, v):
234
+ prefill_len = k.shape[2]
235
+ assert prefill_len <= self.max_len
236
+ self.k[:, :, :prefill_len, :] = k
237
+ self.v[:, :, :prefill_len, :] = v
238
+ self.current_idx = prefill_len
239
+
240
+
241
+ class Attention(nn.Module):
242
+ """Attention using DenseGeneral."""
243
+
244
+ def __init__(
245
+ self,
246
+ config: DiaConfig,
247
+ q_embed_dim: int,
248
+ kv_embed_dim: int,
249
+ num_query_heads: int,
250
+ num_kv_heads: int,
251
+ head_dim: int,
252
+ dropout_rate: float,
253
+ is_cross_attn: bool = False,
254
+ out_embed_dim: int | None = None,
255
+ ):
256
+ super().__init__()
257
+ self.num_query_heads = num_query_heads
258
+ self.num_kv_heads = num_kv_heads
259
+ self.head_dim = head_dim
260
+ self.is_cross_attn = is_cross_attn
261
+ self.dropout_rate = dropout_rate
262
+ compute_dtype = _str_to_dtype(config.training.dtype)
263
+ weight_dtype = _str_to_dtype(config.model.weight_dtype)
264
+ self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
265
+ self.projected_query_dim = num_query_heads * head_dim
266
+ if num_query_heads % num_kv_heads != 0:
267
+ raise ValueError(f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})")
268
+ self.num_gqa_groups = num_query_heads // num_kv_heads
269
+
270
+ # --- Projection Layers using DenseGeneral ---
271
+ self.q_proj = DenseGeneral(
272
+ in_shapes=(q_embed_dim,),
273
+ out_features=(num_query_heads, head_dim),
274
+ axis=(-1,),
275
+ dtype=compute_dtype,
276
+ weight_dtype=weight_dtype,
277
+ )
278
+ self.k_proj = DenseGeneral(
279
+ in_shapes=(kv_embed_dim,),
280
+ out_features=(num_kv_heads, head_dim),
281
+ axis=(-1,),
282
+ dtype=compute_dtype,
283
+ weight_dtype=weight_dtype,
284
+ )
285
+ self.v_proj = DenseGeneral(
286
+ in_shapes=(kv_embed_dim,),
287
+ out_features=(num_kv_heads, head_dim),
288
+ axis=(-1,),
289
+ dtype=compute_dtype,
290
+ weight_dtype=weight_dtype,
291
+ )
292
+ self.o_proj = DenseGeneral(
293
+ in_shapes=(num_query_heads, head_dim),
294
+ out_features=(self.output_dim,),
295
+ axis=(-2, -1),
296
+ dtype=compute_dtype,
297
+ weight_dtype=weight_dtype,
298
+ )
299
+
300
+ # --- Rotary Embedding ---
301
+ self.rotary_emb = RotaryEmbedding(
302
+ embedding_dims=self.head_dim,
303
+ min_timescale=config.model.rope_min_timescale,
304
+ max_timescale=config.model.rope_max_timescale,
305
+ dtype=compute_dtype,
306
+ )
307
+
308
+ def forward(
309
+ self,
310
+ Xq: torch.Tensor, # (B, T, D) T = 1 in AR generation
311
+ Xkv: torch.Tensor, # (B, S, E) S = 1 in AR generation
312
+ q_positions: torch.Tensor, # (B, T)
313
+ kv_positions: torch.Tensor | None = None, # (B, S)
314
+ deterministic: bool = True,
315
+ attn_mask: torch.Tensor | None = None, # None in Decoder Self Attention, Valid mask in Others
316
+ cache: KVCache | None = None, # None in Encoder, KVCache in Decoder
317
+ prefill: bool = False, # True only when prefilling KV Cache
318
+ ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
319
+ """
320
+ Performs attention calculation with optional KV caching.
321
+
322
+ Args:
323
+ Xq: Query tensor (B, T, D). T=1 during single-step decoding.
324
+ Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
325
+ q_positions: Positions for queries (B, T).
326
+ kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
327
+ deterministic: If True, disable dropout.
328
+ attn_mask: Attention mask.
329
+ cache: KVCache.
330
+ prefill: If True, use prefill mode.
331
+
332
+ Returns:
333
+ A tuple containing:
334
+ - output: The attention output tensor (B, T, output_dim).
335
+ - present_kv: The K/V state to be cached for the next step ((B, N, S_new, H), (B, N, S_new, H)). For self-attn, S_new = S_past + S. For cross-attn, S_new = S_kv.
336
+ """
337
+ if kv_positions is None:
338
+ kv_positions = q_positions
339
+ original_dtype = Xq.dtype
340
+
341
+ Xq_BxTxNxH = self.q_proj(Xq)
342
+ Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
343
+ Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
344
+
345
+ # Input values into attention calculation
346
+ attn_k: torch.Tensor | None = None
347
+ attn_v: torch.Tensor | None = None
348
+ new_kv_cache: tuple[torch.Tensor, torch.Tensor] | None = None
349
+
350
+ # Decoder Cross Attention
351
+ if self.is_cross_attn:
352
+ # Directly use cache (no need to check index)
353
+ attn_k, attn_v = cache.k, cache.v
354
+ if attn_k.shape[1] != self.num_query_heads or attn_v.shape[1] != self.num_query_heads:
355
+ raise ValueError(
356
+ f"Cross-attention cache head dimension ({attn_k.shape[1]}) "
357
+ f"does not match num_query_heads ({self.num_query_heads}). "
358
+ "Cache should be pre-repeated for GQA."
359
+ )
360
+ # Self Attention
361
+ else:
362
+ Xk_BxSxKxH = self.k_proj(Xkv) # (B, S, K, H)
363
+ Xv_BxSxKxH = self.v_proj(Xkv) # (B, S, K, H)
364
+ Xk_BxSxKxH = self.rotary_emb(Xk_BxSxKxH, position=kv_positions) # (B, S, K, H)
365
+
366
+ Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2) # (B, K, S, H)
367
+ Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2) # (B, K, S, H)
368
+ # S=1 for Decode Step
369
+
370
+ if self.num_gqa_groups > 1:
371
+ Xk_BxNxSxH = Xk_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
372
+ Xv_BxNxSxH = Xv_BxKxSxH.repeat_interleave(self.num_gqa_groups, dim=1)
373
+ else:
374
+ Xk_BxNxSxH = Xk_BxKxSxH
375
+ Xv_BxNxSxH = Xv_BxKxSxH
376
+
377
+ # Encoder Self Attention
378
+ if cache is None:
379
+ attn_k = Xk_BxNxSxH
380
+ attn_v = Xv_BxNxSxH
381
+ # Decoder Self Attention
382
+ else:
383
+ # In prefill mode, we fill in cache until prefill length
384
+ if prefill:
385
+ attn_k, attn_v = Xk_BxNxSxH, Xv_BxNxSxH
386
+ cache.prefill_kv(attn_k, attn_v)
387
+ # In decode step, we add current K/V to cache step by step
388
+ else:
389
+ new_kv_cache = Xk_BxNxSxH, Xv_BxNxSxH
390
+ attn_k, attn_v = cache.get_kv_for_attention(Xk_BxNxSxH, Xv_BxNxSxH)
391
+
392
+ attn_output = F.scaled_dot_product_attention(
393
+ Xq_BxNxTxH,
394
+ attn_k,
395
+ attn_v,
396
+ attn_mask=attn_mask,
397
+ dropout_p=self.dropout_rate if not deterministic else 0.0,
398
+ scale=1.0,
399
+ )
400
+
401
+ attn_output = attn_output.transpose(1, 2).contiguous() # (B, T, N, H)
402
+ output = self.o_proj(attn_output)
403
+
404
+ return output.to(original_dtype), new_kv_cache
405
+
406
+
407
+ class EncoderLayer(nn.Module):
408
+ """Transformer Encoder Layer using DenseGeneral."""
409
+
410
+ def __init__(self, config: DiaConfig):
411
+ super().__init__()
412
+ self.config = config
413
+ model_config = config.model
414
+ enc_config = config.model.encoder
415
+ embed_dim = enc_config.n_embd
416
+
417
+ self.pre_sa_norm = RMSNorm(
418
+ embed_dim,
419
+ eps=model_config.normalization_layer_epsilon,
420
+ dtype=torch.float32,
421
+ )
422
+ self.self_attention = Attention(
423
+ config=config,
424
+ q_embed_dim=embed_dim,
425
+ kv_embed_dim=embed_dim,
426
+ num_query_heads=enc_config.n_head,
427
+ num_kv_heads=enc_config.n_head,
428
+ head_dim=enc_config.head_dim,
429
+ dropout_rate=model_config.dropout,
430
+ is_cross_attn=False,
431
+ out_embed_dim=embed_dim,
432
+ )
433
+ self.post_sa_norm = RMSNorm(
434
+ embed_dim,
435
+ eps=model_config.normalization_layer_epsilon,
436
+ dtype=torch.float32,
437
+ )
438
+ self.mlp = MlpBlock(
439
+ config=config,
440
+ embed_dim=embed_dim,
441
+ intermediate_dim=enc_config.n_hidden,
442
+ activations=enc_config.mlp_activations,
443
+ dropout_rate=model_config.dropout,
444
+ use_pre_norm=enc_config.use_pre_norm,
445
+ )
446
+ self.dropout = nn.Dropout(model_config.dropout)
447
+
448
+ def forward(
449
+ self,
450
+ x: torch.Tensor,
451
+ src_positions: torch.Tensor | None = None,
452
+ deterministic: bool = True,
453
+ attn_mask: torch.Tensor | None = None,
454
+ ) -> torch.Tensor:
455
+ residual = x
456
+ x_norm = self.pre_sa_norm(x)
457
+
458
+ sa_out, _ = self.self_attention(
459
+ Xq=x_norm,
460
+ Xkv=x_norm,
461
+ q_positions=src_positions,
462
+ kv_positions=src_positions,
463
+ deterministic=deterministic,
464
+ attn_mask=attn_mask,
465
+ )
466
+ x = residual + sa_out
467
+
468
+ residual = x
469
+ x_norm = self.post_sa_norm(x)
470
+ mlp_out = self.mlp(x_norm, deterministic=deterministic)
471
+ x = residual + mlp_out
472
+
473
+ if not deterministic:
474
+ x = self.dropout(x)
475
+ return x
476
+
477
+
478
+ class Encoder(nn.Module):
479
+ """Transformer Encoder Stack using DenseGeneral."""
480
+
481
+ def __init__(self, config: DiaConfig):
482
+ super().__init__()
483
+ self.config = config
484
+ model_config = config.model
485
+ enc_config = config.model.encoder
486
+ compute_dtype = _str_to_dtype(config.training.dtype)
487
+
488
+ self.embedding = nn.Embedding(
489
+ model_config.src_vocab_size,
490
+ enc_config.n_embd,
491
+ dtype=compute_dtype,
492
+ )
493
+ self.dropout = nn.Dropout(model_config.dropout)
494
+ self.layers = nn.ModuleList([EncoderLayer(config=config) for _ in range(enc_config.n_layer)])
495
+ self.norm = RMSNorm(
496
+ enc_config.n_embd,
497
+ eps=model_config.normalization_layer_epsilon,
498
+ dtype=torch.float32,
499
+ )
500
+
501
+ def forward(
502
+ self,
503
+ x_ids: torch.Tensor,
504
+ src_positions: torch.Tensor | None = None,
505
+ deterministic: bool = True,
506
+ attn_mask: torch.Tensor | None = None,
507
+ ) -> torch.Tensor:
508
+ x = self.embedding(x_ids)
509
+
510
+ if not deterministic:
511
+ x = self.dropout(x)
512
+
513
+ for layer in self.layers:
514
+ x = layer(
515
+ x,
516
+ src_positions=src_positions,
517
+ deterministic=deterministic,
518
+ attn_mask=attn_mask,
519
+ )
520
+ x = self.norm(x)
521
+ if not deterministic:
522
+ x = self.dropout(x)
523
+ return x
524
+
525
+
526
+ class DecoderLayer(nn.Module):
527
+ """Transformer Decoder Layer using DenseGeneral."""
528
+
529
+ def __init__(self, config: DiaConfig):
530
+ super().__init__()
531
+ self.config = config
532
+ model_config = config.model
533
+ dec_config = config.model.decoder
534
+ enc_config = config.model.encoder
535
+ dec_embed_dim = dec_config.n_embd
536
+ enc_embed_dim = enc_config.n_embd
537
+
538
+ # Norms
539
+ self.pre_sa_norm = RMSNorm(
540
+ dec_embed_dim,
541
+ eps=model_config.normalization_layer_epsilon,
542
+ dtype=torch.float32,
543
+ )
544
+ self.pre_ca_norm = RMSNorm(
545
+ dec_embed_dim,
546
+ eps=model_config.normalization_layer_epsilon,
547
+ dtype=torch.float32,
548
+ )
549
+ self.pre_mlp_norm = RMSNorm(
550
+ dec_embed_dim,
551
+ eps=model_config.normalization_layer_epsilon,
552
+ dtype=torch.float32,
553
+ )
554
+
555
+ # Self-Attention (GQA) with Causal Masking
556
+ self.self_attention = Attention(
557
+ config=config,
558
+ q_embed_dim=dec_embed_dim,
559
+ kv_embed_dim=dec_embed_dim,
560
+ num_query_heads=dec_config.gqa_query_heads,
561
+ num_kv_heads=dec_config.kv_heads,
562
+ head_dim=dec_config.gqa_head_dim,
563
+ dropout_rate=model_config.dropout,
564
+ is_cross_attn=False,
565
+ out_embed_dim=dec_embed_dim,
566
+ )
567
+ # Cross-Attention (MHA)
568
+ self.cross_attention = Attention(
569
+ config=config,
570
+ q_embed_dim=dec_embed_dim,
571
+ kv_embed_dim=enc_embed_dim, # Note kv_embed_dim
572
+ num_query_heads=dec_config.cross_query_heads,
573
+ num_kv_heads=dec_config.cross_query_heads,
574
+ head_dim=dec_config.cross_head_dim,
575
+ dropout_rate=model_config.dropout,
576
+ is_cross_attn=True,
577
+ out_embed_dim=dec_embed_dim,
578
+ )
579
+ # MLP
580
+ self.mlp = MlpBlock(
581
+ config=config,
582
+ embed_dim=dec_embed_dim,
583
+ intermediate_dim=dec_config.n_hidden,
584
+ activations=dec_config.mlp_activations,
585
+ dropout_rate=model_config.dropout,
586
+ use_pre_norm=dec_config.use_pre_norm,
587
+ )
588
+
589
+ def forward(
590
+ self,
591
+ x: torch.Tensor,
592
+ encoder_out: torch.Tensor,
593
+ tgt_positions: torch.Tensor,
594
+ src_positions: torch.Tensor | None,
595
+ deterministic: bool,
596
+ self_attn_mask: torch.Tensor,
597
+ cross_attn_mask: torch.Tensor,
598
+ self_attn_cache: KVCache,
599
+ cross_attn_cache: KVCache,
600
+ prefill: bool = False,
601
+ ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
602
+ residual = x
603
+ x_norm = self.pre_sa_norm(x)
604
+
605
+ sa_out, new_kv_cache = self.self_attention(
606
+ Xq=x_norm, # (2, 1, D)
607
+ Xkv=x_norm, # (2, 1, D)
608
+ q_positions=tgt_positions, # (2, 1)
609
+ kv_positions=tgt_positions, # (2, 1)
610
+ deterministic=deterministic,
611
+ attn_mask=self_attn_mask, # (2, 1, 1, S_max)
612
+ cache=self_attn_cache,
613
+ prefill=prefill,
614
+ )
615
+
616
+ x = residual + sa_out
617
+
618
+ # 2. Cross-Attention
619
+ residual = x
620
+ x_norm = self.pre_ca_norm(x)
621
+ ca_out, _ = self.cross_attention(
622
+ Xq=x_norm,
623
+ Xkv=encoder_out,
624
+ q_positions=tgt_positions,
625
+ kv_positions=src_positions,
626
+ deterministic=deterministic,
627
+ attn_mask=cross_attn_mask,
628
+ cache=cross_attn_cache,
629
+ )
630
+ x = residual + ca_out
631
+
632
+ # 3. MLP
633
+ residual = x
634
+ x_norm = self.pre_mlp_norm(x)
635
+ mlp_out = self.mlp(x_norm, deterministic=deterministic)
636
+ x = residual + mlp_out
637
+
638
+ return x, new_kv_cache
639
+
640
+
641
+ class Decoder(nn.Module):
642
+ """Transformer Decoder Stack using DenseGeneral."""
643
+
644
+ def __init__(self, config: DiaConfig):
645
+ super().__init__()
646
+ self.config = config
647
+ model_config = config.model
648
+ dec_config = config.model.decoder
649
+ train_config = config.training
650
+ data_config = config.data
651
+ compute_dtype = _str_to_dtype(config.training.dtype)
652
+ weight_dtype = _str_to_dtype(config.model.weight_dtype)
653
+ self.num_channels = data_config.channels
654
+ self.num_layers = dec_config.n_layer
655
+
656
+ self.embeddings = nn.ModuleList(
657
+ [
658
+ nn.Embedding(model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype)
659
+ for _ in range(self.num_channels)
660
+ ]
661
+ )
662
+ self.dropout = nn.Dropout(model_config.dropout)
663
+ self.layers = nn.ModuleList([DecoderLayer(config=config) for _ in range(self.num_layers)])
664
+ self.norm = RMSNorm(
665
+ dec_config.n_embd,
666
+ eps=model_config.normalization_layer_epsilon,
667
+ dtype=torch.float32,
668
+ )
669
+
670
+ # Final Logits Projection using DenseGeneral
671
+ self.logits_dense = DenseGeneral(
672
+ in_shapes=(dec_config.n_embd,),
673
+ out_features=(self.num_channels, model_config.tgt_vocab_size),
674
+ axis=(-1,),
675
+ dtype=(torch.float32 if train_config.logits_dot_in_fp32 else compute_dtype),
676
+ weight_dtype=weight_dtype,
677
+ )
678
+ self.logits_in_fp32 = train_config.logits_dot_in_fp32
679
+
680
+ def precompute_cross_attention_kv(
681
+ self,
682
+ max_len: int,
683
+ encoder_out: torch.Tensor, # (B, S, E)
684
+ src_positions: torch.Tensor | None, # (B, S)
685
+ ) -> list[KVCache]:
686
+ """
687
+ Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
688
+ """
689
+ per_layer_kv_cache: list[KVCache] = []
690
+
691
+ for layer in self.layers:
692
+ cross_attn_module = layer.cross_attention
693
+ k_proj = cross_attn_module.k_proj(encoder_out)
694
+ v_proj = cross_attn_module.v_proj(encoder_out)
695
+
696
+ k_proj = cross_attn_module.rotary_emb(k_proj, position=src_positions)
697
+ k = k_proj.transpose(1, 2)
698
+ v = v_proj.transpose(1, 2)
699
+
700
+ per_layer_kv_cache.append(
701
+ KVCache(
702
+ cross_attn_module.num_kv_heads,
703
+ max_len,
704
+ cross_attn_module.head_dim,
705
+ k.device,
706
+ k=k,
707
+ v=v,
708
+ )
709
+ )
710
+
711
+ return per_layer_kv_cache
712
+
713
+ def decode_step(
714
+ self,
715
+ tgt_ids_Bx1xC: torch.Tensor, # [B, 1, C]
716
+ tgt_pos_Bx1: torch.Tensor, # [B, 1]
717
+ encoder_out: torch.Tensor, # [B, S, E]
718
+ self_attn_mask: Any, # None
719
+ cross_attn_mask: torch.Tensor, # [B, 1, 1, S]
720
+ self_attention_cache: list[KVCache],
721
+ cross_attention_cache: list[KVCache],
722
+ ) -> tuple[torch.Tensor, list[tuple[torch.Tensor, torch.Tensor] | None]]:
723
+ """
724
+ Performs a single decoding step, managing KV caches layer by layer.
725
+
726
+ Returns:
727
+ A tuple containing:
728
+ - logits_Bx1xCxV: The final output logits for the current step (B, 1, C, V), cast to float32.
+ - new_cache: per-layer (K, V) tensors from this step, to be written back into the self-attention caches.
729
+ """
730
+ assert self_attn_mask is None, "Self-attention mask should be None, kept for pattern"
731
+
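+ # Sum the per-channel token embeddings into a single decoder input vector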
732
+ x = None
733
+ for i in range(self.num_channels):
734
+ channel_tokens = tgt_ids_Bx1xC[..., i]
735
+ channel_embed = self.embeddings[i](channel_tokens)
736
+ x = channel_embed if x is None else x + channel_embed
737
+
738
+ new_cache = []
739
+
740
+ for i, layer in enumerate(self.layers):
741
+ self_cache = self_attention_cache[i]
742
+ cross_cache = cross_attention_cache[i]
743
+ x, new_kv_cache = layer(
744
+ x, # (2, 1, D)
745
+ encoder_out, # (2, S, E)
746
+ src_positions=None, # CA KV is already computed
747
+ tgt_positions=tgt_pos_Bx1, # (2, 1)
748
+ deterministic=True,
749
+ self_attn_mask=None,
750
+ cross_attn_mask=cross_attn_mask,
751
+ self_attn_cache=self_cache,
752
+ cross_attn_cache=cross_cache,
753
+ )
754
+ new_cache.append(new_kv_cache)
755
+
756
+ x = self.norm(x)
757
+ logits_Bx1xCxV = self.logits_dense(x)
758
+
759
+ return logits_Bx1xCxV.to(torch.float32), new_cache
760
+
761
+ def forward(
762
+ self,
763
+ tgt_ids_BxTxC: torch.Tensor,
764
+ encoder_out: torch.Tensor,
765
+ tgt_positions: torch.Tensor,
766
+ src_positions: torch.Tensor,
767
+ deterministic: bool,
768
+ self_attn_mask: torch.Tensor,
769
+ cross_attn_mask: torch.Tensor,
770
+ self_attention_cache: list[KVCache],
771
+ cross_attention_cache: list[KVCache],
772
+ ) -> torch.Tensor:
773
+ """
774
+ Forward pass for the Decoder stack, managing KV caches.
775
+
776
+ Args:
777
+ tgt_ids_BxTxC: Target token IDs (B, T, C).
778
+ encoder_out: Output from the encoder (B, S, E).
779
+ tgt_positions: Positions for target sequence (B, T).
780
+ src_positions: Positions for source sequence (B, S).
781
+ deterministic: Disable dropout if True.
782
+ self_attn_mask: Mask for self-attention.
783
+ cross_attn_mask: Mask for cross-attention.
784
+ self_attention_cache: Per-layer self-attention KV caches; its length must equal
+ `num_layers`. The caches are filled in prefill mode during this pass.
+ cross_attention_cache: Per-layer cross-attention KV caches precomputed from
+ `encoder_out` via `precompute_cross_attention_kv`.
+
+ Returns:
+ logits: The final output logits (B, T, C, V), cast to float32.
796
+ """
797
+ _, _, num_channels_in = tgt_ids_BxTxC.shape
798
+ assert num_channels_in == self.num_channels, "Input channels mismatch"
799
+
800
+ # Embeddings
801
+ x = None
802
+ for i in range(self.num_channels):
803
+ channel_tokens = tgt_ids_BxTxC[..., i]
804
+ channel_embed = self.embeddings[i](channel_tokens)
805
+ x = channel_embed if x is None else x + channel_embed
806
+
807
+ if not deterministic:
808
+ x = self.dropout(x)
809
+
810
+ for i, layer in enumerate(self.layers):
811
+ x, _ = layer(
812
+ x,
813
+ encoder_out,
814
+ tgt_positions=tgt_positions,
815
+ src_positions=src_positions,
816
+ deterministic=deterministic,
817
+ self_attn_mask=self_attn_mask,
818
+ cross_attn_mask=cross_attn_mask,
819
+ self_attn_cache=self_attention_cache[i],
820
+ cross_attn_cache=cross_attention_cache[i],
821
+ prefill=True,
822
+ )
823
+
824
+ # Final Norm
825
+ x = self.norm(x)
826
+ logits_BxTxCxV = self.logits_dense(x)
827
+
828
+ return logits_BxTxCxV.to(torch.float32)
829
+
830
+
831
+ class DiaModel(nn.Module):
832
+ """PyTorch Dia Model using DenseGeneral."""
833
+
834
+ def __init__(self, config: DiaConfig):
835
+ super().__init__()
836
+ self.config = config
837
+ self.encoder = Encoder(config)
838
+ self.decoder = Decoder(config)
839
+ # self._init_weights()
840
+
841
+
842
+ def _init_weights(self):
843
+ for module in self.modules():
844
+ if isinstance(module, (torch.nn.Linear, torch.nn.Conv1d)):
845
+ torch.nn.init.xavier_uniform_(module.weight)
846
+ if module.bias is not None:
847
+ torch.nn.init.zeros_(module.bias)
848
+ elif isinstance(module, torch.nn.Embedding):
849
+ torch.nn.init.xavier_uniform_(module.weight)
850
+ elif isinstance(module, torch.nn.LayerNorm) or isinstance(module, torch.nn.modules.normalization.RMSNorm):
851
+ if hasattr(module, 'weight') and module.weight is not None:
852
+ torch.nn.init.ones_(module.weight)
853
+ if hasattr(module, 'bias') and module.bias is not None:
854
+ torch.nn.init.zeros_(module.bias)
855
+
856
+ def forward(
857
+ self,
858
+ src_BxS: torch.Tensor,
859
+ tgt_BxTxC: torch.Tensor,
860
+ src_positions: torch.Tensor | None = None,
861
+ tgt_positions: torch.Tensor | None = None,
862
+ enc_self_attn_mask: torch.Tensor | None = None,
863
+ dec_self_attn_mask: torch.Tensor | None = None,
864
+ dec_cross_attn_mask: torch.Tensor | None = None,
865
+ enable_dropout: bool = True,
866
+ ):
867
+ deterministic = not enable_dropout
868
+
869
+ # --- Encoder Pass ---
870
+ encoder_out = self.encoder(
871
+ x_ids=src_BxS,
872
+ src_positions=src_positions,
873
+ deterministic=deterministic,
874
+ attn_mask=enc_self_attn_mask,
875
+ )
876
+
877
+ B, T, C = tgt_BxTxC.shape # Batch size, target sequence length, channels
878
+ device = tgt_BxTxC.device
879
+
880
+ self_attention_cache = [
881
+ KVCache(
882
+ num_heads=self.decoder.layers[i].self_attention.num_query_heads, # cache holds GQA-expanded K/V, so size by query heads
883
+ max_len=T,
884
+ head_dim=self.decoder.layers[i].self_attention.head_dim,
885
+ device=device,
886
+ )
887
+ for i in range(self.decoder.num_layers)
888
+ ]
889
+
890
+ cross_attention_cache = self.decoder.precompute_cross_attention_kv(
891
+ max_len=encoder_out.shape[1],
892
+ encoder_out=encoder_out,
893
+ src_positions=src_positions,
894
+ )
895
+
896
+ # --- Decoder Pass ---
897
+ logits = self.decoder(
898
+ tgt_ids_BxTxC=tgt_BxTxC,
899
+ encoder_out=encoder_out,
900
+ tgt_positions=tgt_positions,
901
+ src_positions=src_positions,
902
+ deterministic=deterministic,
903
+ self_attn_mask=dec_self_attn_mask,
904
+ cross_attn_mask=dec_cross_attn_mask,
905
+ self_attention_cache=self_attention_cache,
906
+ cross_attention_cache=cross_attention_cache
907
+ )
908
+
909
+ return logits
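
Note: a minimal sketch exercising the `KVCache` prefill/decode contract defined above (the leading batch dimension of 2 matches the unconditional/conditional CFG pair hard-coded into the cache tensors; the `dia` import path assumes this repo layout):

```python
import torch
from dia.layers import KVCache

cache = KVCache(num_heads=4, max_len=16, head_dim=8, device=torch.device("cpu"))

# Prefill with a 5-token prompt, as Decoder.forward does before generation.
cache.prefill_kv(torch.randn(2, 4, 5, 8), torch.randn(2, 4, 5, 8))
assert cache.current_idx == 5

# One autoregressive step: fetch past + current K/V, then commit the new slot.
k1, v1 = torch.randn(2, 4, 1, 8), torch.randn(2, 4, 1, 8)
attn_k, attn_v = cache.get_kv_for_attention(k1, v1)
assert attn_k.shape[2] == 6  # 5 cached positions + 1 current step
cache.update_cache(k1, v1)
assert cache.current_idx == 6
```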
dia/model.py ADDED
@@ -0,0 +1,648 @@
1
+ import dac
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ from .audio import audio_to_codebook, codebook_to_audio
8
+ from .config import DiaConfig
9
+ from .layers import DiaModel, KVCache
10
+
11
+
12
+ def get_default_device():
13
+ if torch.cuda.is_available():
14
+ return torch.device("cuda")
15
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
16
+ return torch.device("mps")
17
+ return torch.device("cpu")
18
+
19
+
20
+ def _sample_next_token(
21
+ logits_BCxV: torch.Tensor,
22
+ temperature: float,
23
+ top_p: float,
24
+ use_cfg_filter: bool,
25
+ cfg_filter_top_k: int | None = None,
26
+ ) -> torch.Tensor:
27
+ if temperature == 0.0:
28
+ return torch.argmax(logits_BCxV, dim=-1)
29
+
30
+ logits_BCxV = logits_BCxV / temperature
31
+ if use_cfg_filter and cfg_filter_top_k is not None:
32
+ _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
33
+ mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
34
+ mask.scatter_(dim=-1, index=top_k_indices_BCxV, value=False)
35
+ logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
36
+
37
+ if top_p < 1.0:
38
+ probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
39
+ sorted_probs_BCxV, sorted_indices_BCxV = torch.sort(probs_BCxV, dim=-1, descending=True)
40
+ cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
41
+
42
+ # Calculate indices to remove based on top_p
43
+ sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
44
+ # Shift the mask to the right to keep the first token above the threshold
45
+ sorted_indices_to_remove_BCxV[..., 1:] = sorted_indices_to_remove_BCxV[..., :-1].clone()
46
+ sorted_indices_to_remove_BCxV[..., 0] = 0 # Always keep the most probable token
47
+
48
+ indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
49
+ indices_to_remove_BCxV.scatter_(dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV)
50
+ logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
51
+
52
+ final_probs_BCxV = torch.softmax(logits_BCxV, dim=-1)
53
+
54
+ sampled_indices_BC = torch.multinomial(final_probs_BCxV, num_samples=1)
55
+ sampled_indices_C = sampled_indices_BC.squeeze(-1)
56
+ return sampled_indices_C
57
+
58
+
59
+ class Dia:
60
+ def __init__(self, config: DiaConfig, device: torch.device | None = None):
61
+ """Initializes the Dia model.
62
+
63
+ Args:
64
+ config: The configuration object for the model.
65
+ device: The device to load the model onto. If None, will automatically select the best available device.
66
+
67
+ Raises:
68
+ RuntimeError: If there is an error loading the DAC model.
69
+ """
70
+ super().__init__()
71
+ self.config = config
72
+ self.device = device if device is not None else get_default_device()
73
+ self.model = DiaModel(config)
74
+ self.dac_model = None
75
+
76
+ @classmethod
77
+ def from_local(cls, config_path: str, checkpoint_path: str, device: torch.device | None = None) -> "Dia":
78
+ """Loads the Dia model from local configuration and checkpoint files.
79
+
80
+ Args:
81
+ config_path: Path to the configuration JSON file.
82
+ checkpoint_path: Path to the model checkpoint (.pth) file.
83
+ device: The device to load the model onto. If None, will automatically select the best available device.
84
+
85
+ Returns:
86
+ An instance of the Dia model loaded with weights and set to eval mode.
87
+
88
+ Raises:
89
+ FileNotFoundError: If the config or checkpoint file is not found.
90
+ RuntimeError: If there is an error loading the checkpoint.
91
+ """
92
+ config = DiaConfig.load(config_path)
93
+ if config is None:
94
+ raise FileNotFoundError(f"Config file not found at {config_path}")
95
+
96
+ dia = cls(config, device)
97
+
98
+ try:
99
+ checkpoint = torch.load(checkpoint_path, map_location=device)
+ if "model" in checkpoint:
+ state_dict = checkpoint["model"] # training checkpoints nest the weights under "model"
104
+ else:
105
+ state_dict = checkpoint
106
+ dia.model.load_state_dict(state_dict)
107
+ except FileNotFoundError:
108
+ raise FileNotFoundError(f"Checkpoint file not found at {checkpoint_path}")
109
+ except Exception as e:
110
+ raise RuntimeError(f"Error loading checkpoint from {checkpoint_path}") from e
111
+
112
+ dia.model.to(dia.device)
113
+ dia.model.eval()
114
+ dia._load_dac_model()
115
+ return dia
116
+
117
+ @classmethod
118
+ def from_pretrained(
119
+ cls, model_name: str = "nari-labs/Dia-1.6B", device: torch.device | None = None
120
+ ) -> "Dia":
121
+ """Loads the Dia model from a Hugging Face Hub repository.
122
+
123
+ Downloads the configuration and checkpoint files from the specified
124
+ repository ID and then loads the model.
125
+
126
+ Args:
127
+ model_name: The Hugging Face Hub repository ID (e.g., "nari-labs/Dia-1.6B").
128
+ device: The device to load the model onto. If None, will automatically select the best available device.
129
+
130
+ Returns:
131
+ An instance of the Dia model loaded with weights and set to eval mode.
132
+
133
+ Raises:
134
+ FileNotFoundError: If config or checkpoint download/loading fails.
135
+ RuntimeError: If there is an error loading the checkpoint.
136
+ """
137
+ config_path = hf_hub_download(repo_id=model_name, filename="config.json")
138
+ checkpoint_path = hf_hub_download(repo_id=model_name, filename="dia-v0_1.pth")
139
+ return cls.from_local(config_path, checkpoint_path, device)
140
+
141
+ def _load_dac_model(self):
142
+ try:
143
+ dac_model_path = dac.utils.download()
144
+ dac_model = dac.DAC.load(dac_model_path).to(self.device)
145
+ except Exception as e:
146
+ raise RuntimeError("Failed to load DAC model") from e
147
+ self.dac_model = dac_model
148
+
149
+ def _create_attn_mask(
150
+ self,
151
+ q_padding_mask_1d: torch.Tensor,
152
+ k_padding_mask_1d: torch.Tensor,
153
+ is_causal: bool = False,
154
+ ) -> torch.Tensor:
155
+ """
156
+ Creates the attention mask (self or cross) mimicking JAX segment ID logic.
157
+ """
158
+ B1, Tq = q_padding_mask_1d.shape
159
+ B2, Tk = k_padding_mask_1d.shape
160
+ assert B1 == B2, "Query and key batch dimensions must match"
161
+
162
+ p_mask_q = q_padding_mask_1d.unsqueeze(2) # Shape [B, Tq, 1]
163
+ p_mask_k = k_padding_mask_1d.unsqueeze(1) # Shape [B, 1, Tk]
164
+
165
+ # Condition A: Non-padding query attends to non-padding key
166
+ non_pad_attends_non_pad = p_mask_q & p_mask_k # Shape [B, Tq, Tk]
167
+
168
+ # Condition B: Padding query attends to padding key
169
+ pad_attends_pad = (~p_mask_q) & (~p_mask_k) # Shape [B, Tq, Tk]
170
+
171
+ # Combine: True if padding status is compatible (both non-pad OR both pad)
172
+ # This implementation follows Jax TPU splash attention kernel
173
+ mask = non_pad_attends_non_pad | pad_attends_pad # Shape [B, Tq, Tk]
174
+
175
+ if is_causal:
176
+ # Ensure causality for self-attention (Tq == Tk)
177
+ assert Tq == Tk, "Causal mask requires query and key sequence lengths to be equal"
178
+ # Standard lower-triangular causal mask (True means allow)
179
+ causal_mask_2d = torch.tril(torch.ones((Tq, Tk), dtype=torch.bool, device=self.device)) # Shape [Tq, Tk]
180
+ causal_mask = mask & causal_mask_2d # Shape [B, Tq, Tk]
181
+ return causal_mask.unsqueeze(1) # Shape [B, 1, Tq, Tk] for broadcasting across heads
182
+ else:
183
+ # For cross-attention or non-causal self-attention
184
+ return mask.unsqueeze(1) # Shape [B, 1, Tq, Tk] for broadcasting across heads
185
+
186
+ def _prepare_text_input(self, text: str) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
187
+ """Encodes text prompt, pads, and creates attention mask and positions."""
188
+ text_pad_value = self.config.data.text_pad_value
189
+ max_len = self.config.data.text_length
190
+
191
+ byte_text = text.encode("utf-8")
192
+ replaced_bytes = byte_text
193
+
194
+ LANG2BYTE = {
195
+ "en": 3,
196
+ "vi": 19,
197
+ }
198
+
199
+ CHANNELS = [
200
+ "5phutcrypto",
201
+ "anhbanthan",
202
+ "anhthamtu",
203
+ "animerewind.official",
204
+ "bibitv8888",
205
+ "btvgo",
206
+ "baclieutv",
207
+ "bachhoaxanhcom",
208
+ "baodientuvov",
209
+ "blvckvines",
210
+ "boringppl",
211
+ "bronub",
212
+ "cdteam-why",
213
+ "cobabinhduong",
214
+ "cosmicwriter",
215
+ "cuthongthai",
216
+ "daiphatthanhtruyenhinhsonla",
217
+ "day-be-thong-minh-tv",
218
+ "danangtv",
219
+ "daihanoi-htv",
220
+ "daiptththainguyentntv",
221
+ "dongmauviet",
222
+ "dongthaptv",
223
+ "fptbongdaofficial",
224
+ "fonosvietnam",
225
+ "hieurotrong5phut-ntkt",
226
+ "htvtintuc",
227
+ "happyhidari",
228
+ "hoabinhtvgo",
229
+ "hocenglishonline",
230
+ "hocvienbovagau",
231
+ "hungyentvvngo",
232
+ "huynhduykhuongofficial",
233
+ "huynhlapofficial",
234
+ "jvevermind",
235
+ "kenhvtc16",
236
+ "kiengiangtv",
237
+ "khanhvyofficial",
238
+ "kienthucquansu",
239
+ "lamdongtv1",
240
+ "lamvlog",
241
+ "longantv-la34",
242
+ "mangovid",
243
+ "mensbay",
244
+ "meovatcuocsonglnv",
245
+ "meuchannel",
246
+ "ntnvlogsnguyenthanhnam",
247
+ "ngamradio",
248
+ "nhanhac555",
249
+ "nhantaidaiviet",
250
+ "ptth-trt",
251
+ "ptvtruyenhinhphutho",
252
+ "phantichgame",
253
+ "phephim",
254
+ "phimhottk-l",
255
+ "riwaylegal",
256
+ "ruangao",
257
+ "suckhoetamsinh",
258
+ "sachbiquyethanhcong",
259
+ "soisangbrightsidevietnamese",
260
+ "spiderum",
261
+ "spiderumbooks",
262
+ "sukieskitchen",
263
+ "tin3phut",
264
+ "tranthanhtown",
265
+ "tulemientay",
266
+ "tayninhtv",
267
+ "thainhitv",
268
+ "thanhpahm",
269
+ "thegioilaptop",
270
+ "thepresentwriter",
271
+ "tiengiangtivi",
272
+ "tieubaobaothom",
273
+ "tintucbitcoin247",
274
+ "truyenhinhbinhphuoc-bptv",
275
+ "truyenhinhyenbaiytv",
276
+ "truyenhinhcaobang",
277
+ "truyenhinhdaklakdrt",
278
+ "truyenhinhdaknong1",
279
+ "truyenhinhdienbien23.9",
280
+ "truyenhinhkhanhhoa",
281
+ "truyenhinhkontumkrt",
282
+ "truyenhinhnaminhntv",
283
+ "truyenhinhninhthuan",
284
+ "truyenhinhquangngai",
285
+ "tuantienti2911",
286
+ "tuyenquangttv",
287
+ "vovlivedoctruyen",
288
+ "vietcetera",
289
+ "vinhlongtv",
290
+ "voizfm",
291
+ "vutrunguyenthuy",
292
+ "vuive",
293
+ "w2wanime",
294
+ "w2wcartoon",
295
+ "w2whorror",
296
+ "w2wmovie",
297
+ "web5ngay",
298
+ "xanh24h",
299
+ "aiphatthanhtruyenhinhquangtri",
300
+ "aiphatthanhvatruyenhinhhai1908",
301
+ "altonghop",
302
+ "antvtruyenhinhcongannhandan",
303
+ "baihoc10phut",
304
+ "battlecry.khampha",
305
+ "betterversionvn",
306
+ "blogkhoinghiep",
307
+ "bumcn",
308
+ "caikinhdi_vn",
309
+ "canthitg",
310
+ "chanthienmybachnien",
311
+ "chauanhchao",
312
+ "cosu",
313
+ "cungmaivaobep-monan-amthuc",
314
+ "daiptthphuyen",
315
+ "daiptthtv",
316
+ "daitruyenhinhangiang",
317
+ "daitruyenhinhbacgiang",
318
+ "dannytran2375",
319
+ "daybehoc5489",
320
+ "daylaphegame",
321
+ "dienmay",
322
+ "ducisreal",
323
+ "duongfg",
324
+ "duyluandethuong",
325
+ "duythanhish",
326
+ "elroydevops",
327
+ "gc.gamelab",
328
+ "hacthaybachthay",
329
+ "hagiangtv475",
330
+ "haiduongtv247",
331
+ "hanamtv8831",
332
+ "hangphimtailieudienanhnd",
333
+ "haugiangtv",
334
+ "haunauday",
335
+ "hieu-tv",
336
+ "hoshiphan",
337
+ "jakinatsumi2915",
338
+ "kechuyentieuhoc1719",
339
+ "kenhcovan",
340
+ "khalid_dinh",
341
+ "kiaralah",
342
+ "laichautv",
343
+ "langsontvtube",
344
+ "megame_official",
345
+ "minvestvn",
346
+ "nguoithanhcong1991",
347
+ "nhatkycuocsong.",
348
+ "ntcanima",
349
+ "ptthbentre",
350
+ "ptthquangbinh",
351
+ "qrt",
352
+ "quangninhtv",
353
+ "snewsvn",
354
+ "soctrangtv",
355
+ "sunhuynpodcast",
356
+ "tamhonanuong",
357
+ "tgddreview",
358
+ "thaibinhtv",
359
+ "thanhnamedu",
360
+ "thanhnientvnews",
361
+ "thbrt",
362
+ "thieunhitv3630",
363
+ "thtpct",
364
+ "tinnhanh3phut868",
365
+ "toansam",
366
+ "toidicodedaoblog",
367
+ "tranquochuywecommit",
368
+ "tranvyvy",
369
+ "truyenhinh4k",
370
+ "truyenhinhbinhthuan",
371
+ "truyenhinhcamau69",
372
+ "truyenhinhdongnai_dnrtv",
373
+ "truyenhinhgialai",
374
+ "truyenhinhlaocai",
375
+ "truyenhinhnghean",
376
+ "truyenhinhvinhphuc",
377
+ "txtofficial8798",
378
+ "vanhkhuyenle",
379
+ "vietnh1009",
380
+ "visaothenhipodcast",
381
+ "vtc14",
382
+ "vtcnow",
383
+ "vtv24",
384
+ "vuive123",
385
+ "zombiev4",
386
+ ]
387
+ LANG2BYTE.update({ch: 30 + i for i, ch in enumerate(CHANNELS)})
388
+ # Replace [tag] markers in the text with their single-byte codes
+ for tag, byte_val in LANG2BYTE.items():
+ pattern = f"[{tag}]".encode("ascii") # e.g. b"[5phutcrypto]"
+ code = bytes([byte_val]) # e.g. b"\x1e"
392
+ replaced_bytes = replaced_bytes.replace(pattern, code)
393
+ text_tokens = list(replaced_bytes)
394
+
395
+ current_len = len(text_tokens)
396
+ padding_needed = max_len - current_len
397
+ if padding_needed <= 0:
398
+ text_tokens = text_tokens[:max_len]
399
+ padded_text_np = np.array(text_tokens, dtype=np.uint8)
400
+ else:
401
+ padded_text_np = np.pad(
402
+ text_tokens,
403
+ (0, padding_needed),
404
+ mode="constant",
405
+ constant_values=text_pad_value,
406
+ ).astype(np.uint8)
407
+
408
+ src_tokens = torch.from_numpy(padded_text_np).to(torch.long).to(self.device).unsqueeze(0) # [1, S]
409
+ src_positions = torch.arange(max_len, device=self.device).to(torch.long).unsqueeze(0) # [1, S]
410
+
411
+ src_padding_mask = (src_tokens != text_pad_value).to(self.device) # [1, S]
412
+
413
+ enc_self_attn_mask = self._create_attn_mask(src_padding_mask, src_padding_mask, is_causal=False) # [1, S, S]
414
+
415
+ return src_tokens, src_positions, src_padding_mask, enc_self_attn_mask
416
+
417
+ @torch.inference_mode()
418
+ def generate(
419
+ self,
420
+ text: str,
421
+ max_tokens: int | None = None,
422
+ cfg_scale: float = 3.0,
423
+ temperature: float = 1.3,
424
+ top_p: float = 0.95,
425
+ use_cfg_filter: bool = True,
426
+ use_torch_compile: bool = False,
427
+ cfg_filter_top_k: int = 35,
428
+ audio_prompt_path: str | None = None,
429
+ ) -> np.ndarray:
430
+ """
431
+ Generates audio from a text prompt (and optional audio prompt) using the Nari model.
432
+
433
+ Returns:
434
+ A NumPy array containing the generated audio waveform.
435
+ """
436
+ num_channels = self.config.data.channels
437
+ audio_bos_value = self.config.data.audio_bos_value
438
+ audio_eos_value = self.config.data.audio_eos_value
439
+ audio_pad_value = self.config.data.audio_pad_value
440
+ delay_pattern = self.config.data.delay_pattern
441
+ max_tokens = self.config.data.audio_length if max_tokens is None else max_tokens
442
+ delay_tensor = torch.tensor(delay_pattern, dtype=torch.long, device=self.device)
443
+ max_delay_pattern = max(delay_pattern)
444
+ self.model.eval()
445
+
446
+ (
447
+ cond_src_BxS,
448
+ cond_src_positions_BxS,
449
+ cond_src_padding_mask_BxS,
450
+ cond_enc_self_attn_mask_Bx1xSxS,
451
+ ) = self._prepare_text_input(text)
452
+
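+ # Classifier-free guidance batch: row 0 is the unconditional (zeroed text) input, row 1 the conditional one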
453
+ unc_src_BxS = torch.zeros_like(cond_src_BxS)
454
+ src_BxS = torch.cat([unc_src_BxS, cond_src_BxS], dim=0)
455
+ src_positions_BxS = cond_src_positions_BxS.expand(2, -1)
456
+ src_padding_mask_BxS = cond_src_padding_mask_BxS.expand(2, -1)
457
+ enc_self_attn_mask_Bx1xSxS = cond_enc_self_attn_mask_Bx1xSxS.expand(2, -1, -1, -1)
458
+
459
+ # 2. Encoder Pass
460
+ # with torch.autocast(device_type="cuda", dtype=forward_dtype):
461
+ encoder_out = self.model.encoder(
462
+ x_ids=src_BxS,
463
+ src_positions=src_positions_BxS,
464
+ deterministic=True,
465
+ attn_mask=enc_self_attn_mask_Bx1xSxS,
466
+ ) # Shape: (B, S, E)
467
+
468
+ # 3. Prepare Decoder Inputs
469
+ # 3-1. Allocate KV Cache (Static)
470
+ decoder_cross_attention_cache: list[KVCache] = self.model.decoder.precompute_cross_attention_kv(
471
+ max_tokens, encoder_out, src_positions_BxS
472
+ )
473
+
474
+ decoder_self_attention_cache: list[KVCache] = []
475
+ for _ in range(self.model.decoder.num_layers):
476
+ decoder_self_attention_cache.append(
477
+ KVCache(
478
+ self.config.model.decoder.gqa_query_heads,
479
+ max_tokens,
480
+ self.config.model.decoder.gqa_head_dim,
481
+ self.device,
482
+ )
483
+ )
484
+
485
+ # 3-2. Initialize Decoder Inputs
486
+ generated_BxTxC = torch.full(
487
+ (2, 1, num_channels),
488
+ fill_value=audio_bos_value,
489
+ dtype=torch.long,
490
+ device=self.device,
491
+ )
492
+
493
+ current_step = 0
494
+ prompt_len_inc_bos = 1 # Start with BOS length
495
+
+ # 3-3. Load Audio Prompt (if provided)
+ if audio_prompt_path is not None:
+     audio_prompt, sr = torchaudio.load(audio_prompt_path, channels_first=True)  # C, T
+     if sr != 44100:  # Resample to 44.1kHz
+         audio_prompt = torchaudio.functional.resample(audio_prompt, sr, 44100)
+     audio_prompt = audio_prompt.to(self.device).unsqueeze(0)  # 1, C, T
+     audio_prompt = audio_to_codebook(self.dac_model, audio_prompt, data_config=self.config.data)
+     print("✅ Prompt shape:", audio_prompt.shape)
+     generated_BxTxC = torch.cat([generated_BxTxC, audio_prompt.expand(2, -1, -1)], dim=1)
+
+     prefill_len = generated_BxTxC.shape[1]
+     prompt_len_inc_bos = prefill_len
+     prefill_tgt_pos = torch.arange(prefill_len, device=self.device).unsqueeze(0).expand(2, -1)
+     prefill_tgt_padding_mask = (generated_BxTxC != audio_pad_value).any(dim=2)
+
+     prefill_self_attn_mask = self._create_attn_mask(
+         prefill_tgt_padding_mask,
+         prefill_tgt_padding_mask,
+         is_causal=True,
+     )
+     prefill_cross_attn_mask = self._create_attn_mask(
+         prefill_tgt_padding_mask,
+         src_padding_mask_BxS,
+         is_causal=False,
+     )
+
+     _ = self.model.decoder.forward(
+         tgt_ids_BxTxC=generated_BxTxC,
+         encoder_out=encoder_out,
+         tgt_positions=prefill_tgt_pos,
+         src_positions=src_positions_BxS,
+         deterministic=True,
+         self_attn_mask=prefill_self_attn_mask,
+         cross_attn_mask=prefill_cross_attn_mask,
+         self_attention_cache=decoder_self_attention_cache,
+         cross_attention_cache=decoder_cross_attention_cache,
+     )
+
+     current_step = prefill_len - 1
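+ # The prefill pass above runs the whole BOS+prompt sequence through the
+ # decoder once, discarding the logits; its only purpose is to populate the
+ # self-attention KV caches so the loop below can resume generation from
+ # position `prefill_len - 1`.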
+
+ # 4. Autoregressive Generation Loop
+ eos_detected_channel_0 = False
+ eos_countdown = -1
+ extra_steps_after_eos = 30
+ # Make generated_BxTxC a fixed-size tensor:
+ # length is either 1 + max_tokens or 1 + prompt_len + max_tokens.
+ generated_BxTxC = torch.cat(
+     [
+         generated_BxTxC,
+         torch.full(
+             (2, max_tokens, num_channels),
+             fill_value=-1,
+             dtype=torch.long,
+             device=self.device,
+         ),
+     ],
+     dim=1,
+ )
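+ # The slots appended above are filled with -1 as a "not yet generated"
+ # sentinel and are overwritten in place at each step; keeping the tensor a
+ # fixed size avoids reallocations and keeps shapes static, which also plays
+ # well with torch.compile below.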
+
+ decode_step = self.model.decoder.decode_step
+ if use_torch_compile:
+     decode_step = torch.compile(
+         self.model.decoder.decode_step,
+         mode="default",
+     )
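+ # Note: torch.compile traces and optimizes decode_step lazily, so the first
+ # generation step pays a one-time compilation cost; subsequent steps reuse
+ # the compiled graph.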
+
+ tgt_padding_mask = (
+     (generated_BxTxC[:, -1, :].unsqueeze(1) != audio_pad_value).any(dim=2).to(self.device)
+ )  # [B, 1]
+ # Generated tokens are never PAD, so we can use a fixed mask
+ decoder_cross_attn_mask = self._create_attn_mask(
+     tgt_padding_mask,  # Query mask [B, 1]
+     src_padding_mask_BxS,  # Key mask [B, S]
+     is_causal=False,
+ )  # [B, 1, 1, S]
+
+ for step in range(current_step, current_step + max_tokens):
+     tgt_ids_Bx1xC = generated_BxTxC[:, step, :].unsqueeze(1)
+     tgt_pos_Bx1 = torch.full(
+         (2, 1),
+         fill_value=step,
+         dtype=torch.long,
+         device=self.device,
+     )
+
+     logits_Bx1xCxV, new_cache = decode_step(
+         tgt_ids_Bx1xC=tgt_ids_Bx1xC,
+         tgt_pos_Bx1=tgt_pos_Bx1,
+         encoder_out=encoder_out,
+         self_attn_mask=None,
+         cross_attn_mask=decoder_cross_attn_mask,
+         self_attention_cache=decoder_self_attention_cache,
+         cross_attention_cache=decoder_cross_attention_cache,
+     )
+
+     for i, layer_cache in enumerate(decoder_self_attention_cache):
+         layer_cache.update_cache(new_cache[i][0], new_cache[i][1])
+
+     V = self.config.model.tgt_vocab_size
+     logits_last_BxCxV = logits_Bx1xCxV[:, -1, :, :]  # B, C, V
+     uncond_logits_CxV = logits_last_BxCxV[0, :, :]
+     cond_logits_CxV = logits_last_BxCxV[1, :, :]
+
+     cfg_logits_CxV = cond_logits_CxV + cfg_scale * (cond_logits_CxV - uncond_logits_CxV)
+
+     logits_CxV = cfg_logits_CxV.reshape((-1, V))  # C, V
+     logits_CxV[:, 1025:] = -torch.inf  # hard-mask vocab IDs >= 1025 so they can never be sampled
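+     # Classifier-free guidance in logit space: with guidance weight w,
+     #     cfg = cond + w * (cond - uncond) = (1 + w) * cond - w * uncond,
+     # i.e. the conditional logits are pushed further away from the
+     # unconditional ones as cfg_scale grows; cfg_scale = 0 recovers the
+     # purely conditional distribution.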
+
+     # Sample next token
+     pred_C = _sample_next_token(
+         logits_CxV.float(),
+         temperature=temperature,
+         top_p=top_p,
+         use_cfg_filter=use_cfg_filter,
+         cfg_filter_top_k=cfg_filter_top_k,
+     )
+
+     generation_step_index = step - current_step
+     if audio_prompt_path is None:
+         # Honor the channel delay pattern: channel c may only emit real
+         # tokens once generation_step_index >= delay_pattern[c]; until then
+         # it is forced to BOS.
+         pred_C = torch.where(
+             generation_step_index >= delay_tensor,
+             pred_C,
+             audio_bos_value,
+         )
+
+     generated_BxTxC[:, step + 1, :] = pred_C.unsqueeze(0).expand(2, -1)
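+     # For example, with a hypothetical delay pattern [0, 8, 9] the frames
+     # written above would look like:
+     #     step 0: [tok, BOS, BOS]
+     #     step 8: [tok, tok, BOS]
+     #     step 9: [tok, tok, tok]
+     # i.e. each codebook channel lags channel 0 by its configured delay.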
+
+     if not eos_detected_channel_0 and pred_C[0] == audio_eos_value:
+         eos_detected_channel_0 = True
+         eos_countdown = extra_steps_after_eos
+
+     if eos_countdown > 0:
+         # Wind down after EOS on channel 0: each channel i receives EOS when
+         # the post-EOS step count reaches its delay, and PAD afterwards, so
+         # the delayed streams all terminate cleanly once undelayed.
+         step_after_eos = max_delay_pattern - eos_countdown
+         for i, d in enumerate(delay_pattern):
+             if step_after_eos == d:
+                 generated_BxTxC[:, step + 1, i] = audio_eos_value
+             elif step_after_eos > d:
+                 generated_BxTxC[:, step + 1, i] = audio_pad_value
+         eos_countdown -= 1
+         if eos_countdown == 0:
+             break
+
+     generation_step_index = step - current_step + 1
+
+ output_codes = generated_BxTxC[:, prompt_len_inc_bos : step + 1, :]
+
+ generated_codes = output_codes[0]
+
+ audio = codebook_to_audio(
+     generated_codes.transpose(1, 0), self.dac_model, delay_pattern, B=1, T=max_tokens, C=num_channels
+ )
+ print("🟩 Total generated tokens:", generated_codes.shape[0])
+ return audio.squeeze().cpu().numpy()
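+
+ # A minimal usage sketch (the class name `Dia`, the `from_pretrained`
+ # loader, and the checkpoint ID are assumptions not shown in this diff; the
+ # keyword arguments are the ones `generate` actually consumes, and 44100 is
+ # the sample rate this method resamples prompts to):
+ #
+ #     import soundfile as sf
+ #
+ #     model = Dia.from_pretrained("<checkpoint>")  # hypothetical loader
+ #     waveform = model.generate(
+ #         text="Hello there, this is a test.",
+ #         cfg_scale=3.0,
+ #         temperature=1.3,
+ #         top_p=0.95,
+ #         use_torch_compile=False,
+ #     )
+ #     sf.write("output.wav", waveform, 44100)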