multimodalart HF Staff commited on
Commit
bbb0e68
Β·
verified Β·
1 Parent(s): 2160ec3

Upload 25 files

Browse files
kugelaudio_open/__init__.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""KugelAudio - Open Source Text-to-Speech Model

KugelAudio is a state-of-the-art neural text-to-speech model that generates
natural, expressive speech from text with voice cloning capabilities.

Example:
    >>> from kugelaudio_open import KugelAudioForConditionalGenerationInference
    >>> from transformers import AutoModel
    >>>
    >>> # Load the model
    >>> model = AutoModel.from_pretrained("kugelaudio/kugelaudio-0-open")
"""

# Package version; keep in sync with the distribution metadata.
__version__ = "0.1.0"

from .configs import (
    KugelAudioAcousticTokenizerConfig,
    KugelAudioConfig,
    KugelAudioDiffusionHeadConfig,
    KugelAudioSemanticTokenizerConfig,
)
from .models import (
    KugelAudioAcousticTokenizerModel,
    KugelAudioDiffusionHead,
    KugelAudioForConditionalGeneration,
    KugelAudioForConditionalGenerationInference,
    KugelAudioModel,
    KugelAudioPreTrainedModel,
    KugelAudioSemanticTokenizerModel,
)
from .processors import KugelAudioProcessor
from .schedule import DPMSolverMultistepScheduler
from .watermark import AudioWatermark
34
+
35
+
36
# Lazy imports for optional components
def launch_ui(*args, **kwargs):
    """Launch the Gradio web interface.

    All positional and keyword arguments are forwarded to
    ``kugelaudio_open.ui.launch_ui``.

    Raises:
        ImportError: If the optional UI dependencies (gradio) are not
            installed.
    """
    try:
        from .ui import launch_ui as _launch_ui
    except ImportError as err:
        # Chain the original error so the real cause (e.g. a missing
        # transitive dependency) stays visible in the traceback.
        raise ImportError(
            "Gradio is required for the web interface. " "Install it with: pip install gradio"
        ) from err
    # Call OUTSIDE the try-block: an ImportError raised while the UI is
    # running must not be misreported as "gradio is missing".
    return _launch_ui(*args, **kwargs)
47
+
48
+
49
# Names exported for ``from kugelaudio_open import *``.
__all__ = [
    # Version
    "__version__",
    # Configs
    "KugelAudioConfig",
    "KugelAudioAcousticTokenizerConfig",
    "KugelAudioSemanticTokenizerConfig",
    "KugelAudioDiffusionHeadConfig",
    # Models
    "KugelAudioModel",
    "KugelAudioPreTrainedModel",
    "KugelAudioForConditionalGeneration",
    "KugelAudioForConditionalGenerationInference",
    "KugelAudioAcousticTokenizerModel",
    "KugelAudioSemanticTokenizerModel",
    "KugelAudioDiffusionHead",
    # Scheduler
    "DPMSolverMultistepScheduler",
    # Processors
    "KugelAudioProcessor",
    # Watermark
    "AudioWatermark",
    # UI
    "launch_ui",
]
kugelaudio_open/cli.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Command-line interface for KugelAudio."""
3
+
4
+ import argparse
5
+ import sys
6
+
7
+
8
def _build_parser():
    """Create the top-level argument parser with the ui/generate/verify subcommands."""
    parser = argparse.ArgumentParser(
        description="KugelAudio - Open-source text-to-speech",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Launch web interface
  kugelaudio ui

  # Launch with public share link
  kugelaudio ui --share

  # Generate speech from command line
  kugelaudio generate "Hello world!" -o output.wav

  # Check watermark in audio file
  kugelaudio verify audio.wav
""",
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # UI command
    ui_parser = subparsers.add_parser("ui", help="Launch Gradio web interface")
    ui_parser.add_argument("--share", action="store_true", help="Create public share link")
    ui_parser.add_argument("--host", default="127.0.0.1", help="Server hostname")
    ui_parser.add_argument("--port", type=int, default=7860, help="Server port")

    # Generate command
    gen_parser = subparsers.add_parser("generate", help="Generate speech from text")
    gen_parser.add_argument("text", help="Text to synthesize")
    gen_parser.add_argument("-o", "--output", default="output.wav", help="Output file path")
    gen_parser.add_argument("-r", "--reference", help="Reference audio for voice cloning")
    gen_parser.add_argument("--model", default="kugelaudio/kugelaudio-0-open", help="Model ID")
    gen_parser.add_argument("--cfg-scale", type=float, default=3.0, help="Guidance scale")

    # Verify command
    verify_parser = subparsers.add_parser("verify", help="Check watermark in audio")
    verify_parser.add_argument("audio", help="Audio file to check")

    return parser


def _run_ui(args):
    """Launch the Gradio web interface with the parsed server options."""
    from kugelaudio_open.ui import launch_app

    launch_app(
        share=args.share,
        server_name=args.host,
        server_port=args.port,
    )


def _run_generate(args):
    """Synthesize speech for ``args.text`` and save it to ``args.output``."""
    import torch

    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # bfloat16 keeps GPU memory low; CPU inference stays in float32.
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    print(f"Loading model {args.model}...")
    model = KugelAudioForConditionalGenerationInference.from_pretrained(
        args.model, torch_dtype=dtype
    ).to(device)
    model.eval()

    processor = KugelAudioProcessor.from_pretrained(args.model)

    # Process inputs (voice_prompt passed to processor for proper handling)
    inputs = processor(
        text=args.text,
        voice_prompt=args.reference,  # Pass reference audio path directly
        return_tensors="pt",
    )
    inputs = {
        k: v.to(device) if isinstance(v, torch.Tensor) else v
        for k, v in inputs.items()
    }

    print("Generating speech...")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=args.cfg_scale,
            max_new_tokens=4096,
        )

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]

    processor.save_audio(audio, args.output)
    print(f"Audio saved to {args.output}")


def _run_verify(args):
    """Check ``args.audio`` for the KugelAudio watermark and report the result."""
    # NOTE: the original also imported numpy here, but never used it.
    import soundfile as sf

    from kugelaudio_open.watermark import AudioWatermark

    audio, sr = sf.read(args.audio)

    watermark = AudioWatermark()
    result = watermark.detect(audio, sample_rate=sr)

    if result.detected:
        print(f"✅ Watermark DETECTED (confidence: {result.confidence:.1%})")
        print("This audio was generated by KugelAudio.")
    else:
        print(f"❌ No watermark detected (confidence: {result.confidence:.1%})")
        print("This audio does not appear to be generated by KugelAudio.")


def main():
    """CLI entry point: parse arguments and dispatch to the chosen subcommand.

    Prints help and exits with status 1 when no subcommand is given.
    Heavy dependencies (torch, soundfile, gradio) are imported lazily per
    subcommand so that ``--help`` stays fast.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.command == "ui":
        _run_ui(args)
    elif args.command == "generate":
        _run_generate(args)
    elif args.command == "verify":
        _run_verify(args)
    else:
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
kugelaudio_open/configs/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KugelAudio configuration classes."""
2
+
3
+ from .model_config import (
4
+ KugelAudioConfig,
5
+ KugelAudioAcousticTokenizerConfig,
6
+ KugelAudioSemanticTokenizerConfig,
7
+ KugelAudioDiffusionHeadConfig,
8
+ # Aliases
9
+ AcousticTokenizerConfig,
10
+ SemanticTokenizerConfig,
11
+ DiffusionHeadConfig,
12
+ )
13
+
14
# Public API of the configs subpackage.
__all__ = [
    "KugelAudioConfig",
    "KugelAudioAcousticTokenizerConfig",
    "KugelAudioSemanticTokenizerConfig",
    "KugelAudioDiffusionHeadConfig",
    # Aliases kept for backwards compatibility
    "AcousticTokenizerConfig",
    "SemanticTokenizerConfig",
    "DiffusionHeadConfig",
]
kugelaudio_open/configs/kugelaudio_1.5b.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "kugelaudio",
3
+ "_attn_implementation_autoset": true,
4
+ "acoustic_vae_dim": 64,
5
+ "tts_backbone_num_hidden_layers": 20,
6
+ "acoustic_tokenizer_config": {
7
+ "model_type": "kugelaudio_acoustic_tokenizer",
8
+ "causal": true,
9
+ "channels": 1,
10
+ "conv_bias": true,
11
+ "conv_norm": "none",
12
+ "decoder_depths": null,
13
+ "decoder_n_filters": 32,
14
+ "decoder_ratios": [8, 5, 5, 4, 2, 2],
15
+ "disable_last_norm": true,
16
+ "encoder_depths": "3-3-3-3-3-3-8",
17
+ "encoder_n_filters": 32,
18
+ "encoder_ratios": [8, 5, 5, 4, 2, 2],
19
+ "fix_std": 0.5,
20
+ "layer_scale_init_value": 1e-06,
21
+ "layernorm": "RMSNorm",
22
+ "layernorm_elementwise_affine": true,
23
+ "layernorm_eps": 1e-05,
24
+ "mixer_layer": "depthwise_conv",
25
+ "pad_mode": "constant",
26
+ "std_dist_type": "gaussian",
27
+ "vae_dim": 64,
28
+ "weight_init_value": 0.01
29
+ },
30
+ "decoder_config": {
31
+ "model_type": "qwen2",
32
+ "attention_dropout": 0.0,
33
+ "hidden_act": "silu",
34
+ "hidden_size": 1536,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": 8960,
37
+ "max_position_embeddings": 65536,
38
+ "max_window_layers": 28,
39
+ "num_attention_heads": 12,
40
+ "num_hidden_layers": 28,
41
+ "num_key_value_heads": 2,
42
+ "rms_norm_eps": 1e-06,
43
+ "rope_scaling": null,
44
+ "rope_theta": 1000000.0,
45
+ "sliding_window": null,
46
+ "tie_word_embeddings": true,
47
+ "torch_dtype": "bfloat16",
48
+ "use_cache": true,
49
+ "use_sliding_window": false,
50
+ "vocab_size": 151936
51
+ },
52
+ "diffusion_head_config": {
53
+ "model_type": "kugelaudio_diffusion_head",
54
+ "ddpm_batch_mul": 4,
55
+ "ddpm_beta_schedule": "cosine",
56
+ "ddpm_num_inference_steps": 20,
57
+ "ddpm_num_steps": 1000,
58
+ "diffusion_type": "ddpm",
59
+ "head_ffn_ratio": 3.0,
60
+ "head_layers": 4,
61
+ "hidden_size": 1536,
62
+ "latent_size": 64,
63
+ "prediction_type": "v_prediction",
64
+ "rms_norm_eps": 1e-05,
65
+ "speech_vae_dim": 64
66
+ },
67
+ "torch_dtype": "bfloat16"
68
+ }
kugelaudio_open/configs/kugelaudio_7b.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "kugelaudio",
3
+ "_attn_implementation_autoset": true,
4
+ "acoustic_vae_dim": 64,
5
+ "tts_backbone_num_hidden_layers": 20,
6
+ "acoustic_tokenizer_config": {
7
+ "model_type": "kugelaudio_acoustic_tokenizer",
8
+ "causal": true,
9
+ "channels": 1,
10
+ "conv_bias": true,
11
+ "conv_norm": "none",
12
+ "decoder_depths": null,
13
+ "decoder_n_filters": 32,
14
+ "decoder_ratios": [8, 5, 5, 4, 2, 2],
15
+ "disable_last_norm": true,
16
+ "encoder_depths": "3-3-3-3-3-3-8",
17
+ "encoder_n_filters": 32,
18
+ "encoder_ratios": [8, 5, 5, 4, 2, 2],
19
+ "fix_std": 0.5,
20
+ "layer_scale_init_value": 1e-06,
21
+ "layernorm": "RMSNorm",
22
+ "layernorm_elementwise_affine": true,
23
+ "layernorm_eps": 1e-05,
24
+ "mixer_layer": "depthwise_conv",
25
+ "pad_mode": "constant",
26
+ "std_dist_type": "gaussian",
27
+ "vae_dim": 64,
28
+ "weight_init_value": 0.01
29
+ },
30
+ "decoder_config": {
31
+ "model_type": "qwen2",
32
+ "attention_dropout": 0.0,
33
+ "hidden_act": "silu",
34
+ "hidden_size": 3584,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": 18944,
37
+ "max_position_embeddings": 32768,
38
+ "max_window_layers": 28,
39
+ "num_attention_heads": 28,
40
+ "num_hidden_layers": 28,
41
+ "num_key_value_heads": 4,
42
+ "rms_norm_eps": 1e-06,
43
+ "rope_scaling": null,
44
+ "rope_theta": 1000000.0,
45
+ "sliding_window": null,
46
+ "tie_word_embeddings": false,
47
+ "torch_dtype": "bfloat16",
48
+ "use_cache": true,
49
+ "use_sliding_window": false,
50
+ "vocab_size": 152064
51
+ },
52
+ "diffusion_head_config": {
53
+ "model_type": "kugelaudio_diffusion_head",
54
+ "ddpm_batch_mul": 4,
55
+ "ddpm_beta_schedule": "cosine",
56
+ "ddpm_num_inference_steps": 20,
57
+ "ddpm_num_steps": 1000,
58
+ "diffusion_type": "ddpm",
59
+ "head_ffn_ratio": 3.0,
60
+ "head_layers": 4,
61
+ "hidden_size": 3584,
62
+ "latent_size": 64,
63
+ "prediction_type": "v_prediction",
64
+ "rms_norm_eps": 1e-05,
65
+ "speech_vae_dim": 64
66
+ },
67
+ "torch_dtype": "bfloat16"
68
+ }
kugelaudio_open/configs/model_config.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Configuration classes for KugelAudio models."""

from typing import Optional, List, Union
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
from transformers.utils import logging

# Module-level logger following the transformers convention.
logger = logging.get_logger(__name__)
9
+
10
+
11
class KugelAudioAcousticTokenizerConfig(PretrainedConfig):
    """Configuration for the acoustic tokenizer.

    The acoustic tokenizer converts continuous speech latents back to audio waveforms.
    It uses a hierarchical convolutional architecture with multiple upsampling stages.
    """

    model_type = "kugelaudio_acoustic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0.5,
        std_dist_type: str = "gaussian",
        # Common settings
        mixer_layer: str = "depthwise_conv",
        conv_norm: str = "none",
        pad_mode: str = "constant",
        disable_last_norm: bool = True,
        layernorm: str = "RMSNorm",
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # Encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = None,
        encoder_depths: str = "3-3-3-3-3-3-8",
        # Decoder specific
        decoder_n_filters: int = 32,
        decoder_ratios: Optional[List[int]] = None,
        decoder_depths: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Resolve defaults up front: the decoder mirrors the encoder's
        # up/downsampling ratios unless overridden.
        if encoder_ratios is None:
            encoder_ratios = [8, 5, 5, 4, 2, 2]
        if decoder_ratios is None:
            decoder_ratios = encoder_ratios

        # Waveform / latent-distribution parameters.
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # Shared convolution / normalization settings.
        self.mixer_layer = mixer_layer
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_eps = layernorm_eps
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value

        # Encoder topology.
        self.encoder_n_filters = encoder_n_filters
        self.encoder_ratios = encoder_ratios
        self.encoder_depths = encoder_depths

        # Decoder topology.
        self.decoder_n_filters = decoder_n_filters
        self.decoder_ratios = decoder_ratios
        self.decoder_depths = decoder_depths
78
+
79
+
80
class KugelAudioSemanticTokenizerConfig(PretrainedConfig):
    """Configuration for the semantic tokenizer.

    The semantic tokenizer extracts semantic features from audio for conditioning.
    """

    model_type = "kugelaudio_semantic_tokenizer"

    def __init__(
        self,
        channels: int = 1,
        corpus_normalize: float = 0.0,
        causal: bool = True,
        vae_dim: int = 64,
        fix_std: float = 0,
        std_dist_type: str = "none",
        # Common settings
        mixer_layer: str = "depthwise_conv",
        conv_norm: str = "none",
        pad_mode: str = "constant",
        disable_last_norm: bool = True,
        layernorm: str = "RMSNorm",
        layernorm_eps: float = 1e-5,
        layernorm_elementwise_affine: bool = True,
        conv_bias: bool = True,
        layer_scale_init_value: float = 1e-6,
        weight_init_value: float = 1e-2,
        # Encoder specific
        encoder_n_filters: int = 32,
        encoder_ratios: Optional[List[int]] = None,
        encoder_depths: str = "3-3-3-3-3-3-8",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Default downsampling ratios mirror the acoustic tokenizer.
        if encoder_ratios is None:
            encoder_ratios = [8, 5, 5, 4, 2, 2]

        # Waveform / latent-distribution parameters.
        self.channels = channels
        self.corpus_normalize = corpus_normalize
        self.causal = causal
        self.vae_dim = vae_dim
        self.fix_std = fix_std
        self.std_dist_type = std_dist_type

        # Shared convolution / normalization settings.
        self.mixer_layer = mixer_layer
        self.conv_norm = conv_norm
        self.pad_mode = pad_mode
        self.disable_last_norm = disable_last_norm
        self.layernorm = layernorm
        self.layernorm_eps = layernorm_eps
        self.layernorm_elementwise_affine = layernorm_elementwise_affine
        self.conv_bias = conv_bias
        self.layer_scale_init_value = layer_scale_init_value
        self.weight_init_value = weight_init_value

        # Encoder topology (this tokenizer is encoder-only).
        self.encoder_n_filters = encoder_n_filters
        self.encoder_ratios = encoder_ratios
        self.encoder_depths = encoder_depths
137
+
138
+
139
class KugelAudioDiffusionHeadConfig(PretrainedConfig):
    """Configuration for the diffusion prediction head.

    The diffusion head predicts speech latents from text-conditioned hidden states
    using a denoising diffusion process.

    Args:
        hidden_size: Width of the conditioning hidden states.
        head_layers: Number of layers in the prediction head.
        head_ffn_ratio: FFN expansion ratio inside the head.
        rms_norm_eps: Epsilon for the head's RMS normalization.
        latent_size: Dimensionality of the predicted latents.
        speech_vae_dim: Speech VAE latent dimensionality (``None`` to defer).
        prediction_type: Diffusion parameterization (e.g. ``"v_prediction"``).
        diffusion_type: Diffusion formulation (e.g. ``"ddpm"``).
        ddpm_num_steps: Number of diffusion steps in the full schedule.
        ddpm_num_inference_steps: Number of denoising steps at inference.
        ddpm_beta_schedule: Name of the noise (beta) schedule.
        ddpm_algorithm_type: Solver algorithm identifier passed to the scheduler.
        ddpm_batch_mul: Diffusion batch multiplier (consumed by the head
            implementation).
    """

    model_type = "kugelaudio_diffusion_head"

    def __init__(
        self,
        hidden_size: int = 768,
        head_layers: int = 4,
        head_ffn_ratio: float = 3.0,
        rms_norm_eps: float = 1e-5,
        latent_size: int = 64,
        speech_vae_dim: Optional[int] = None,
        prediction_type: str = "v_prediction",
        diffusion_type: str = "ddpm",
        ddpm_num_steps: int = 1000,
        ddpm_num_inference_steps: int = 20,
        ddpm_beta_schedule: str = "cosine",
        ddpm_algorithm_type: str = "sde-dpmsolver++",
        ddpm_batch_mul: int = 4,
        **kwargs,
    ):
        # NOTE(review): unlike the other configs in this module, attributes are
        # assigned *before* super().__init__() — presumably deliberate; confirm
        # before reordering.
        self.hidden_size = hidden_size
        self.head_layers = head_layers
        self.head_ffn_ratio = head_ffn_ratio
        self.rms_norm_eps = rms_norm_eps
        self.latent_size = latent_size
        self.speech_vae_dim = speech_vae_dim
        self.prediction_type = prediction_type
        self.diffusion_type = diffusion_type
        self.ddpm_num_steps = ddpm_num_steps
        self.ddpm_num_inference_steps = ddpm_num_inference_steps
        self.ddpm_beta_schedule = ddpm_beta_schedule
        self.ddpm_algorithm_type = ddpm_algorithm_type
        self.ddpm_batch_mul = ddpm_batch_mul

        super().__init__(**kwargs)
180
+
181
+
182
class KugelAudioConfig(PretrainedConfig):
    """Main configuration for KugelAudio TTS model.

    This configuration combines:
    - A language model backbone (Qwen2) for text understanding
    - An acoustic tokenizer for audio encoding/decoding
    - A semantic tokenizer for semantic feature extraction
    - A diffusion head for speech latent prediction

    Example:
        >>> from kugelaudio_open import KugelAudioConfig
        >>> config = KugelAudioConfig.from_pretrained("kugelaudio/kugelaudio-0-open")
    """

    model_type = "kugelaudio"
    is_composition = True

    sub_configs = {
        "acoustic_tokenizer_config": KugelAudioAcousticTokenizerConfig,
        "semantic_tokenizer_config": KugelAudioSemanticTokenizerConfig,
        "decoder_config": Qwen2Config,
        "diffusion_head_config": KugelAudioDiffusionHeadConfig,
    }

    # Tensor parallel plan for distributed inference
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }

    def _normalize_sub_config(self, key, value, forced_model_type):
        """Coerce ``value`` (None | dict | config instance) into the config
        class registered under ``key`` in :attr:`sub_configs`.

        Raises:
            TypeError: If ``value`` has an unsupported type. (Previously an
                unsupported type fell through all branches silently, leaving
                the attribute unset and failing later with AttributeError.)
        """
        config_cls = self.sub_configs[key]
        if value is None:
            return config_cls()
        if isinstance(value, dict):
            value["model_type"] = forced_model_type
            return config_cls(**value)
        if isinstance(value, config_cls):
            return value
        raise TypeError(
            f"{key} must be None, a dict, or a {config_cls.__name__}, "
            f"got {type(value).__name__}"
        )

    def __init__(
        self,
        acoustic_tokenizer_config=None,
        semantic_tokenizer_config=None,
        decoder_config=None,
        diffusion_head_config=None,
        **kwargs,
    ):
        # Disable auto attention implementation selection
        kwargs["_attn_implementation_autoset"] = False

        self.acoustic_tokenizer_config = self._normalize_sub_config(
            "acoustic_tokenizer_config",
            acoustic_tokenizer_config,
            "kugelaudio_acoustic_tokenizer",
        )
        self.semantic_tokenizer_config = self._normalize_sub_config(
            "semantic_tokenizer_config",
            semantic_tokenizer_config,
            "kugelaudio_semantic_tokenizer",
        )

        # The decoder (language model) accepts only Qwen2-style configs.
        if decoder_config is None:
            self.decoder_config = self.sub_configs["decoder_config"]()
        elif isinstance(decoder_config, dict):
            if decoder_config.get("model_type", "") == "qwen2":
                self.decoder_config = Qwen2Config(**decoder_config)
            else:
                raise ValueError(
                    f"Unsupported decoder model type: {decoder_config.get('model_type', '')}"
                )
        elif isinstance(decoder_config, Qwen2Config):
            self.decoder_config = decoder_config
        else:
            raise TypeError(
                f"decoder_config must be None, a dict, or a Qwen2Config, "
                f"got {type(decoder_config).__name__}"
            )

        self.diffusion_head_config = self._normalize_sub_config(
            "diffusion_head_config",
            diffusion_head_config,
            "kugelaudio_diffusion_head",
        )

        # Derived parameters
        self.acoustic_vae_dim = self.acoustic_tokenizer_config.vae_dim
        self.semantic_vae_dim = self.semantic_tokenizer_config.vae_dim

        super().__init__(**kwargs)
273
+
274
+
275
# Aliases for backwards compatibility: older callers imported the short names.
AcousticTokenizerConfig = KugelAudioAcousticTokenizerConfig
SemanticTokenizerConfig = KugelAudioSemanticTokenizerConfig
DiffusionHeadConfig = KugelAudioDiffusionHeadConfig


# Public API of this module; the aliases are re-exported deliberately.
__all__ = [
    "KugelAudioAcousticTokenizerConfig",
    "KugelAudioSemanticTokenizerConfig",
    "KugelAudioDiffusionHeadConfig",
    "KugelAudioConfig",
    # Aliases
    "AcousticTokenizerConfig",
    "SemanticTokenizerConfig",
    "DiffusionHeadConfig",
]
kugelaudio_open/models/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KugelAudio model components."""
2
+
3
+ from .kugelaudio_model import (
4
+ KugelAudioModel,
5
+ KugelAudioPreTrainedModel,
6
+ KugelAudioForConditionalGeneration,
7
+ )
8
+ from .kugelaudio_inference import (
9
+ KugelAudioForConditionalGenerationInference,
10
+ KugelAudioCausalLMOutputWithPast,
11
+ KugelAudioGenerationOutput,
12
+ )
13
+ from .tokenizer import (
14
+ KugelAudioAcousticTokenizerModel,
15
+ KugelAudioSemanticTokenizerModel,
16
+ KugelAudioTokenizerEncoderOutput,
17
+ )
18
+ from .diffusion_head import KugelAudioDiffusionHead
19
+ from .conv_layers import (
20
+ RMSNorm,
21
+ ConvRMSNorm,
22
+ ConvLayerNorm,
23
+ SConv1d,
24
+ SConvTranspose1d,
25
+ )
26
+
27
# Public names re-exported by ``kugelaudio_open.models``.
__all__ = [
    # Main models
    "KugelAudioModel",
    "KugelAudioPreTrainedModel",
    "KugelAudioForConditionalGeneration",
    "KugelAudioForConditionalGenerationInference",
    # Outputs
    "KugelAudioCausalLMOutputWithPast",
    "KugelAudioGenerationOutput",
    # Tokenizers
    "KugelAudioAcousticTokenizerModel",
    "KugelAudioSemanticTokenizerModel",
    "KugelAudioTokenizerEncoderOutput",
    # Components
    "KugelAudioDiffusionHead",
    "RMSNorm",
    "ConvRMSNorm",
    "ConvLayerNorm",
    "SConv1d",
    "SConvTranspose1d",
]
kugelaudio_open/models/conv_layers.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convolutional layers for KugelAudio tokenizers.

This module provides the building blocks for the acoustic and semantic tokenizers,
including streaming-capable convolutions and normalization layers.
"""

import math
import typing as tp
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers.utils import logging

# Module-level logger following the transformers convention.
logger = logging.get_logger(__name__)
18
+
19
+
20
+ # Normalization modules
21
class ConvLayerNorm(nn.LayerNorm):
    """
    Convolution-friendly LayerNorm that moves channels to last dimensions
    before running the normalization and moves them back to original position right after.
    """
    def __init__(self, normalized_shape: tp.Union[int, tp.List[int], torch.Size], **kwargs):
        super().__init__(normalized_shape, **kwargs)

    def forward(self, x):
        x = x.transpose(1, 2)  # b ... t -> b t ...
        # Fix: weight/bias are None when elementwise_affine=False; the
        # unconditional .float() calls crashed with AttributeError.
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        # Normalize in float32 for numerical stability, cast back to input dtype.
        x = nn.functional.layer_norm(
            x.float(), self.normalized_shape, weight, bias, self.eps
        ).type_as(x)
        x = x.transpose(1, 2)  # b t ... -> b ... t
        return x
36
+
37
+
38
class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""

    def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=True, weight_shape=None):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if elementwise_affine:
            shape = weight_shape if weight_shape is not None else (dim,)
            self.weight = nn.Parameter(torch.ones(shape))
        else:
            self.register_parameter('weight', None)

    def _norm(self, x):
        # Divide each vector by its root-mean-square (computed over the last dim).
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Normalize in float32, then cast back to the input dtype.
        normed = self._norm(x.float()).type_as(x)
        return normed if self.weight is None else normed * self.weight

    def extra_repr(self) -> str:
        return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}'


class ConvRMSNorm(RMSNorm):
    """Convolution-friendly RMSNorm."""

    def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=True, weight_shape=None):
        super().__init__(dim, eps, elementwise_affine, weight_shape)

    def forward(self, x):
        # Channels-first input: round-trip through (B, T, C) so the RMS is
        # taken over the channel axis.
        channels_last = x.transpose(1, 2)
        normed = self._norm(channels_last.float()).type_as(channels_last)
        if self.weight is not None:
            normed = normed * self.weight
        return normed.transpose(1, 2)
78
+
79
+
80
# Convolutional layers and utilities
CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
                                 'time_layer_norm', 'layer_norm', 'time_group_norm'])


def apply_parametrization_norm(module: nn.Module, norm: str = 'none') -> nn.Module:
    """Optionally wrap ``module`` in a weight parametrization.

    ``'weight_norm'`` / ``'spectral_norm'`` reparametrize the module's
    weights; all other supported values return the module unchanged.
    """
    assert norm in CONV_NORMALIZATIONS
    if norm == 'weight_norm':
        module = nn.utils.weight_norm(module)
    elif norm == 'spectral_norm':
        module = nn.utils.spectral_norm(module)
    return module


def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs) -> nn.Module:
    """Return the proper normalization module."""
    assert norm in CONV_NORMALIZATIONS
    if norm == 'layer_norm':
        assert isinstance(module, nn.modules.conv._ConvNd)
        return ConvLayerNorm(module.out_channels, **norm_kwargs)
    if norm == 'time_group_norm':
        # Group statistics span the whole sequence, which would leak future
        # frames in a causal/streaming setting.
        if causal:
            raise ValueError("GroupNorm doesn't support causal evaluation.")
        assert isinstance(module, nn.modules.conv._ConvNd)
        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
    return nn.Identity()
108
+
109
+
110
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
                                 padding_total: int = 0) -> int:
    """Calculate extra padding needed for convolution to have the same output length."""
    length = x.shape[-1]
    # Number of (possibly fractional) frames the conv would produce.
    n_frames = (length - kernel_size + padding_total) / stride + 1
    # Smallest length that yields exactly ceil(n_frames) whole frames.
    target_length = (math.ceil(n_frames) - 1) * stride + kernel_size - padding_total
    return target_length - length
117
+
118
+
119
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'zero', value: float = 0.):
    """Pad 1D input with handling for small inputs in reflect mode.

    Args:
        x: Tensor of shape (..., T).
        paddings: (padding_left, padding_right); both must be non-negative.
        mode: Padding mode. ``'zero'`` is accepted as an alias for
            ``'constant'`` — ``F.pad`` has no ``'zero'`` mode, so the
            original default crashed when used.
        value: Fill value for constant padding.
    """
    length = x.shape[-1]
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode == 'zero':
        # Fix: translate the documented-but-invalid 'zero' mode to the
        # F.pad mode it was meant to be.
        mode = 'constant'
    if mode == 'reflect':
        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        # Reflect padding requires pad < input length; right-pad short
        # inputs first, then trim the surplus afterwards.
        if length <= max_pad:
            extra_pad = max_pad - length + 1
            x = F.pad(x, (0, extra_pad))
        padded = F.pad(x, paddings, mode, value)
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]
    else:
        return F.pad(x, paddings, mode, value)
135
+
136
+
137
def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
    """Remove padding from x, handling properly zero padding. Only for 1d!"""
    left, right = paddings
    assert left >= 0 and right >= 0, (left, right)
    assert left + right <= x.shape[-1]
    # Slice away `left` samples from the front and `right` from the back.
    return x[..., left: x.shape[-1] - right]
144
+
145
+
146
class NormConv1d(nn.Module):
    """Wrapper around Conv1d and normalization applied to this conv.

    Args:
        causal: Forwarded to the normalization-module factory.
        norm: Normalization flavor; see ``CONV_NORMALIZATIONS``.
        norm_kwargs: Extra keyword arguments for the normalization module.
            Default changed from a shared mutable ``{}`` to ``None`` to avoid
            the mutable-default-argument pitfall (backward compatible).
    """

    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None, **kwargs):
        super().__init__()
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x
160
+
161
+
162
class NormConvTranspose1d(nn.Module):
    """Wrapper around ConvTranspose1d and normalization applied to this conv.

    Args:
        causal: Forwarded to the normalization-module factory.
        norm: Normalization flavor; see ``CONV_NORMALIZATIONS``.
        norm_kwargs: Extra keyword arguments for the normalization module.
            Default changed from a shared mutable ``{}`` to ``None`` to avoid
            the mutable-default-argument pitfall (backward compatible).
    """

    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None, **kwargs):
        super().__init__()
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x
176
+
177
+
178
class SConv1d(nn.Module):
    """Conv1d with built-in handling of asymmetric or causal padding and normalization.

    Causal mode pads entirely on the left; non-causal mode pads symmetrically.
    Extra right padding is added so that every input frame contributes to a
    full stride window (see `get_extra_padding_for_conv1d`).
    """

    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, dilation: int = 1,
                 groups: int = 1, bias: bool = True, causal: bool = False,
                 norm: str = 'none', norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None,
                 pad_mode: str = 'reflect'):
        super().__init__()
        # Fix: use None instead of a shared mutable default dict.
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
                               dilation=dilation, groups=groups, bias=bias, causal=causal,
                               norm=norm, norm_kwargs=norm_kwargs)
        self.causal = causal
        self.pad_mode = pad_mode

        # Store configuration
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels

        # For non-streaming mode, calculate padding needed so the effective
        # receptive field lines up with the stride.
        self.padding_total = (kernel_size - 1) * dilation - (stride - 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass (non-streaming).

        Args:
            x: Input of shape [B, C, T].
        """
        B, C, T = x.shape  # also asserts a 3-D input
        kernel_size = self.kernel_size
        stride = self.stride
        dilation = self.dilation
        padding_total = self.padding_total

        # Compute extra padding for stride alignment
        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)

        if self.causal:
            # Left padding for causal; extra alignment padding goes on the right.
            if self.pad_mode == 'constant':
                x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode, value=0)
            else:
                x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Symmetric padding for non-causal (left gets the odd remainder).
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)

        output = self.conv(x)
        return output
228
+
229
+
230
class SConvTranspose1d(nn.Module):
    """ConvTranspose1d with built-in handling of asymmetric or causal padding and normalization.

    Args:
        trim_right_ratio: Fraction of the removed padding taken from the right
            side. Only meaningful for causal convolutions (must be 1.0 otherwise).
    """

    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, causal: bool = False,
                 norm: str = 'none', trim_right_ratio: float = 1.,
                 norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None, bias: bool = True):
        super().__init__()
        # Fix: use None instead of a shared mutable default dict.
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
                                          causal=causal, norm=norm, norm_kwargs=norm_kwargs, bias=bias)
        self.causal = causal
        self.trim_right_ratio = trim_right_ratio
        assert self.causal or self.trim_right_ratio == 1., \
            "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
        assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.

        # Store configuration
        self.kernel_size = kernel_size
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels

        # For transposed convolution, padding calculation is different
        self.padding_total = kernel_size - stride

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass (non-streaming): transposed conv, then trim padding."""
        padding_total = self.padding_total

        y = self.convtr(x)

        # Remove the padding from output
        if self.causal:
            # Trim mostly (or entirely) on the right side for causal models,
            # controlled by trim_right_ratio.
            padding_right = math.ceil(padding_total * self.trim_right_ratio)
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))
        else:
            # Symmetric unpadding for non-causal (left gets the odd remainder).
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            y = unpad1d(y, (padding_left, padding_right))

        return y
276
+
277
+
278
# Public API of this module.
__all__ = [
    "ConvLayerNorm",
    "RMSNorm",
    "ConvRMSNorm",
    "NormConv1d",
    "NormConvTranspose1d",
    "SConv1d",
    "SConvTranspose1d",
    "pad1d",
    "unpad1d",
    "get_extra_padding_for_conv1d",
]
kugelaudio_open/models/diffusion_head.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from transformers.models.auto import AutoModel
9
+ from transformers.modeling_utils import PreTrainedModel
10
+ # from transformers.modeling_layers import GradientCheckpointingLayer
11
+ from transformers.activations import ACT2FN
12
+ from transformers.utils import logging
13
+
14
+ from ..configs import KugelAudioDiffusionHeadConfig
15
+
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction)."""

    def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine=True, memory_efficient=False):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        # Learnable per-channel gain when affine; otherwise expose `weight` as None.
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(dim))
        else:
            self.register_parameter('weight', None)

    def _norm(self, x):
        # Multiply by the reciprocal RMS over the last dimension.
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * inv_rms

    def forward(self, x):
        # Normalize in float32 for numerical stability, then restore input dtype.
        normed = self._norm(x.float()).type_as(x)
        return normed if self.weight is None else normed * self.weight

    def extra_repr(self) -> str:
        return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}'
42
+
43
def modulate(x, shift, scale):
    """Apply adaLN-style modulation: scale x around identity, then shift."""
    return shift + x * (scale + 1)
46
+
47
+
48
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    Args:
        hidden_size (`int`): Size of the output embedding
        frequency_embedding_size (`int`, optional): Size of the intermediate frequency embedding
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        # Two-layer MLP (SiLU in between) that maps sinusoidal frequency
        # features to the model's hidden size. No biases, matching the rest
        # of the diffusion head.
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=False),
            ACT2FN['silu'],
            nn.Linear(hidden_size, hidden_size, bias=False),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.

        Args:
            t (`torch.Tensor`): A 1-D Tensor of N indices, one per batch element.
                These may be fractional.
            dim (`int`): The dimension of the output.
            max_period (`int`, optional): Controls the minimum frequency of the embeddings.

        Returns:
            `torch.Tensor`: An [N, D] Tensor of positional embeddings.
        """
        half = dim // 2
        # Create freqs directly on the target device to avoid transfers during CUDA graph capture
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device) / half
        )
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        # For odd dims, pad with one zero column so the output is exactly `dim` wide.
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        # NOTE(review): casting to t.dtype assumes callers pass floating-point
        # timesteps; integer timesteps would truncate the sinusoids — confirm
        # at call sites (the CFG sampler casts t to the latent dtype first).
        return embedding.to(t.dtype)

    def forward(self, t):
        # Sinusoidal featurization followed by the learned projection.
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
        t_emb = self.mlp(t_freq)
        return t_emb
95
+
96
+
97
class FeedForwardNetwork(nn.Module):
    """
    Feed-forward block with SwiGLU activation: down(SiLU(gate(x)) * up(x)).

    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
    """
    def __init__(
        self,
        embed_dim,
        ffn_dim,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.gate_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False)
        self.up_proj = nn.Linear(self.embed_dim, ffn_dim, bias=False)
        self.down_proj = nn.Linear(ffn_dim, self.embed_dim, bias=False)
        self.act_fn = ACT2FN['silu']  # SiLU nonlinearity for the gated branch

    def forward(self, x):
        # SwiGLU: only the gate branch is passed through the activation,
        # the "up" branch stays linear before the elementwise product.
        activated_gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(activated_gate * self.up_proj(x))
125
+
126
+
127
class HeadLayer(nn.Module):
    """
    Single adaLN-modulated residual FFN block of the diffusion head.

    Args:
        embed_dim (`int`): Input dimension
        ffn_dim (`int`): Hidden dimension
        cond_dim (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    """
    def __init__(
        self,
        embed_dim,
        ffn_dim,
        cond_dim,
        norm_eps=1e-5,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.cond_dim = cond_dim
        self.ffn_dim = ffn_dim
        self.ffn = FeedForwardNetwork(
            self.embed_dim,
            self.ffn_dim,
        )
        self.norm = RMSNorm(self.embed_dim, eps=norm_eps)
        # Conditioning network emitting shift / scale / gate for the FFN branch.
        self.adaLN_modulation = nn.Sequential(
            ACT2FN['silu'],
            nn.Linear(cond_dim, 3 * self.embed_dim, bias=False)
        )

    def forward(self, x, c):
        # Split the condition projection into the three modulation tensors.
        shift, scale, gate = self.adaLN_modulation(c).chunk(3, dim=-1)
        # Pre-norm residual FFN, modulated and gated by the condition.
        return x + gate * self.ffn(modulate(self.norm(x), shift, scale))
163
+
164
+
165
class FinalLayer(nn.Module):
    """
    Final adaLN-conditioned projection of the diffusion head.

    Args:
        hidden_size (`int`): Input dimension
        output_size (`int`): Output dimension
        cond_size (`int`): Condition embedding dimension
        norm_eps (`float`, optional): Epsilon for normalization
    """
    def __init__(self, hidden_size, output_size, cond_size, norm_eps=1e-5):
        super().__init__()
        # Non-affine norm: scale/shift come entirely from the condition below.
        self.norm_final = RMSNorm(hidden_size, eps=norm_eps, elementwise_affine=False)
        self.linear = nn.Linear(hidden_size, output_size, bias=False)
        self.adaLN_modulation = nn.Sequential(
            ACT2FN['silu'],
            nn.Linear(cond_size, 2 * hidden_size, bias=False)
        )

    def forward(self, x, c):
        # Condition-driven shift/scale, then project to the output size.
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        return self.linear(modulate(self.norm_final(x), shift, scale))
190
+
191
+
192
class KugelAudioDiffusionHead(PreTrainedModel):
    """
    Diffusion head model for kugelaudio.

    Predicts noise/velocity for acoustic latents, conditioned on LM hidden
    states plus a timestep embedding, via a stack of adaLN-modulated FFN
    blocks and a final projection back to the latent size.

    Args:
        config (`KugelAudioDiffusionHeadConfig`): Model configuration
    """
    config_class = KugelAudioDiffusionHeadConfig
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def __init__(
        self,
        config,
    ):
        super().__init__(config)
        self.config = config
        # Condition embedding dimension; reuses the LM hidden size.
        self.cond_dim = config.hidden_size
        latent_size = config.latent_size

        # Project noisy latents and the conditioning vector into model space.
        self.noisy_images_proj = nn.Linear(latent_size, config.hidden_size, bias=False)
        self.cond_proj = nn.Linear(config.hidden_size, self.cond_dim, bias=False)
        self.t_embedder = TimestepEmbedder(self.cond_dim)

        ffn_dim = int(config.hidden_size * config.head_ffn_ratio)

        # Create the intermediate layers
        self.layers = nn.ModuleList([
            HeadLayer(
                embed_dim=config.hidden_size,
                ffn_dim=ffn_dim,
                cond_dim=self.cond_dim,
                norm_eps=config.rms_norm_eps
            )
            for _ in range(config.head_layers)
        ])

        # Final layer for output (back to the latent dimensionality)
        self.final_layer = FinalLayer(
            hidden_size=config.hidden_size,
            output_size=latent_size,
            cond_size=self.cond_dim,
            norm_eps=config.rms_norm_eps
        )

        self.initialize_weights()

    def initialize_weights(self):
        """Initialize the weights of the model."""
        # Initialize timestep embedder MLP with small normal weights.
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers so every block starts as identity
        # (shift = scale = gate = 0 -> residual passthrough).
        for layer in self.layers:
            nn.init.constant_(layer.adaLN_modulation[-1].weight, 0)

        # Zero-out output layers: the head initially predicts zeros.
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)

    def forward(
        self,
        noisy_images,
        timesteps,
        condition,
    ):
        """
        Forward pass of the prediction head.

        Args:
            noisy_images (`torch.Tensor`): Noisy images/latents to denoise
            timesteps (`torch.Tensor`): Timesteps for diffusion
            condition (`torch.Tensor`): Conditioning information

        Returns:
            `torch.Tensor`: The predicted noise/velocity
        """
        x = self.noisy_images_proj(noisy_images)
        t = self.t_embedder(timesteps)
        condition = self.cond_proj(condition)
        # Combined conditioning: projected LM state + timestep embedding.
        c = condition + t

        for layer in self.layers:
            x = layer(x, c)

        x = self.final_layer(x, c)
        return x
282
+
283
+
284
# Make the head loadable through AutoModel.from_pretrained with its config class.
AutoModel.register(KugelAudioDiffusionHeadConfig, KugelAudioDiffusionHead)

# Public API of this module.
__all__ = [
    "KugelAudioDiffusionHead",
]
kugelaudio_open/models/kugelaudio_inference.py ADDED
@@ -0,0 +1,800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KugelAudio inference model for speech generation.
2
+
3
+ This is the open-source inference implementation without optimizations.
4
+ Based on the original VibeVoice model architecture.
5
+ """
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from tqdm import tqdm
13
+ from transformers import modeling_utils
14
+ from transformers.cache_utils import DynamicCache
15
+ from transformers.generation import (
16
+ GenerationConfig,
17
+ GenerationMixin,
18
+ LogitsProcessor,
19
+ LogitsProcessorList,
20
+ StoppingCriteriaList,
21
+ )
22
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
23
+ from transformers.modeling_utils import PreTrainedModel
24
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
25
+ from transformers.utils import logging
26
+
27
+ from ..configs import KugelAudioConfig
28
+ from ..schedule.dpm_solver import DPMSolverMultistepScheduler
29
+ from .diffusion_head import KugelAudioDiffusionHead
30
+ from .kugelaudio_model import KugelAudioModel, KugelAudioPreTrainedModel
31
+ from .tokenizer import (
32
+ KugelAudioTokenizerEncoderOutput,
33
+ KugelAudioTokenizerStreamingCache,
34
+ )
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
# Compatibility shim: some transformers versions lack (or None-out)
# ALL_PARALLEL_STYLES; provide a fallback so the `_tp_plan` declared below
# does not fail validation on import — presumably only needed on older
# releases, confirm against the pinned transformers version.
if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
    modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"]
40
+
41
+
42
+ def _get_cache_tensors(cache) -> Tuple[List, List]:
43
+ """Get key and value cache tensors from a cache object."""
44
+ if hasattr(cache, "key_cache") and hasattr(cache, "value_cache"):
45
+ return cache.key_cache, cache.value_cache
46
+ raise AttributeError(f"Cannot get cache tensors from {type(cache).__name__}")
47
+
48
+
49
@dataclass
class KugelAudioCausalLMOutputWithPast(BaseModelOutputWithPast):
    """Model output carrying LM logits alongside the base hidden-state/cache fields."""

    # Logits over the decoder vocabulary (may be sliced via `logits_to_keep`).
    logits: Optional[torch.FloatTensor] = None
52
+
53
+
54
@dataclass
class KugelAudioGenerationOutput(ModelOutput):
    """Output type for KugelAudio generation."""

    # Generated token id sequences.
    sequences: torch.LongTensor = None
    # Per-sample generated speech tensors; populated by `generate` —
    # NOTE(review): exact element shape depends on the decoder, confirm there.
    speech_outputs: Optional[List[torch.FloatTensor]] = None
60
+
61
+
62
class KugelAudioTokenConstraintProcessor(LogitsProcessor):
    """Masks logits so only a fixed whitelist of token ids can be generated."""

    def __init__(self, valid_token_ids: List[int], device: torch.device = None):
        # Keep the whitelist on-device so __call__ needs no host transfer.
        self.valid_token_ids = torch.tensor(valid_token_ids, dtype=torch.long, device=device)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Additive mask: -inf everywhere except the whitelisted columns (0 there).
        penalty = torch.full_like(scores, float("-inf"))
        penalty[:, self.valid_token_ids] = 0
        return scores + penalty
73
+
74
+
75
+ class KugelAudioForConditionalGenerationInference(KugelAudioPreTrainedModel, GenerationMixin):
76
+ """KugelAudio model for inference with speech generation capabilities."""
77
+
78
+ _tied_weights_keys = ["lm_head.weight"]
79
+ _tp_plan = {"lm_head": "colwise_rep"}
80
+
81
    def __init__(self, config):
        super().__init__(config)
        # Backbone: LM decoder plus tokenizers/connectors/diffusion components.
        self.model = KugelAudioModel(config)
        # Text-token output head over the decoder vocabulary (weights tied,
        # see _tied_weights_keys on the class).
        self.lm_head = nn.Linear(
            config.decoder_config.hidden_size,
            config.decoder_config.vocab_size,
            bias=False,
        )
        # Denoising step count used at inference; overridable later via
        # `set_ddpm_inference_steps`.
        self.ddpm_inference_steps = config.diffusion_head_config.ddpm_num_inference_steps
        self.post_init()
91
+
92
    @property
    def noise_scheduler(self):
        """Delegate to the inner model's diffusion noise scheduler."""
        return self.model.noise_scheduler
95
+
96
    @property
    def prediction_head(self):
        """Delegate to the inner model's diffusion prediction head."""
        return self.model.prediction_head
99
+
100
    @property
    def speech_scaling_factor(self):
        """Delegate to the inner model's acoustic-latent scaling factor."""
        return self.model.speech_scaling_factor
103
+
104
    @property
    def speech_bias_factor(self):
        """Delegate to the inner model's acoustic-latent bias factor."""
        return self.model.speech_bias_factor
107
+
108
    @property
    def acoustic_tokenizer(self):
        """Delegate to the inner model's acoustic tokenizer."""
        return self.model.acoustic_tokenizer
111
+
112
    @property
    def semantic_tokenizer(self):
        """Delegate to the inner model's semantic tokenizer."""
        return self.model.semantic_tokenizer
115
+
116
    @property
    def acoustic_connector(self):
        """Delegate to the inner model's acoustic connector (latent -> LM space)."""
        return self.model.acoustic_connector
119
+
120
    @property
    def semantic_connector(self):
        """Delegate to the inner model's semantic connector (latent -> LM space)."""
        return self.model.semantic_connector
123
+
124
    def get_input_embeddings(self):
        """Return the token embedding module of the inner model."""
        return self.model.get_input_embeddings()
126
+
127
    def set_input_embeddings(self, value):
        """Replace the token embedding module of the inner model."""
        self.model.set_input_embeddings(value)
129
+
130
    def get_output_embeddings(self):
        """Return the LM output head (used by weight tying)."""
        return self.lm_head
132
+
133
    def set_output_embeddings(self, new_embeddings):
        """Replace the LM output head (used by weight tying / resizing)."""
        self.lm_head = new_embeddings
135
+
136
+ def set_ddpm_inference_steps(self, num_steps=None):
137
+ self.ddpm_inference_steps = (
138
+ num_steps or self.config.diffusion_head_config.ddpm_num_inference_steps
139
+ )
140
+
141
    def _process_speech_inputs(
        self,
        speech_tensors: Optional[torch.Tensor],
        speech_masks: Optional[torch.Tensor],
        voice_cache: Optional[dict] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Process speech inputs through acoustic and semantic encoders.

        Three mutually exclusive paths: pre-encoded `voice_cache`, raw
        `speech_tensors`, or a dummy fallback when neither is given.

        Returns:
            Tuple of (acoustic_features, speech_embeds) where speech_embeds has shape
            [num_valid_frames, hidden] - already indexed by speech_masks for direct
            assignment to inputs_embeds[speech_input_mask].
        """
        device = next(self.parameters()).device
        dtype = next(self.parameters()).dtype

        if voice_cache is not None:
            # Use pre-encoded voice features
            acoustic_mean = voice_cache["acoustic_mean"].to(device=device, dtype=dtype)
            semantic_mean = voice_cache["semantic_mean"].to(device=device, dtype=dtype)

            # Sample from acoustic distribution (reparameterization with a
            # fixed std, falling back to the tokenizer's default).
            fix_std = voice_cache.get("acoustic_std", self.acoustic_tokenizer.fix_std)
            acoustic_features = acoustic_mean + fix_std * torch.randn_like(acoustic_mean)
            semantic_features = semantic_mean

            # Create speech_masks from cache dimensions (all frames valid)
            batch_size = acoustic_features.shape[0]
            seq_len = acoustic_features.shape[1]
            speech_masks = torch.ones(batch_size, seq_len, dtype=torch.bool, device=device)

        elif speech_tensors is not None:
            # Encode speech through tokenizers
            with torch.no_grad():
                # Acoustic encoding; tokenizers expect a channel dim: [B, 1, T]
                if speech_tensors.dim() == 2:
                    speech_tensors = speech_tensors.unsqueeze(1)

                acoustic_output = self.acoustic_tokenizer.encode(speech_tensors)
                acoustic_features, _ = self.acoustic_tokenizer.sampling(acoustic_output)

                # Semantic encoding (deterministic: uses the mean directly)
                semantic_output = self.semantic_tokenizer.encode(speech_tensors)
                semantic_features = semantic_output.mean

            # Create speech_masks if not provided (all frames valid)
            if speech_masks is None:
                batch_size = acoustic_features.shape[0]
                seq_len = acoustic_features.shape[1]
                speech_masks = torch.ones(batch_size, seq_len, dtype=torch.bool, device=device)
        else:
            # Return dummy features (single zero frame) so downstream code
            # always has tensors of the expected rank.
            vae_dim = self.config.acoustic_vae_dim
            acoustic_features = torch.zeros(1, 1, vae_dim, device=device, dtype=dtype)
            semantic_features = torch.zeros(
                1, 1, self.config.semantic_vae_dim, device=device, dtype=dtype
            )
            speech_masks = torch.ones(1, 1, dtype=torch.bool, device=device)

        # Ensure acoustic and semantic have matching time dimensions
        # (the two tokenizers may emit slightly different frame counts).
        acoustic_len = acoustic_features.shape[1]
        semantic_len = semantic_features.shape[1]
        if semantic_len < acoustic_len:
            pad_size = acoustic_len - semantic_len
            semantic_features = torch.nn.functional.pad(
                semantic_features, (0, 0, 0, pad_size), mode="constant", value=0
            )
        elif semantic_len > acoustic_len:
            semantic_features = semantic_features[:, :acoustic_len, :]

        # Apply scaling to acoustic features; NaN scaling factor is treated as
        # "not initialized" — NOTE(review): confirm this sentinel convention
        # against KugelAudioModel's training-time setup.
        if not torch.isnan(self.speech_scaling_factor):
            acoustic_features = (
                acoustic_features + self.speech_bias_factor
            ) * self.speech_scaling_factor

        # Get embeddings through connectors (project latents into LM space)
        acoustic_embed = self.acoustic_connector(acoustic_features)
        semantic_embed = self.semantic_connector(semantic_features)

        # Combine embeddings and index by speech_masks
        combined_embed = acoustic_embed + semantic_embed

        # Move speech_masks to CPU for indexing (matches working implementation)
        speech_embeds = combined_embed[speech_masks.cpu()]

        return acoustic_features, speech_embeds
228
+
229
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        speech_tensors: Optional[torch.FloatTensor] = None,
        speech_masks: Optional[torch.BoolTensor] = None,
        speech_input_mask: Optional[torch.BoolTensor] = None,
        voice_cache: Optional[dict] = None,
        logits_to_keep: Union[int, slice] = 0,
        **kwargs,
    ) -> Union[Tuple, KugelAudioCausalLMOutputWithPast]:
        """Forward pass for the model.

        Embeds `input_ids`, optionally splices voice embeddings (from
        `speech_tensors` or `voice_cache`) into the positions flagged by
        `speech_input_mask`, runs the decoder, and projects the last hidden
        states through the LM head. `logits_to_keep` limits how many trailing
        positions get logits (0 keeps all, via `slice(0, None)` semantics).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            inputs_embeds = self.model.get_input_embeddings()(input_ids)

        # Process speech inputs if provided
        if voice_cache is not None or (speech_tensors is not None and speech_masks is not None):
            _, speech_embeds = self._process_speech_inputs(
                speech_tensors.to(self.dtype) if speech_tensors is not None else None,
                speech_masks,
                voice_cache=voice_cache,
            )
            if speech_input_mask is not None:
                # Scatter voice embeddings into the text sequence at the
                # flagged slots; counts must match by construction upstream.
                inputs_embeds[speech_input_mask] = speech_embeds

        outputs = self.model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0] if not return_dict else outputs.last_hidden_state
        # An int N means "logits for the last N positions only".
        slice_indices = (
            slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        return KugelAudioCausalLMOutputWithPast(
            logits=logits,
            past_key_values=outputs.past_key_values,
            last_hidden_state=hidden_states,
            attentions=outputs.attentions,
        )
290
+
291
    @torch.no_grad()
    def sample_speech_tokens(
        self, condition: torch.Tensor, neg_condition: torch.Tensor, cfg_scale: float = 3.0
    ) -> torch.Tensor:
        """Sample speech latents using diffusion with classifier-free guidance.

        Args:
            condition: Positive (conditioned) hidden states; batch on dim 0.
            neg_condition: Negative/unconditional hidden states, same shape.
            cfg_scale: Guidance strength; 1.0 disables CFG entirely.

        Returns:
            Denoised acoustic latents of shape [batch, acoustic_vae_dim].
        """
        self.model.noise_scheduler.set_timesteps(self.ddpm_inference_steps)

        if cfg_scale == 1.0:
            # No CFG - single forward pass
            speech = torch.randn(condition.shape[0], self.config.acoustic_vae_dim).to(condition)
            for t in self.model.noise_scheduler.timesteps:
                eps = self.model.prediction_head(
                    speech, t.repeat(speech.shape[0]).to(speech), condition=condition
                )
                speech = self.model.noise_scheduler.step(eps, t, speech).prev_sample
            return speech

        # With CFG - batched forward pass: first half conditioned,
        # second half unconditioned, sharing identical latents.
        combined_condition = torch.cat([condition, neg_condition], dim=0).to(
            self.model.prediction_head.device
        )
        speech = torch.randn(combined_condition.shape[0], self.config.acoustic_vae_dim).to(
            combined_condition
        )

        for t in self.model.noise_scheduler.timesteps:
            # Duplicate the (shared) latent so both halves denoise the same state.
            half = speech[: len(speech) // 2]
            combined = torch.cat([half, half], dim=0)
            eps = self.model.prediction_head(
                combined, t.repeat(combined.shape[0]).to(combined), condition=combined_condition
            )
            cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
            # Classifier-free guidance: extrapolate from uncond toward cond.
            half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
            eps = torch.cat([half_eps, half_eps], dim=0)
            speech = self.model.noise_scheduler.step(eps, t, speech).prev_sample

        # Return only the conditioned half.
        return speech[: len(speech) // 2]
328
+
329
    @torch.no_grad()
    def encode_voice_prompt(
        self,
        voice_audio: torch.Tensor,
        sample_rate: int = 24000,
    ) -> dict:
        """Pre-encode a voice prompt for caching.

        Args:
            voice_audio: Waveform as [T], [B, T] or [B, 1, T]; normalized to
                [B, 1, T] below. No resampling is performed here —
                `sample_rate` is only recorded in the returned dict.

        Returns:
            dict consumable via `voice_cache=` in forward/_process_speech_inputs:
            CPU tensors "acoustic_mean" / "semantic_mean", plus "acoustic_std",
            "audio_length" and "sample_rate".
        """
        device = next(self.parameters()).device
        dtype = next(self.parameters()).dtype

        # Normalize to [B, 1, T] for the tokenizers.
        if voice_audio.dim() == 1:
            voice_audio = voice_audio.unsqueeze(0).unsqueeze(0)
        elif voice_audio.dim() == 2:
            voice_audio = voice_audio.unsqueeze(1)

        voice_audio = voice_audio.to(device=device, dtype=dtype)

        with torch.no_grad():
            acoustic_output = self.model.acoustic_tokenizer.encode(voice_audio)
            semantic_output = self.model.semantic_tokenizer.encode(voice_audio)

        return {
            "acoustic_mean": acoustic_output.mean.cpu(),
            # Fall back to the tokenizer's fixed std when the encoder output
            # carries no std of its own.
            "acoustic_std": getattr(acoustic_output, "std", self.model.acoustic_tokenizer.fix_std),
            "semantic_mean": semantic_output.mean.cpu(),
            "audio_length": voice_audio.shape[-1],
            "sample_rate": sample_rate,
        }
357
+
358
+ @torch.no_grad()
359
+ def generate(
360
+ self,
361
+ text_ids: Optional[torch.Tensor] = None,
362
+ input_ids: Optional[torch.Tensor] = None,
363
+ voice_prompt: Optional[torch.Tensor] = None,
364
+ voice_cache: Optional[dict] = None,
365
+ speech_tensors: Optional[torch.Tensor] = None,
366
+ speech_masks: Optional[torch.Tensor] = None,
367
+ speech_input_mask: Optional[torch.Tensor] = None,
368
+ cfg_scale: float = 3.0,
369
+ max_new_tokens: int = 2048,
370
+ do_sample: bool = False,
371
+ temperature: float = 1.0,
372
+ show_progress: bool = True,
373
+ **kwargs,
374
+ ) -> KugelAudioGenerationOutput:
375
+ """Generate speech from text.
376
+
377
+ Args:
378
+ text_ids: Tokenized text input (from processor)
379
+ input_ids: Alternative name for text_ids
380
+ voice_prompt: Voice audio tensor for cloning (legacy, use speech_tensors instead)
381
+ voice_cache: Pre-encoded voice features (from encode_voice_prompt)
382
+ speech_tensors: Voice audio tensor from processor for cloning
383
+ speech_masks: Mask indicating valid voice frames
384
+ speech_input_mask: Boolean mask indicating where to insert voice embeddings
385
+ cfg_scale: Classifier-free guidance scale (higher = more faithful to text)
386
+ max_new_tokens: Maximum tokens to generate
387
+ do_sample: Whether to sample or use greedy decoding
388
+ temperature: Sampling temperature
389
+ show_progress: Whether to show progress bar
390
+
391
+ Returns:
392
+ KugelAudioGenerationOutput with sequences and speech_outputs
393
+ """
394
+ device = next(self.parameters()).device
395
+ dtype = next(self.parameters()).dtype
396
+
397
+ # Handle input_ids vs text_ids
398
+ if text_ids is None and input_ids is not None:
399
+ text_ids = input_ids
400
+ if text_ids is None:
401
+ raise ValueError("text_ids or input_ids is required")
402
+
403
+ text_ids = text_ids.to(device)
404
+ batch_size = text_ids.shape[0]
405
+
406
+ # Handle legacy voice_prompt parameter
407
+ if voice_prompt is not None and speech_tensors is None:
408
+ speech_tensors = voice_prompt
409
+ # Create default speech_masks if not provided
410
+ if speech_masks is None:
411
+ # Estimate number of frames from audio length
412
+ audio_len = voice_prompt.shape[-1]
413
+ num_frames = (audio_len + 3199) // 3200 # compression ratio
414
+ speech_masks = torch.ones(batch_size, num_frames, dtype=torch.bool, device=device)
415
+
416
+ # Get special token IDs
417
+ speech_start_id = getattr(self.config, "speech_start_id", None) or 151652
418
+ speech_end_id = getattr(self.config, "speech_end_id", None) or 151653
419
+ speech_diffusion_id = getattr(self.config, "speech_diffusion_id", None) or 151654
420
+ eos_token_id = getattr(self.config.decoder_config, "eos_token_id", None) or 151643
421
+
422
+ # Initialize streaming caches for tokenizers
423
+ acoustic_cache = KugelAudioTokenizerStreamingCache()
424
+ semantic_cache = KugelAudioTokenizerStreamingCache()
425
+
426
+ # Initialize sequences and attention masks
427
+ current_ids = text_ids
428
+ attention_mask = torch.ones_like(current_ids)
429
+
430
+ # For CFG, create negative prompt (just speech_start token)
431
+ negative_ids = torch.full((batch_size, 1), speech_start_id, dtype=torch.long, device=device)
432
+ negative_attention_mask = torch.ones_like(negative_ids)
433
+
434
+ # Storage for generated audio and tracking
435
+ audio_chunks = [[] for _ in range(batch_size)]
436
+ finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
437
+ correct_cnt = torch.zeros(batch_size, dtype=torch.long, device=device)
438
+
439
+ # Get initial embeddings
440
+ inputs_embeds = self.model.get_input_embeddings()(current_ids)
441
+
442
+ # Process voice/speech input if provided
443
+ if speech_tensors is not None or voice_cache is not None:
444
+ # Get speech embeddings
445
+ if voice_cache is not None:
446
+ _, speech_embeds = self._process_speech_inputs(
447
+ speech_tensors=None,
448
+ speech_masks=None,
449
+ voice_cache=voice_cache,
450
+ )
451
+ else:
452
+ # Encode speech_tensors directly
453
+ speech_tensors = speech_tensors.to(device=device, dtype=dtype)
454
+ if speech_masks is not None:
455
+ speech_masks = speech_masks.to(device)
456
+ _, speech_embeds = self._process_speech_inputs(
457
+ speech_tensors=speech_tensors,
458
+ speech_masks=speech_masks,
459
+ voice_cache=None,
460
+ )
461
+
462
+ # Insert speech embeddings at positions marked by speech_input_mask
463
+ # speech_embeds is already flattened to [num_valid_frames, hidden] by _process_speech_inputs
464
+ if speech_input_mask is not None:
465
+ speech_input_mask = speech_input_mask.to(device)
466
+ # Directly assign - shapes should match
467
+ inputs_embeds[speech_input_mask] = speech_embeds
468
+
469
+ negative_inputs_embeds = self.model.get_input_embeddings()(negative_ids)
470
+
471
+ # Setup logits processor to constrain to valid tokens
472
+ valid_tokens = [speech_start_id, speech_end_id, speech_diffusion_id, eos_token_id]
473
+ token_constraint = KugelAudioTokenConstraintProcessor(valid_tokens, device=device)
474
+
475
+ # Initialize KV caches
476
+ past_key_values = None
477
+ negative_past_key_values = None
478
+
479
+ # Progress bar
480
+ progress_iter = (
481
+ tqdm(range(max_new_tokens), desc="Generating", leave=False)
482
+ if show_progress
483
+ else range(max_new_tokens)
484
+ )
485
+
486
+ for step in progress_iter:
487
+ if finished.all():
488
+ break
489
+
490
+ # Forward pass for positive (main) model
491
+ if past_key_values is None:
492
+ outputs = self(
493
+ inputs_embeds=inputs_embeds,
494
+ attention_mask=attention_mask,
495
+ use_cache=True,
496
+ return_dict=True,
497
+ )
498
+ else:
499
+ outputs = self(
500
+ inputs_embeds=inputs_embeds[:, -1:],
501
+ attention_mask=attention_mask,
502
+ past_key_values=past_key_values,
503
+ use_cache=True,
504
+ return_dict=True,
505
+ )
506
+
507
+ past_key_values = outputs.past_key_values
508
+ logits = outputs.logits[:, -1, :]
509
+
510
+ # Apply token constraint
511
+ logits = token_constraint(current_ids, logits)
512
+
513
+ # Sample or greedy decode
514
+ if do_sample and temperature > 0:
515
+ probs = torch.softmax(logits / temperature, dim=-1)
516
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
517
+ else:
518
+ next_tokens = torch.argmax(logits, dim=-1)
519
+
520
+ # Force finished samples to output EOS
521
+ next_tokens = torch.where(
522
+ finished, torch.tensor(eos_token_id, device=device), next_tokens
523
+ )
524
+
525
+ # Update sequences
526
+ current_ids = torch.cat([current_ids, next_tokens.unsqueeze(-1)], dim=-1)
527
+ attention_mask = torch.cat(
528
+ [
529
+ attention_mask,
530
+ torch.ones((batch_size, 1), device=device, dtype=attention_mask.dtype),
531
+ ],
532
+ dim=-1,
533
+ )
534
+
535
+ # Check for EOS tokens
536
+ eos_mask = (next_tokens == eos_token_id) & ~finished
537
+ if eos_mask.any():
538
+ finished = finished | eos_mask
539
+
540
+ # Check for speech_end tokens - mark as finished and clear caches
541
+ speech_end_mask = (next_tokens == speech_end_id) & ~finished
542
+ if speech_end_mask.any():
543
+ finished = finished | speech_end_mask
544
+ speech_end_indices = speech_end_mask.nonzero(as_tuple=False).squeeze(-1)
545
+ acoustic_cache.set_to_zero(speech_end_indices)
546
+ semantic_cache.set_to_zero(speech_end_indices)
547
+
548
+ # Handle speech_start tokens - refresh negative model KV cache
549
+ speech_start_mask = (next_tokens == speech_start_id) & ~finished
550
+ if (
551
+ speech_start_mask.any()
552
+ and cfg_scale != 1.0
553
+ and negative_past_key_values is not None
554
+ ):
555
+ speech_start_indices = speech_start_mask.nonzero(as_tuple=False).squeeze(-1)
556
+ if speech_start_indices.dim() == 0:
557
+ speech_start_indices = speech_start_indices.unsqueeze(0)
558
+
559
+ for sample_idx in speech_start_indices.tolist():
560
+ negative_attention_mask[sample_idx, :] = 0
561
+ negative_attention_mask[sample_idx, -1] = 1
562
+
563
+ key_caches, value_caches = _get_cache_tensors(negative_past_key_values)
564
+ for k_cache, v_cache in zip(key_caches, value_caches):
565
+ k_cache[sample_idx, :, -1, :] = k_cache[sample_idx, :, 0, :].clone()
566
+ v_cache[sample_idx, :, -1, :] = v_cache[sample_idx, :, 0, :].clone()
567
+
568
+ negative_ids[sample_idx, -1] = speech_start_id
569
+
570
+ # Prepare next input embeddings
571
+ next_inputs_embeds = self.model.get_input_embeddings()(next_tokens).unsqueeze(1)
572
+
573
+ # Handle diffusion tokens - generate speech
574
+ diffusion_mask = (next_tokens == speech_diffusion_id) & ~finished
575
+ if diffusion_mask.any():
576
+ diffusion_indices = diffusion_mask.nonzero(as_tuple=False).squeeze(-1)
577
+ if diffusion_indices.dim() == 0:
578
+ diffusion_indices = diffusion_indices.unsqueeze(0)
579
+
580
+ # Run negative forward pass for CFG
581
+ if cfg_scale != 1.0:
582
+ if negative_past_key_values is None:
583
+ neg_outputs = self(
584
+ inputs_embeds=negative_inputs_embeds,
585
+ attention_mask=negative_attention_mask,
586
+ use_cache=True,
587
+ return_dict=True,
588
+ )
589
+ else:
590
+ neg_outputs = self(
591
+ inputs_embeds=negative_inputs_embeds[:, -1:],
592
+ attention_mask=negative_attention_mask,
593
+ past_key_values=negative_past_key_values,
594
+ use_cache=True,
595
+ return_dict=True,
596
+ )
597
+ negative_past_key_values = neg_outputs.past_key_values
598
+
599
+ # Handle non-diffusion samples KV cache correction
600
+ non_diffusion_mask = ~diffusion_mask & ~finished
601
+ if non_diffusion_mask.any():
602
+ non_diffusion_indices = non_diffusion_mask.nonzero(as_tuple=False).squeeze(
603
+ -1
604
+ )
605
+ if non_diffusion_indices.dim() == 0:
606
+ non_diffusion_indices = non_diffusion_indices.unsqueeze(0)
607
+
608
+ key_caches, value_caches = _get_cache_tensors(negative_past_key_values)
609
+ for sample_idx in non_diffusion_indices.tolist():
610
+ start_idx = correct_cnt[sample_idx].item()
611
+ seq_len = negative_attention_mask.shape[1]
612
+
613
+ if start_idx + 1 < seq_len - 1:
614
+ negative_attention_mask[sample_idx, start_idx + 1 :] = (
615
+ negative_attention_mask[sample_idx, start_idx:-1].clone()
616
+ )
617
+ negative_attention_mask[sample_idx, start_idx] = 0
618
+
619
+ for k_cache, v_cache in zip(key_caches, value_caches):
620
+ if start_idx + 1 < k_cache.shape[2] - 1:
621
+ k_cache[sample_idx, :, start_idx + 1 :, :] = k_cache[
622
+ sample_idx, :, start_idx:-1, :
623
+ ].clone()
624
+ v_cache[sample_idx, :, start_idx + 1 :, :] = v_cache[
625
+ sample_idx, :, start_idx:-1, :
626
+ ].clone()
627
+
628
+ if start_idx + 1 < negative_ids.shape[1] - 1:
629
+ negative_ids[sample_idx, start_idx + 1 :] = negative_ids[
630
+ sample_idx, start_idx:-1
631
+ ].clone()
632
+
633
+ correct_cnt[non_diffusion_indices] += 1
634
+
635
+ neg_condition = neg_outputs.last_hidden_state[diffusion_indices, -1, :]
636
+ else:
637
+ neg_condition = torch.zeros(
638
+ diffusion_indices.shape[0],
639
+ self.config.decoder_config.hidden_size,
640
+ device=device,
641
+ dtype=dtype,
642
+ )
643
+
644
+ # Get conditioning from last hidden state
645
+ condition = outputs.last_hidden_state[diffusion_indices, -1, :]
646
+
647
+ # Sample speech latents using diffusion
648
+ speech_latents = self.sample_speech_tokens(condition, neg_condition, cfg_scale)
649
+
650
+ # Unscale latents
651
+ scaled_latent = (
652
+ speech_latents / self.speech_scaling_factor - self.speech_bias_factor
653
+ )
654
+
655
+ # Decode through acoustic tokenizer with streaming cache
656
+ audio = self.acoustic_tokenizer.decode(
657
+ scaled_latent.unsqueeze(1).permute(0, 2, 1),
658
+ cache=acoustic_cache,
659
+ sample_indices=diffusion_indices,
660
+ use_cache=True,
661
+ )
662
+
663
+ # Store audio chunks
664
+ for i, idx in enumerate(diffusion_indices.tolist()):
665
+ if not finished[idx]:
666
+ audio_chunks[idx].append(audio[i].cpu())
667
+
668
+ # Encode audio to semantic features with streaming cache
669
+ semantic_output = self.semantic_tokenizer.encode(
670
+ audio,
671
+ cache=semantic_cache,
672
+ sample_indices=diffusion_indices,
673
+ use_cache=True,
674
+ )
675
+ semantic_features = semantic_output.mean
676
+
677
+ # Compute embeddings for next step
678
+ acoustic_embed = self.acoustic_connector(speech_latents.unsqueeze(1))
679
+ semantic_embed = self.semantic_connector(semantic_features)
680
+ diffusion_embeds = (acoustic_embed + semantic_embed).squeeze(1)
681
+
682
+ # Update embeddings for diffusion samples
683
+ next_inputs_embeds[diffusion_indices] = diffusion_embeds.unsqueeze(1)
684
+
685
+ # Update embeddings for next iteration
686
+ inputs_embeds = torch.cat([inputs_embeds, next_inputs_embeds], dim=1)
687
+
688
+ # Update negative model
689
+ negative_inputs_embeds = torch.cat([negative_inputs_embeds, next_inputs_embeds], dim=1)
690
+ negative_attention_mask = torch.cat(
691
+ [
692
+ negative_attention_mask,
693
+ torch.ones((batch_size, 1), device=device, dtype=negative_attention_mask.dtype),
694
+ ],
695
+ dim=-1,
696
+ )
697
+ negative_ids = torch.cat([negative_ids, next_tokens.unsqueeze(-1)], dim=-1)
698
+
699
+ # Concatenate audio chunks with normalization
700
+ speech_outputs = []
701
+ for chunks in audio_chunks:
702
+ if chunks:
703
+ concatenated = torch.cat(chunks, dim=-1).squeeze()
704
+ # Normalize audio to prevent clipping
705
+ max_val = concatenated.abs().max()
706
+ if max_val > 1.0:
707
+ concatenated = concatenated * (0.95 / max_val)
708
+ # Apply watermark to all generated audio
709
+ concatenated = self._apply_watermark(concatenated, sample_rate=24000)
710
+ speech_outputs.append(concatenated)
711
+ else:
712
+ speech_outputs.append(None)
713
+
714
+ return KugelAudioGenerationOutput(
715
+ sequences=current_ids,
716
+ speech_outputs=speech_outputs,
717
+ )
718
+
719
+ def _apply_watermark(self, audio: torch.Tensor, sample_rate: int = 24000) -> torch.Tensor:
720
+ """Apply imperceptible watermark to generated audio.
721
+
722
+ This watermark identifies audio as generated by KugelAudio and is designed
723
+ to be robust against various audio transformations while remaining inaudible.
724
+ """
725
+ try:
726
+ import torchaudio.functional as F
727
+ from audioseal import AudioSeal
728
+ except ImportError:
729
+ return audio # Graceful fallback if audioseal not available
730
+
731
+ device = audio.device
732
+ dtype = audio.dtype
733
+ original_shape = audio.shape
734
+
735
+ # Prepare audio for watermarking (AudioSeal expects [batch, channels, samples] at 16kHz)
736
+ if audio.dim() == 1:
737
+ audio_for_wm = audio.unsqueeze(0).unsqueeze(0)
738
+ elif audio.dim() == 2:
739
+ audio_for_wm = audio.unsqueeze(0)
740
+ else:
741
+ audio_for_wm = audio
742
+
743
+ audio_for_wm = audio_for_wm.float()
744
+
745
+ # Resample to 16kHz for AudioSeal
746
+ if sample_rate != 16000:
747
+ audio_16k = F.resample(audio_for_wm, sample_rate, 16000)
748
+ else:
749
+ audio_16k = audio_for_wm
750
+
751
+ # Load watermark generator (cached after first use)
752
+ if not hasattr(self, "_wm_generator"):
753
+ self._wm_generator = AudioSeal.load_generator("audioseal_wm_16bits").to(device)
754
+ self._wm_generator.eval()
755
+
756
+ # Generate and apply watermark
757
+ with torch.no_grad():
758
+ watermark_16k = self._wm_generator.get_watermark(audio_16k.to(device), 16000)
759
+
760
+ # Resample watermark back to original sample rate
761
+ if sample_rate != 16000:
762
+ watermark = F.resample(watermark_16k, 16000, sample_rate)
763
+ # Ensure same length
764
+ if watermark.shape[-1] != audio_for_wm.shape[-1]:
765
+ if watermark.shape[-1] > audio_for_wm.shape[-1]:
766
+ watermark = watermark[..., : audio_for_wm.shape[-1]]
767
+ else:
768
+ watermark = torch.nn.functional.pad(
769
+ watermark, (0, audio_for_wm.shape[-1] - watermark.shape[-1])
770
+ )
771
+ else:
772
+ watermark = watermark_16k
773
+
774
+ # Add watermark to audio
775
+ watermarked = audio_for_wm + watermark.to(audio_for_wm.device)
776
+
777
+ # Normalize to prevent clipping
778
+ max_val = watermarked.abs().max()
779
+ if max_val > 1.0:
780
+ watermarked = watermarked * (0.95 / max_val)
781
+
782
+ # Restore original shape
783
+ if len(original_shape) == 1:
784
+ watermarked = watermarked.squeeze(0).squeeze(0)
785
+ elif len(original_shape) == 2:
786
+ watermarked = watermarked.squeeze(0)
787
+
788
+ return watermarked.to(dtype=dtype)
789
+
790
+
791
# Register with AutoModel so that `AutoModel.from_pretrained(...)` and
# `AutoModelForCausalLM.from_pretrained(...)` can resolve KugelAudio
# checkpoints directly from their config class.
AutoModel.register(KugelAudioConfig, KugelAudioModel)
AutoModelForCausalLM.register(KugelAudioConfig, KugelAudioForConditionalGenerationInference)


# Public API of this module.
__all__ = [
    "KugelAudioForConditionalGenerationInference",
    "KugelAudioCausalLMOutputWithPast",
    "KugelAudioGenerationOutput",
]
kugelaudio_open/models/kugelaudio_model.py ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional, Tuple, Union, Callable
3
+ from tqdm import tqdm
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import torch.distributed as dist
8
+
9
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
10
+
11
+ from transformers.activations import ACT2FN
12
+ from transformers.modeling_outputs import (
13
+ CausalLMOutput,
14
+ BaseModelOutputWithPast,
15
+ ModelOutput,
16
+ )
17
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
18
+ from transformers import modeling_utils
19
+ from transformers.modeling_utils import PreTrainedModel
20
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
21
+ from transformers.utils import logging
22
+
23
+
24
+ from .tokenizer import (
25
+ KugelAudioAcousticTokenizerModel,
26
+ KugelAudioSemanticTokenizerModel,
27
+ )
28
+ from .diffusion_head import KugelAudioDiffusionHead
29
+ from ..schedule.dpm_solver import DPMSolverMultistepScheduler
30
+
31
+ from ..configs import KugelAudioConfig
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
# Compatibility shim: some transformers versions leave `ALL_PARALLEL_STYLES`
# unset or None, which breaks tensor-parallel plan validation. Backfill the
# expected default styles defensively.
# NOTE(review): presumably targets a specific transformers release — confirm
# this is still required when bumping the dependency.
if (
    not hasattr(modeling_utils, "ALL_PARALLEL_STYLES")
    or modeling_utils.ALL_PARALLEL_STYLES is None
):
    modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none", "colwise", "rowwise"]
41
+
42
+
43
@dataclass
class KugelAudioCausalLMOutputWithPast(ModelOutput):
    """Output of KugelAudio's causal-LM forward pass.

    Mirrors the standard causal-LM-with-past output fields and adds the
    diffusion-head loss and the speech-token count for the batch.
    """

    loss: Optional[torch.FloatTensor] = None  # training loss (composition set by forward — TODO confirm)
    diffusion_loss: Optional[torch.FloatTensor] = None  # diffusion-head loss component
    speech_token_num: Optional[torch.LongTensor] = None  # number of speech tokens in the batch
    logits: torch.FloatTensor = None  # next-token logits over the text vocabulary
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None  # KV cache for incremental decoding
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
52
+
53
+
54
@dataclass
class KugelAudioGenerationOutput(ModelOutput):
    """
    Output type for KugelAudio generation.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences.
        speech_outputs (`List[torch.FloatTensor]`, *optional*):
            List of generated speech waveforms or latents for each speech segment.
            An entry is `None` for batch items that produced no audio.
    """

    sequences: torch.LongTensor = None
    speech_outputs: Optional[List[torch.FloatTensor]] = None
68
+
69
+
70
class SpeechConnector(nn.Module):
    """Bridges speech-feature space into the language model's hidden space.

    A linear projection into the target size, RMS normalization, then a second
    linear refinement layer. Submodule names (`fc1`, `norm`, `fc2`) are part of
    the checkpoint layout and must not change.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        self.norm = LlamaRMSNorm(output_dim, eps=1e-6)
        self.fc2 = nn.Linear(output_dim, output_dim)

    def forward(self, features, **kwargs):
        # Project -> normalize -> refine, all within the target hidden size.
        hidden = self.norm(self.fc1(features))
        return self.fc2(hidden)
82
+
83
+
84
+ # @auto_docstring
85
class KugelAudioPreTrainedModel(PreTrainedModel):
    """Base class wiring KugelAudio models into the transformers framework.

    Declares the config class, weight-initialization behaviour, and the
    attention/cache capabilities advertised to transformers.
    """

    config_class = KugelAudioConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    # KV caches manage their own placement; keep them out of device maps.
    _skip_keys_device_placement = "past_key_values"
    _supports_cache_class = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        # The diffusion head defines its own initialization; delegate and stop.
        if isinstance(module, KugelAudioDiffusionHead):
            module.initialize_weights()
            return

        # Use the language model's initializer_range if available
        if hasattr(self.config, "language_model_config") and hasattr(
            self.config.language_model_config, "initializer_range"
        ):
            std = self.config.language_model_config.initializer_range
        elif hasattr(self.config, "decoder_config") and hasattr(
            self.config.decoder_config, "initializer_range"
        ):
            std = self.config.decoder_config.initializer_range
        else:
            std = 0.02  # Default value

        # Standard truncated-normal-style init for linears, identity for norms.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
121
+
122
+
123
+ # @auto_docstring
124
class KugelAudioModel(KugelAudioPreTrainedModel):
    """Core KugelAudio model: a causal LM plus speech tokenizers and connectors.

    Composes the language decoder with the acoustic/semantic tokenizers, the
    connectors projecting speech latents into the LM hidden space, and the
    diffusion prediction head with its noise scheduler.
    """

    def __init__(self, config):
        super().__init__(config)

        # Resolve the working dtype from the config (may be a string name).
        if hasattr(config, "torch_dtype") and config.torch_dtype is not None:
            if isinstance(config.torch_dtype, str):
                dtype = getattr(torch, config.torch_dtype)
            else:
                dtype = config.torch_dtype
        else:
            dtype = torch.float32

        # Initialize Qwen2 model for language modeling
        lm_config = config.decoder_config
        self.language_model = AutoModel.from_config(lm_config)

        # Speech tokenizers: acoustic (waveform codec) and semantic (content).
        self.acoustic_tokenizer = AutoModel.from_config(
            config.acoustic_tokenizer_config
        ).to(dtype)
        self.semantic_tokenizer = AutoModel.from_config(
            config.semantic_tokenizer_config
        ).to(dtype)

        # Connectors projecting tokenizer latents into the LM hidden size.
        self.acoustic_connector = SpeechConnector(
            config.acoustic_vae_dim, lm_config.hidden_size
        ).to(dtype)
        self.semantic_connector = SpeechConnector(
            config.semantic_vae_dim, lm_config.hidden_size
        ).to(dtype)

        # Register scaling factors as buffers - use 1D tensors for FSDP
        # compatibility. NaN marks "not yet estimated from data".
        self.register_buffer("speech_scaling_factor", torch.tensor(float("nan")))
        self.register_buffer("speech_bias_factor", torch.tensor(float("nan")))

        # Initialize prediction head for speech generation
        self.prediction_head = AutoModel.from_config(config.diffusion_head_config).to(
            dtype
        )

        # Initialize noise scheduler with SDE-DPM-Solver++ for better quality
        algorithm_type = getattr(
            config.diffusion_head_config, "ddpm_algorithm_type", "sde-dpmsolver++"
        )
        self.noise_scheduler = DPMSolverMultistepScheduler(
            num_train_timesteps=config.diffusion_head_config.ddpm_num_steps,
            beta_schedule=config.diffusion_head_config.ddpm_beta_schedule,
            prediction_type=config.diffusion_head_config.prediction_type,
            algorithm_type=algorithm_type,
            solver_order=2,
        )

    def get_input_embeddings(self):
        """Return the language model's token-embedding module.

        Handles nnscaler tensor-parallel checkpoints, where `embed_tokens` is
        renamed and tracked via the model's `fullmap`.
        """
        if hasattr(self.language_model, "embed_tokens"):
            # If the language model has an embed_tokens attribute, return it
            return self.language_model.embed_tokens

        # Parallelized by nnscaler: the attribute name is changed, so look it
        # up through the original-name map.
        for name, attr in self.language_model.fullmap.items():
            if attr.orig_name == "embed_tokens.weight":
                return getattr(self.language_model, name)

        # BUGFIX: was `assert False` — asserts are stripped under `python -O`,
        # which would make this fall through and return None. Raise explicitly.
        raise RuntimeError(
            "Could not locate input embeddings on the language model"
        )

    def set_input_embeddings(self, value):
        """Replace the language model's token-embedding module."""
        self.language_model.embed_tokens = value

    def set_speech_tokenizers(self, acoustic_tokenizer=None, semantic_tokenizer=None):
        """Set the speech tokenizers used for encoding and decoding speech."""
        self.acoustic_tokenizer = acoustic_tokenizer
        self.semantic_tokenizer = semantic_tokenizer

        # Tokenizers act as frozen feature extractors; keep them in eval mode.
        if self.acoustic_tokenizer is not None:
            self.acoustic_tokenizer.eval()

        if self.semantic_tokenizer is not None:
            self.semantic_tokenizer.eval()

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device = None,
        cache_position: torch.Tensor = None,
        batch_size: int = None,
        config=None,
        past_key_values=None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Creates a 4D causal attention mask for use with static cache.

        This enables torch.compile to work efficiently without recompilation
        by providing a consistent mask shape during autoregressive generation.

        Based on the standard HuggingFace implementation without sliding window
        (KugelAudio doesn't use sliding window attention).

        Compatible with both old and new transformers API.
        """
        # Handle case where attention_mask is already 4D
        if attention_mask is not None and attention_mask.dim() == 4:
            return attention_mask

        # Get device from attention_mask or cache_position if not provided
        if device is None:
            if attention_mask is not None:
                device = attention_mask.device
            elif cache_position is not None:
                device = cache_position.device
            else:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        min_dtype = torch.finfo(dtype).min

        # Start fully masked; allowed positions are opened up below.
        causal_mask = torch.full(
            (sequence_length, target_length),
            fill_value=min_dtype,
            dtype=dtype,
            device=device,
        )

        if sequence_length != 1:
            # Apply upper triangular mask (can't attend to future tokens)
            causal_mask = torch.triu(causal_mask, diagonal=1)

        # Mask positions beyond current cache position
        if cache_position is not None:
            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)

        # Expand to 4D: (batch_size, 1, sequence_length, target_length)
        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)

        # Combine with input attention mask if provided
        if attention_mask is not None:
            causal_mask = causal_mask.clone()
            mask_length = attention_mask.shape[-1]
            # Positions that sum to exactly 0 are causally visible but padded
            # in the 2D mask; those get re-masked below.
            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(dtype) * min_dtype
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                padding_mask, min_dtype
            )

        return causal_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Delegate to the language model and normalize the output container."""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        # Forward through language model
        outputs = self.language_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )

        if not return_dict:
            return outputs

        # Re-wrap so callers always receive this module's output type.
        return BaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
320
+
321
+
322
+ class KugelAudioForConditionalGeneration(KugelAudioPreTrainedModel):
323
+ """
324
+ Unified model for both training and inference.
325
+
326
+ Supports:
327
+ - Training via forward() with loss computation
328
+ - Inference via generate() for audio generation
329
+ """
330
+
331
+ _tied_weights_keys = ["lm_head.weight"]
332
+ _tp_plan = {"lm_head": "colwise_rep"}
333
+
334
    def __init__(self, config):
        """Build the wrapped KugelAudioModel plus the LM head.

        Args:
            config: A KugelAudioConfig with `decoder_config` and (optionally)
                `diffusion_head_config`.
        """
        super().__init__(config)
        self.model = KugelAudioModel(config)
        self.vocab_size = config.decoder_config.vocab_size
        # Untied by default; `tie_weights` may share this with the input embeddings.
        self.lm_head = nn.Linear(
            config.decoder_config.hidden_size, self.vocab_size, bias=False
        )

        # Inference configuration (for generate() method)
        self.ddpm_inference_steps = (
            config.diffusion_head_config.ddpm_num_inference_steps
            if hasattr(config, "diffusion_head_config")
            else 5  # fallback when no diffusion head config is present
        )

        self.post_init()
350
+
351
    # Properties for easier access (used by generate())
    @property
    def noise_scheduler(self):
        # The inner model owns the scheduler instance; expose it read-only here.
        return self.model.noise_scheduler
355
+
356
    @property
    def prediction_head(self):
        # The diffusion head lives on the inner model; expose it for generate().
        return self.model.prediction_head
359
+
360
    def get_input_embeddings(self):
        # Delegate to the wrapped KugelAudioModel (handles parallel-renamed embeddings).
        return self.model.get_input_embeddings()
362
+
363
    def set_input_embeddings(self, value):
        # Delegate to the wrapped KugelAudioModel.
        self.model.set_input_embeddings(value)
365
+
366
    def get_output_embeddings(self):
        # The LM head maps hidden states to vocabulary logits.
        return self.lm_head
368
+
369
    def set_decoder(self, decoder):
        # Swap the underlying language model (decoder) module.
        self.model.language_model = decoder
371
+
372
    def get_decoder(self):
        # Return the underlying language model (decoder) module.
        return self.model.language_model
374
+
375
    def tie_weights(self):
        """
        Tie the weights between the input embeddings and the output embeddings.

        Only ties when `decoder_config.tie_word_embeddings` is set. Relies on
        plain parameter-object assignment, which must happen BEFORE the model
        is wrapped (e.g. by FSDP/accelerate).
        """
        if getattr(self.config.decoder_config, "tie_word_embeddings", False):
            # The standard PreTrainedModel method will handle the tying.
            # It typically does a simple parameter object assignment, which is
            # CORRECT to do BEFORE FSDP wraps the model.
            output_embeddings = self.get_output_embeddings()
            input_embeddings = self.get_input_embeddings()
            if hasattr(input_embeddings, "weight"):
                output_embeddings.weight = input_embeddings.weight
            else:
                # maybe returned input_embeddings a tensor directly
                output_embeddings.weight = input_embeddings

            # If the tied weight has more rows than the existing bias (e.g. a
            # resized vocabulary), zero-pad the bias to match.
            if getattr(output_embeddings, "bias", None) is not None:
                output_embeddings.bias.data = nn.functional.pad(
                    output_embeddings.bias.data,
                    (
                        0,
                        output_embeddings.weight.shape[0]
                        - output_embeddings.bias.shape[0],
                    ),
                    "constant",
                    0,
                )
            print("βœ… Tied input and output embeddings using standard assignment.")
        else:
            print("ℹ️ tie_word_embeddings is False, not tying weights.")
405
+
406
    # NOTE(review): swapping `lm_head` wholesale is only safe before the model
    # is wrapped (e.g. by accelerator.prepare() / FSDP); avoid calling it later.
    def set_output_embeddings(self, new_embeddings):
        # Direct module replacement; callers must re-tie weights if required.
        self.lm_head = new_embeddings
412
+
413
+ def forward_speech_features(
414
+ self,
415
+ speech_tensors=None,
416
+ speech_masks=None,
417
+ speech_type="audio",
418
+ return_unmask=False,
419
+ ):
420
+ if speech_tensors is None:
421
+ # Use config to get vae_dim instead of non-existent self.args
422
+ vae_dim = self.config.acoustic_tokenizer_config.vae_dim
423
+ audio_features = torch.zeros(1, 1, vae_dim).to(
424
+ self.get_input_embeddings().weight
425
+ )
426
+ connect_features = self.model.acoustic_connector(audio_features)
427
+ return audio_features, connect_features
428
+ else:
429
+ with torch.no_grad():
430
+ if speech_type == "audio":
431
+ with torch.no_grad():
432
+ frames_out = self.model.acoustic_tokenizer.encode(
433
+ speech_tensors.unsqueeze(1)
434
+ )
435
+ if isinstance(frames_out, (list, tuple)):
436
+ frames = frames_out[0][0]
437
+ else:
438
+ frames = frames_out
439
+ audio_tokens = frames.sample(
440
+ self.model.acoustic_tokenizer.std_dist_type
441
+ )[0]
442
+
443
+ elif speech_type == "vae":
444
+ # Use config to get vae_dim instead of non-existent self.args
445
+ vae_dim = self.config.acoustic_tokenizer_config.vae_dim
446
+ speech_mode = speech_tensors.reshape(
447
+ speech_tensors.size(0), -1, vae_dim
448
+ )
449
+
450
+ # gaussian sample from the speech_mode
451
+ batch_size = speech_mode.size(0)
452
+ value = self.model.acoustic_tokenizer.fix_std / 0.8
453
+ std = (
454
+ torch.randn(
455
+ batch_size,
456
+ dtype=speech_mode.dtype,
457
+ device=speech_mode.device,
458
+ )
459
+ * value
460
+ )
461
+ std = std.view(-1, *[1] * (speech_mode.dim() - 1))
462
+ audio_tokens = speech_mode + std * torch.randn(
463
+ speech_mode.shape
464
+ ).to(speech_mode)
465
+ else:
466
+ raise NotImplementedError(
467
+ f"Speech type {speech_type} not implemented"
468
+ )
469
+
470
+ if torch.isnan(self.model.speech_scaling_factor) or torch.isnan(
471
+ self.model.speech_bias_factor
472
+ ):
473
+ scaling_factor = 1.0 / audio_tokens[speech_masks].flatten().std()
474
+ bias_factor = -audio_tokens[speech_masks].flatten().mean()
475
+
476
+ # Only use distributed operations if the process group is initialized
477
+ if dist.is_available() and dist.is_initialized():
478
+ dist.all_reduce(scaling_factor, op=dist.ReduceOp.SUM)
479
+ dist.all_reduce(bias_factor, op=dist.ReduceOp.SUM)
480
+ world_size = dist.get_world_size()
481
+ self.model.speech_scaling_factor.copy_(
482
+ scaling_factor / world_size
483
+ )
484
+ self.model.speech_bias_factor.copy_(bias_factor / world_size)
485
+ print(
486
+ f"Speech scaling factor (distributed): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}",
487
+ flush=True,
488
+ )
489
+ else:
490
+ # Single process case
491
+ self.model.speech_scaling_factor.copy_(scaling_factor)
492
+ self.model.speech_bias_factor.copy_(bias_factor)
493
+ print(
494
+ f"Speech scaling factor (single process): {self.model.speech_scaling_factor}, bias factor: {self.model.speech_bias_factor}",
495
+ flush=True,
496
+ )
497
+
498
+ audio_features = (
499
+ audio_tokens + self.model.speech_bias_factor
500
+ ) * self.model.speech_scaling_factor
501
+
502
+ connect_features = self.model.acoustic_connector(audio_features)
503
+ if return_unmask:
504
+ return audio_features, connect_features
505
+ return audio_features[speech_masks], connect_features[speech_masks]
506
+
507
+ def forward(
508
+ self,
509
+ input_ids: torch.LongTensor = None,
510
+ attention_mask: Optional[torch.Tensor] = None,
511
+ position_ids: Optional[torch.LongTensor] = None,
512
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
513
+ inputs_embeds: Optional[torch.FloatTensor] = None,
514
+ labels: Optional[torch.LongTensor] = None,
515
+ use_cache: Optional[bool] = False,
516
+ output_attentions: Optional[bool] = None,
517
+ output_hidden_states: Optional[bool] = None,
518
+ return_dict: Optional[bool] = None,
519
+ cache_position: Optional[torch.LongTensor] = None,
520
+ # New arguments for speech processing and loss calculation
521
+ speech_tensors: Optional[torch.FloatTensor] = None,
522
+ speech_masks: Optional[torch.BoolTensor] = None,
523
+ speeches_loss_input: Optional[torch.FloatTensor] = None,
524
+ speech_semantic_tensors: Optional[torch.FloatTensor] = None,
525
+ acoustic_input_mask: Optional[torch.BoolTensor] = None,
526
+ acoustic_loss_mask: Optional[torch.BoolTensor] = None,
527
+ ddpm_batch_mul: int = 1,
528
+ **kwargs: Optional[Dict[str, Union[torch.Tensor, str]]],
529
+ ) -> Union[Tuple, KugelAudioCausalLMOutputWithPast]:
530
+
531
+ return_dict = (
532
+ return_dict if return_dict is not None else self.config.use_return_dict
533
+ )
534
+
535
+ x = self.get_input_embeddings()(input_ids)
536
+
537
+ semantic_speech_all_connect_features = self.model.semantic_connector(
538
+ speech_semantic_tensors
539
+ )
540
+ if speeches_loss_input is not None:
541
+ # only part audio need diffuse
542
+ speech_all_features, speech_all_connect_features = (
543
+ self.forward_speech_features(
544
+ speech_tensors=(
545
+ speech_tensors.type_as(x)
546
+ if speech_tensors is not None
547
+ else None
548
+ ),
549
+ speech_masks=speech_masks,
550
+ speech_type=kwargs.get("speech_type", "audio"),
551
+ return_unmask=True,
552
+ )
553
+ )
554
+ if speech_tensors is not None:
555
+ if semantic_speech_all_connect_features is not None:
556
+ x[acoustic_input_mask] = (
557
+ speech_all_connect_features[speech_masks]
558
+ + semantic_speech_all_connect_features[speech_masks]
559
+ )
560
+ else:
561
+ x[acoustic_input_mask] = speech_all_connect_features[speech_masks]
562
+ speech_features = speech_all_features[
563
+ speeches_loss_input & speech_masks
564
+ ] # only part audio need diffuse
565
+ speech_connect_features = speech_all_connect_features[
566
+ speeches_loss_input & speech_masks
567
+ ]
568
+ # Forward-time consistency check: selected latent count should match number of acoustic placeholders
569
+ try:
570
+ if acoustic_input_mask is not None:
571
+ assert speech_connect_features.shape[0] == int(
572
+ acoustic_input_mask.sum().item()
573
+ ), f"Mismatch between selected speech connectors ({speech_connect_features.shape[0]}) and acoustic_input_mask sum ({int(acoustic_input_mask.sum().item())})"
574
+ except Exception:
575
+ pass
576
+ else:
577
+ speech_features, speech_connect_features = self.forward_speech_features(
578
+ speech_tensors=(
579
+ speech_tensors.type_as(x) if speech_tensors is not None else None
580
+ ),
581
+ speech_masks=speech_masks,
582
+ speech_type=kwargs.get("speech_type", "audio"),
583
+ )
584
+ if speech_tensors is not None:
585
+ x[acoustic_input_mask] = speech_connect_features
586
+
587
+ outputs = self.model(
588
+ input_ids=None,
589
+ attention_mask=attention_mask,
590
+ position_ids=position_ids,
591
+ past_key_values=past_key_values,
592
+ inputs_embeds=x,
593
+ use_cache=use_cache,
594
+ output_attentions=output_attentions,
595
+ output_hidden_states=False,
596
+ return_dict=return_dict,
597
+ cache_position=cache_position,
598
+ )
599
+
600
+ hidden_states = outputs.last_hidden_state
601
+ logits = self.lm_head(hidden_states)
602
+ # logits = logits.float()
603
+
604
+ loss = None
605
+ if labels is not None:
606
+ # The custom CE loss with masking is calculated in the training script.
607
+ # We leave the standard loss calculation here as None.
608
+ pass
609
+
610
+ # --- Diffusion Loss Calculation ---
611
+ diffusion_loss = None
612
+ # This block is executed only if we are in a context that involves speech.
613
+ if speech_tensors is not None and acoustic_loss_mask.sum().item() > 0:
614
+ # Build conditioning mask from positions whose NEXT token is a speech latent (shift left by 1)
615
+ cond_mask = torch.zeros_like(acoustic_loss_mask, dtype=torch.bool)
616
+ cond_mask[:, :-1] = acoustic_loss_mask[:, 1:]
617
+ cond_mask[:, 0] = False
618
+ condition_features = hidden_states[cond_mask]
619
+
620
+ speech_len, latent_size = speech_features.shape
621
+ # Sanity check: ensure 1:1 alignment between selected conditions and latents
622
+ try:
623
+ assert (
624
+ condition_features.shape[0] == speech_len
625
+ ), f"Mismatch: condition_features={condition_features.shape[0]} vs speech_features={speech_len}"
626
+ except Exception:
627
+ pass
628
+
629
+ noise = torch.randn(
630
+ (speech_len * ddpm_batch_mul, latent_size),
631
+ device=hidden_states.device,
632
+ dtype=hidden_states.dtype,
633
+ )
634
+
635
+ timesteps = torch.multinomial(
636
+ torch.ones(self.config.diffusion_head_config.ddpm_num_steps),
637
+ speech_len * ddpm_batch_mul,
638
+ replacement=True,
639
+ ).to(hidden_states.device)
640
+
641
+ speech_features_repeated = speech_features.repeat_interleave(
642
+ ddpm_batch_mul, dim=0
643
+ )
644
+ condition_features_repeated = condition_features.repeat_interleave(
645
+ ddpm_batch_mul, dim=0
646
+ )
647
+
648
+ noisy_speech_features = self.model.noise_scheduler.add_noise(
649
+ speech_features_repeated, noise, timesteps
650
+ )
651
+
652
+ model_output = self.model.prediction_head(
653
+ noisy_speech_features, timesteps.type_as(x), condition_features_repeated
654
+ )
655
+
656
+ prediction_type = self.config.diffusion_head_config.prediction_type
657
+ if prediction_type == "epsilon":
658
+ target_for_loss = noise
659
+ elif prediction_type == "v_prediction":
660
+ target_for_loss = self.model.noise_scheduler.get_velocity(
661
+ speech_features_repeated, noise, timesteps
662
+ )
663
+ else:
664
+ raise NotImplementedError(
665
+ f"Prediction type {prediction_type} not implemented"
666
+ )
667
+
668
+ diffusion_loss = F.mse_loss(
669
+ model_output.float(), target_for_loss.float(), reduction="sum"
670
+ )
671
+ if latent_size > 0 and ddpm_batch_mul > 0:
672
+ # Normalize by latent dim, number of sampled diffusion steps per latent, and number of speech tokens
673
+ diffusion_loss = (
674
+ diffusion_loss / latent_size / ddpm_batch_mul / max(speech_len, 1)
675
+ )
676
+ else:
677
+ diffusion_loss = torch.tensor(0.0, device=diffusion_loss.device)
678
+
679
+ else:
680
+ # Dummy loss for DDP to work when there are no speech samples in a batch,
681
+ # but we are in a speech context.
682
+ diffusion_loss = (
683
+ sum(p.sum() for p in self.model.prediction_head.parameters()) * 0.0
684
+ )
685
+ diffusion_loss += (
686
+ sum(p.sum() for p in self.model.acoustic_connector.parameters()) * 0.0
687
+ )
688
+ diffusion_loss += (
689
+ sum(p.sum() for p in self.model.semantic_connector.parameters()) * 0.0
690
+ )
691
+ # --- End Diffusion Loss Calculation ---
692
+
693
+ if not return_dict:
694
+ output = (logits, speech_len) + outputs.to_tuple()[1:]
695
+ return (loss, diffusion_loss) + output
696
+
697
+ return KugelAudioCausalLMOutputWithPast(
698
+ loss=loss,
699
+ diffusion_loss=diffusion_loss,
700
+ speech_token_num=torch.tensor(
701
+ speech_len if speech_tensors is not None else 0,
702
+ device=logits.device,
703
+ dtype=torch.long,
704
+ ),
705
+ logits=logits,
706
+ past_key_values=outputs.past_key_values,
707
+ hidden_states=outputs.hidden_states,
708
+ attentions=outputs.attentions,
709
+ )
710
+
711
+
712
+ AutoModel.register(KugelAudioConfig, KugelAudioModel)
713
+ AutoModelForCausalLM.register(KugelAudioConfig, KugelAudioForConditionalGeneration)
714
+
715
# Public re-export surface of this module.
__all__ = [
    "KugelAudioModel",
    "KugelAudioPreTrainedModel",
    "KugelAudioForConditionalGeneration",
    "KugelAudioCausalLMOutputWithPast",
    "KugelAudioGenerationOutput",
]
kugelaudio_open/models/tokenizer.py ADDED
@@ -0,0 +1,1197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import typing as tp
3
+ from functools import partial
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, List, Optional, Tuple, Union
6
+ import copy
7
+
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+ import torch._dynamo
13
+
14
+ from transformers.models.auto import AutoModel
15
+
16
+ from transformers.configuration_utils import PretrainedConfig
17
+ from transformers.utils import logging
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.activations import ACT2FN
20
+
21
+ from ..configs import KugelAudioAcousticTokenizerConfig, KugelAudioSemanticTokenizerConfig
22
+
23
logger = logging.get_logger(__name__)

# APEX fused kernels are not shipped with the open-source release; the flag is
# hard-coded False so normalization layers always take the native-PyTorch path.
APEX_AVAILABLE = False
27
+
28
+ # Normalization modules
29
class ConvLayerNorm(nn.LayerNorm):
    """
    Convolution-friendly LayerNorm that moves channels to last dimensions
    before running the normalization and moves them back to original position right after.

    Note: the original class defined an ``__init__`` that merely forwarded its
    arguments to ``nn.LayerNorm.__init__``; it was redundant and has been
    removed (the inherited constructor accepts the same arguments).
    """

    def forward(self, x):
        # (B, C, T) -> (B, T, C): LayerNorm normalizes over the trailing dims.
        x = x.transpose(1, 2)  # b ... t -> b t ...
        # Normalize in float32 for numerical stability, then cast back.
        # NOTE(review): assumes elementwise affine params exist (weight/bias
        # not None), as did the original implementation.
        x = nn.functional.layer_norm(
            x.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps
        ).type_as(x)
        x = x.transpose(1, 2)  # b t ... -> b ... t
        return x
42
+
43
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction).

    Normalizes over the last dimension by the RMS of the activations, with an
    optional learnable per-feature scale.
    """

    def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=True, weight_shape=None):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not elementwise_affine:
            self.register_parameter('weight', None)
        else:
            shape = weight_shape if weight_shape is not None else (dim,)
            self.weight = nn.Parameter(torch.ones(shape))

    def _norm(self, x):
        # x / rms(x), computed via the reciprocal square root for efficiency.
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Normalize in float32, then cast back to the input dtype.
        normed = self._norm(x.float()).type_as(x)
        if self.weight is None:
            return normed
        return normed * self.weight

    def extra_repr(self) -> str:
        return f'dim={self.dim}, eps={self.eps}, elementwise_affine={self.elementwise_affine}'
66
+
67
class ConvRMSNorm(RMSNorm):
    """RMSNorm for convolutional tensors shaped (batch, channels, time).

    Transposes to channels-last, applies RMS normalization over the channel
    dimension, and transposes back.
    """

    def __init__(self, dim: int, eps: float = 1e-5, elementwise_affine=True, weight_shape=None):
        super().__init__(dim, eps, elementwise_affine, weight_shape)

    def forward(self, x):
        x = x.transpose(1, 2)  # b ... t -> b t ...
        # Native implementation only: the fused-APEX branch was removed because
        # APEX_AVAILABLE is hard-coded False in the open-source release, making
        # that branch dead code that referenced an undefined name
        # (`fused_rms_norm_affine`).
        output = self._norm(x.float()).type_as(x)
        if self.weight is not None:
            output = output * self.weight
        output = output.transpose(1, 2)  # b t ... -> b ... t
        return output
82
+
83
+ # Convolutional layers and utilities
84
# Accepted values for the `norm` argument of the conv wrappers below.
# 'weight_norm'/'spectral_norm' reparametrize the conv weights; the remaining
# values select a normalization module applied to the conv output (or none).
CONV_NORMALIZATIONS = frozenset(['none', 'weight_norm', 'spectral_norm',
                                 'time_layer_norm', 'layer_norm', 'time_group_norm'])
86
+
87
+
88
def apply_parametrization_norm(module: nn.Module, norm: str = 'none') -> nn.Module:
    """Wrap *module* with the requested weight reparametrization.

    Only 'weight_norm' and 'spectral_norm' actually reparametrize; every other
    value in CONV_NORMALIZATIONS leaves the module untouched.
    """
    assert norm in CONV_NORMALIZATIONS
    if norm == 'weight_norm':
        return nn.utils.weight_norm(module)
    if norm == 'spectral_norm':
        return nn.utils.spectral_norm(module)
    # Validated above against CONV_NORMALIZATIONS: the remaining choices
    # require no weight reparametrization.
    return module
98
+
99
+
100
def get_norm_module(module: nn.Module, causal: bool = False, norm: str = 'none', **norm_kwargs) -> nn.Module:
    """Return the proper normalization module. If causal is True, this will ensure the returned
    module is causal, or return an error if the normalization doesn't support causal evaluation.
    """
    assert norm in CONV_NORMALIZATIONS
    if norm == 'layer_norm':
        assert isinstance(module, nn.modules.conv._ConvNd)
        return ConvLayerNorm(module.out_channels, **norm_kwargs)
    if norm == 'time_group_norm':
        if causal:
            raise ValueError("GroupNorm doesn't support causal evaluation.")
        assert isinstance(module, nn.modules.conv._ConvNd)
        return nn.GroupNorm(1, module.out_channels, **norm_kwargs)
    # All other (validated) choices need no output normalization.
    return nn.Identity()
115
+
116
+
117
def get_extra_padding_for_conv1d(x: torch.Tensor, kernel_size: int, stride: int,
                                 padding_total: int = 0) -> int:
    """Return how many trailing samples must be padded so the final frame of a
    strided Conv1d covers the whole (padded) input without truncation."""
    length = x.shape[-1]
    # Fractional number of frames the conv would produce on the padded input.
    frames = (length - kernel_size + padding_total) / stride + 1
    # Smallest input length yielding an integral frame count.
    target_length = (math.ceil(frames) - 1) * stride + kernel_size - padding_total
    return target_length - length
124
+
125
+
126
def pad1d(x: torch.Tensor, paddings: tp.Tuple[int, int], mode: str = 'zero', value: float = 0.):
    """Pad the last dimension of *x*.

    For 'reflect' mode, inputs shorter than the pad width are first extended
    with zeros on the right (F.pad requires input > pad for reflection), and
    the temporary extension is trimmed off afterwards.
    """
    padding_left, padding_right = paddings
    assert padding_left >= 0 and padding_right >= 0, (padding_left, padding_right)
    if mode != 'reflect':
        return F.pad(x, paddings, mode, value)
    length = x.shape[-1]
    max_pad = max(padding_left, padding_right)
    extra_pad = max_pad - length + 1 if length <= max_pad else 0
    if extra_pad:
        x = F.pad(x, (0, extra_pad))
    padded = F.pad(x, paddings, mode, value)
    # Drop the zero-extension added above so output length is exact.
    return padded[..., : padded.shape[-1] - extra_pad]
142
+
143
+
144
def unpad1d(x: torch.Tensor, paddings: tp.Tuple[int, int]):
    """Remove padding from x, handling properly zero padding. Only for 1d!"""
    left, right = paddings
    assert left >= 0 and right >= 0, (left, right)
    assert left + right <= x.shape[-1]
    # Slice off `left` samples at the start and `right` at the end.
    return x[..., left: x.shape[-1] - right]
151
+
152
+
153
class NormConv1d(nn.Module):
    """Wrapper around Conv1d and normalization applied to this conv.

    Positional args/kwargs are forwarded to ``nn.Conv1d``; ``norm`` selects a
    weight reparametrization and/or output normalization module.
    """
    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None, **kwargs):
        super().__init__()
        # Use a None sentinel instead of the original mutable default `{}`
        # (shared across instances); behavior is unchanged for all callers.
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.conv = apply_parametrization_norm(nn.Conv1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.conv, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.conv(x)
        x = self.norm(x)
        return x
166
+
167
+
168
class NormConvTranspose1d(nn.Module):
    """Wrapper around ConvTranspose1d and normalization applied to this conv.

    Positional args/kwargs are forwarded to ``nn.ConvTranspose1d``; ``norm``
    selects a weight reparametrization and/or output normalization module.
    """
    def __init__(self, *args, causal: bool = False, norm: str = 'none',
                 norm_kwargs: tp.Optional[tp.Dict[str, tp.Any]] = None, **kwargs):
        super().__init__()
        # Use a None sentinel instead of the original mutable default `{}`
        # (shared across instances); behavior is unchanged for all callers.
        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
        self.convtr = apply_parametrization_norm(nn.ConvTranspose1d(*args, **kwargs), norm)
        self.norm = get_norm_module(self.convtr, causal, norm, **norm_kwargs)
        self.norm_type = norm

    def forward(self, x):
        x = self.convtr(x)
        x = self.norm(x)
        return x
181
+
182
+
183
class KugelAudioTokenizerStreamingCache:
    """Cache for streaming convolution, similar to KV cache in attention.

    Keyed by ``(layer_id, sample_idx)`` so each conv layer keeps an
    independent per-sample context tensor between chunks.
    """
    def __init__(self):
        self.cache = {}  # Dict mapping (layer_id, sample_idx) to state tensor

    def get(self, layer_id: str, sample_indices: torch.Tensor) -> Optional[torch.Tensor]:
        """Get cached states for given layer and sample indices.

        Returns a batch-stacked tensor, or ``None`` if ANY requested sample is
        missing (caller then initializes fresh context for the whole batch).
        """
        states = []
        max_length = 0

        # First pass: collect states and find max length
        for idx in sample_indices.tolist():
            key = (layer_id, idx)
            if key not in self.cache:
                return None  # If any sample is missing, return None
            state = self.cache[key]
            states.append(state)
            max_length = max(max_length, state.shape[-1])

        # Second pass: pad states to max length if needed
        if len(states) > 0 and states[0].dim() >= 2:
            padded_states = []
            for state in states:
                if state.shape[-1] < max_length:
                    # Pad on the time dimension (last dimension)
                    pad_size = max_length - state.shape[-1]
                    # Pad with zeros on the LEFT to align the most recent samples
                    padded_state = F.pad(state, (pad_size, 0), mode='constant', value=0)
                    padded_states.append(padded_state)
                else:
                    padded_states.append(state)
            return torch.stack(padded_states, dim=0)
        else:
            return torch.stack(states, dim=0)

    def set(self, layer_id: str, sample_indices: torch.Tensor, states: torch.Tensor):
        """Set cached states for given layer and sample indices.

        States are detached so the cache never extends the autograd graph
        across chunks.
        """
        for i, idx in enumerate(sample_indices.tolist()):
            key = (layer_id, idx)
            self.cache[key] = states[i].detach()

    def set_to_zero(self, sample_indices: torch.Tensor):
        """Set all cached states to zero for given sample indices (all layers)."""
        for key in list(self.cache.keys()):
            layer_id, sample_idx = key
            if sample_idx in sample_indices.tolist():
                # Create zero tensor with same shape and dtype as cached tensor
                cached_tensor = self.cache[key]
                self.cache[key] = torch.zeros_like(cached_tensor)

    def clear(self, layer_id: Optional[str] = None, sample_indices: Optional[torch.Tensor] = None):
        """Clear cache for specific layer/samples or everything.

        NOTE(review): ``layer_id is None`` combined with explicit
        ``sample_indices`` is silently a no-op — confirm callers never rely on
        clearing a sample across all layers.
        """
        if layer_id is None and sample_indices is None:
            self.cache.clear()
        elif layer_id is not None and sample_indices is None:
            # Clear all samples for a specific layer
            keys_to_remove = [k for k in self.cache.keys() if k[0] == layer_id]
            for k in keys_to_remove:
                del self.cache[k]
        elif layer_id is not None and sample_indices is not None:
            # Clear specific samples for a specific layer
            for idx in sample_indices.tolist():
                key = (layer_id, idx)
                self.cache.pop(key, None)
247
+
248
class SConv1d(nn.Module):
    """Conv1d with built-in handling of asymmetric or causal padding and normalization."""
    def __init__(self, in_channels: int, out_channels: int,
                 kernel_size: int, stride: int = 1, dilation: int = 1,
                 groups: int = 1, bias: bool = True, causal: bool = False,
                 norm: str = 'none', norm_kwargs: tp.Dict[str, tp.Any] = {},
                 pad_mode: str = 'reflect'):
        super().__init__()
        self.conv = NormConv1d(in_channels, out_channels, kernel_size, stride,
                               dilation=dilation, groups=groups, bias=bias, causal=causal,
                               norm=norm, norm_kwargs=norm_kwargs)
        self.causal = causal
        self.pad_mode = pad_mode

        # Store configuration
        self.kernel_size = kernel_size
        self.dilation = dilation
        self.stride = stride
        self.in_channels = in_channels
        self.out_channels = out_channels

        # For causal convolution, we need to maintain kernel_size - 1 samples as context
        # need to check use which context_size is more suitable
        # self.context_size = (kernel_size - 1) * dilation
        self.context_size = (kernel_size - 1) * dilation - (stride - 1)

        # For non-streaming mode, calculate padding
        # (identical formula to context_size: left-pad amount for causal convs)
        self.padding_total = (kernel_size - 1) * dilation - (stride - 1)

        # Create a unique layer ID for cache management
        self._layer_id = None

    @property
    def layer_id(self):
        # Lazily derive a process-unique key (based on object identity) used
        # to index this layer's entries in the streaming cache.
        if self._layer_id is None:
            self._layer_id = f"sconv1d_{id(self)}"
        return self._layer_id

    def forward(self, x: torch.Tensor,
                cache: Optional[KugelAudioTokenizerStreamingCache] = None,
                sample_indices: Optional[torch.Tensor] = None,
                use_cache: bool = False,
                debug: bool = False) -> torch.Tensor:
        """
        Forward pass with optional streaming support via cache.

        Args:
            x: Input tensor [batch_size, channels, time]
            cache: KugelAudioTokenizerStreamingCache object for maintaining states
            sample_indices: Indices identifying each sample for cache management
            use_cache: Whether to use cached states for streaming
            debug: Whether to print debug information

        Returns:
            Output tensor
        """
        B, C, T = x.shape

        # Non-streaming mode
        if not use_cache or cache is None:
            return self._forward_non_streaming(x, debug=debug)

        # Streaming mode
        assert self.causal, "Streaming mode is only supported for causal convolutions"
        assert sample_indices is not None, "sample_indices must be provided for streaming mode"
        assert len(sample_indices) == B, "sample_indices must match batch size"

        return self._forward_streaming(x, cache, sample_indices, debug)

    @torch._dynamo.disable()  # Disable compilation for streaming path - dynamic cache ops cause recompilations
    def _forward_streaming(self, x: torch.Tensor,
                           cache: KugelAudioTokenizerStreamingCache,
                           sample_indices: torch.Tensor,
                           debug: bool = False) -> torch.Tensor:
        """Streaming forward pass with cache operations kept separate from compiled code"""
        B, C, T = x.shape

        # Cache operations (not compiled)
        cached_states = cache.get(self.layer_id, sample_indices)

        if cached_states is None:
            # First chunk - initialize with zeros for context
            if self.context_size > 0:
                cached_states = torch.zeros(B, C, self.context_size, device=x.device, dtype=x.dtype)
                if debug:
                    print(f"[DEBUG] Initialized cache with shape: {cached_states.shape}, context_size={self.context_size}")
            else:
                cached_states = torch.zeros(B, C, 0, device=x.device, dtype=x.dtype)
                if debug:
                    print(f"[DEBUG] No context needed (kernel_size=stride)")

        # Concatenate cached states with input
        if cached_states.shape[2] > 0:
            input_with_context = torch.cat([cached_states, x], dim=2)
        else:
            input_with_context = x

        if debug:
            print(f"[DEBUG] Input shape: {x.shape}, Cache shape: {cached_states.shape}, Combined: {input_with_context.shape}")

        # Apply convolution directly - no extra padding in streaming mode
        # The conv layer will handle its own padding internally
        output = self.conv(input_with_context)

        if debug:
            print(f"[DEBUG] Output shape: {output.shape}")

        # Update cache for next chunk
        if self.context_size > 0:
            # Calculate how many samples to keep
            total_input_length = input_with_context.shape[2]

            # Keep the last context_size samples
            if total_input_length >= self.context_size:
                new_cache_start = total_input_length - self.context_size
                new_cache = input_with_context[:, :, new_cache_start:]
            else:
                # If we have less than context_size samples, keep everything
                new_cache = input_with_context

            if debug:
                print(f"[DEBUG] New cache shape: {new_cache.shape}")

            cache.set(self.layer_id, sample_indices, new_cache)

        return output

    def _forward_non_streaming(self, x: torch.Tensor, debug: bool = False) -> torch.Tensor:
        """Standard forward pass without streaming"""
        B, C, T = x.shape
        kernel_size = self.kernel_size
        stride = self.stride
        dilation = self.dilation
        padding_total = self.padding_total

        # Ensure weight is on the same device as input
        if hasattr(self, "conv") and hasattr(self.conv, "conv") and hasattr(self.conv.conv, "weight"):
            if self.conv.conv.weight.device != x.device:
                self.conv.conv.to(x.device)

        # Compute extra padding for stride alignment
        extra_padding = get_extra_padding_for_conv1d(x, kernel_size, stride, padding_total)

        if debug:
            print(f"[DEBUG NON-STREAMING] Input shape: {x.shape}, padding_total={padding_total}, extra_padding={extra_padding}")

        if self.causal:
            # Left padding for causal
            if self.pad_mode == 'constant':
                x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode, value=0)
            else:
                x = pad1d(x, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Symmetric padding for non-causal
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            x = pad1d(x, (padding_left, padding_right + extra_padding), mode=self.pad_mode)

        if debug:
            print(f"[DEBUG NON-STREAMING] After padding: {x.shape}")

        output = self.conv(x)

        if debug:
            print(f"[DEBUG NON-STREAMING] Output shape: {output.shape}")

        return output
415
+
416
+
417
+ class SConvTranspose1d(nn.Module):
418
+ """ConvTranspose1d with built-in handling of asymmetric or causal padding and normalization."""
419
+ def __init__(self, in_channels: int, out_channels: int,
420
+ kernel_size: int, stride: int = 1, causal: bool = False,
421
+ norm: str = 'none', trim_right_ratio: float = 1.,
422
+ norm_kwargs: tp.Dict[str, tp.Any] = {}, bias: bool = True):
423
+ super().__init__()
424
+ self.convtr = NormConvTranspose1d(in_channels, out_channels, kernel_size, stride,
425
+ causal=causal, norm=norm, norm_kwargs=norm_kwargs, bias=bias)
426
+ self.causal = causal
427
+ self.trim_right_ratio = trim_right_ratio
428
+ assert self.causal or self.trim_right_ratio == 1., \
429
+ "`trim_right_ratio` != 1.0 only makes sense for causal convolutions"
430
+ assert self.trim_right_ratio >= 0. and self.trim_right_ratio <= 1.
431
+
432
+ # Store configuration
433
+ self.kernel_size = kernel_size
434
+ self.stride = stride
435
+ self.in_channels = in_channels
436
+ self.out_channels = out_channels
437
+
438
+ # For transposed convolution, padding calculation is different
439
+ self.padding_total = kernel_size - stride
440
+
441
+ # For streaming, we need to keep track of input history
442
+ # Transposed conv needs to see multiple input samples to produce correct output
443
+ self.context_size = kernel_size - 1
444
+
445
+ # Create a unique layer ID for cache management
446
+ self._layer_id = None
447
+
448
+ @property
449
+ def layer_id(self):
450
+ if self._layer_id is None:
451
+ self._layer_id = f"sconvtr1d_{id(self)}"
452
+ return self._layer_id
453
+
454
+ def forward(self, x: torch.Tensor,
455
+ cache: Optional[KugelAudioTokenizerStreamingCache] = None,
456
+ sample_indices: Optional[torch.Tensor] = None,
457
+ use_cache: bool = False,
458
+ debug: bool = False) -> torch.Tensor:
459
+ """
460
+ Forward pass with optional streaming support via cache.
461
+ """
462
+ B, C, T = x.shape
463
+
464
+ # Non-streaming mode
465
+ if not use_cache or cache is None:
466
+ return self._forward_non_streaming(x, debug=debug)
467
+
468
+ # Streaming mode
469
+ assert sample_indices is not None, "sample_indices must be provided for streaming mode"
470
+ assert len(sample_indices) == B, "sample_indices must match batch size"
471
+
472
+ return self._forward_streaming(x, cache, sample_indices, debug)
473
+
474
    @torch._dynamo.disable()  # Disable compilation for streaming path - dynamic cache ops cause recompilations
    def _forward_streaming(self, x: torch.Tensor,
                           cache: KugelAudioTokenizerStreamingCache,
                           sample_indices: torch.Tensor,
                           debug: bool = False) -> torch.Tensor:
        """Streaming forward pass with cache operations kept separate from compiled code.

        Concatenates the cached input tail with the new chunk, runs the full
        transposed convolution over the combined sequence, trims the synthesis
        padding, and returns only the samples produced by the new chunk.
        The cache stores raw *input* samples (not outputs), so overlapping
        context is re-convolved on every call.
        """
        # x is expected as (batch, channels, time) — the unpack enforces rank 3.
        B, C, T = x.shape

        # Cache operations (not compiled)
        cached_input = cache.get(self.layer_id, sample_indices)

        if cached_input is None:
            # First chunk - no history yet; use a zero-length placeholder so
            # the concatenation below is unconditional.
            cached_input = torch.zeros(B, C, 0, device=x.device, dtype=x.dtype)
            if debug:
                print(f"[DEBUG] Initialized empty cache for transposed conv")

        # Concatenate cached input with new input
        full_input = torch.cat([cached_input, x], dim=2)

        if debug:
            print(f"[DEBUG] Input shape: {x.shape}, Cache shape: {cached_input.shape}, Combined: {full_input.shape}")

        # First chunk or debug mode - use uncompiled version
        full_output = self.convtr(full_input)

        if debug:
            print(f"[DEBUG] Full transposed conv output shape: {full_output.shape}")

        # Calculate padding to remove. Causal mode splits the total padding
        # according to trim_right_ratio; otherwise it is split evenly.
        if self.causal:
            padding_right = math.ceil(self.padding_total * self.trim_right_ratio)
            padding_left = self.padding_total - padding_right
        else:
            padding_right = self.padding_total // 2
            padding_left = self.padding_total - padding_right

        # Remove padding
        if padding_left + padding_right > 0:
            full_output = unpad1d(full_output, (padding_left, padding_right))

        if debug:
            print(f"[DEBUG] After unpadding: {full_output.shape}")

        # Determine which part of the output corresponds to the new input
        if cached_input.shape[2] == 0:
            # First chunk - return all output
            output = full_output
        else:
            # Subsequent chunks - return only the new output. Each input step
            # expands to `stride` output samples after the transposed conv.
            expected_new_output = T * self.stride

            # Take the last expected_new_output samples; if the trimmed output
            # is shorter than expected, fall back to returning everything.
            if full_output.shape[2] >= expected_new_output:
                output = full_output[:, :, -expected_new_output:]
            else:
                output = full_output

        if debug:
            print(f"[DEBUG] Final streaming output shape: {output.shape}")

        # Update cache: keep at most the last `context_size` input samples as
        # overlap context for the next chunk.
        if full_input.shape[2] > self.context_size:
            new_cache = full_input[:, :, -self.context_size:]
        else:
            new_cache = full_input

        if debug:
            print(f"[DEBUG] New cache shape: {new_cache.shape}")

        cache.set(self.layer_id, sample_indices, new_cache)

        return output
547
+
548
+ def _forward_non_streaming(self, x: torch.Tensor, debug: bool = False) -> torch.Tensor:
549
+ """Standard forward pass without streaming"""
550
+ # Ensure weight is on the same device as input
551
+ if hasattr(self, "convtr") and hasattr(self.convtr, "convtr") and hasattr(self.convtr.convtr, "weight"):
552
+ if self.convtr.convtr.weight.device != x.device:
553
+ self.convtr.convtr.to(x.device)
554
+
555
+ if debug:
556
+ print(f"[DEBUG NON-STREAMING] Input shape: {x.shape}")
557
+
558
+ # Apply transposed convolution
559
+ y = self.convtr(x)
560
+
561
+ if debug:
562
+ print(f"[DEBUG NON-STREAMING] After transposed conv: {y.shape}")
563
+
564
+ # Calculate and remove padding
565
+ if self.causal:
566
+ padding_right = math.ceil(self.padding_total * self.trim_right_ratio)
567
+ padding_left = self.padding_total - padding_right
568
+ else:
569
+ padding_right = self.padding_total // 2
570
+ padding_left = self.padding_total - padding_right
571
+
572
+ if padding_left + padding_right > 0:
573
+ y = unpad1d(y, (padding_left, padding_right))
574
+
575
+ if debug:
576
+ print(f"[DEBUG NON-STREAMING] Final output shape: {y.shape}")
577
+
578
+ return y
579
+
580
# FFN
class FFN(nn.Module):
    """Position-wise feed-forward block: linear projection up, GELU, projection back."""

    def __init__(self, embed_dim, ffn_dim, bias=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.linear1 = nn.Linear(self.embed_dim, ffn_dim, bias=bias)
        self.gelu = ACT2FN["gelu"]
        self.linear2 = nn.Linear(ffn_dim, self.embed_dim, bias=bias)

    def forward(self, x):
        """Map (..., embed_dim) -> (..., embed_dim) through the two-layer MLP."""
        return self.linear2(self.gelu(self.linear1(x)))
599
+
600
+
601
class Convlayer(nn.Module):
    """Thin wrapper exposing a single streaming-aware SConv1d as a module."""

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 dilation=1, groups=1, bias=True, pad_mode='zeros',
                 norm='weight_norm', causal=True):
        super().__init__()
        self.conv = SConv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            groups=groups,
            bias=bias,
            pad_mode=pad_mode,
            norm=norm,
            causal=causal,
        )

    def forward(self, x):
        """Apply the wrapped convolution to (batch, channels, time) input."""
        return self.conv(x)
621
+
622
class Block1D(nn.Module):
    """Residual 1D block: (norm -> mixer conv) then (norm -> FFN), each branch
    with optional per-channel LayerScale and a shared drop-path wrapper.

    Args:
        dim: Channel dimension of the block's input and output.
        kernel_size: Kernel size of the mixer convolution.
        drop_path: Stochastic-depth rate (<= 0 disables it).
        mixer_layer: 'conv' for a grouped convolution or 'depthwise_conv'
            for a depthwise (groups=dim) convolution.
        layer_scale_init_value: Initial LayerScale value; <= 0 disables the
            learnable per-channel residual scaling.
        **kwargs: Extra options read via .get(): 'layernorm', 'eps', 'groups',
            'pad_mode', 'norm', 'causal', 'bias', 'ffn_expansion'.
    """

    def __init__(self, dim, kernel_size=7, drop_path=0., mixer_layer='conv',
                 layer_scale_init_value=1e-6, **kwargs):
        super().__init__()

        # Normalization layers. Fail fast on unsupported values instead of
        # leaving self.norm unset (the original elif chain silently skipped
        # unknown kinds, deferring the failure to forward()). The error
        # message matches TokenizerEncoder's for consistency.
        layernorm = kwargs.get('layernorm', 'LN')
        eps = kwargs.get('eps', 1e-6)
        if layernorm == 'LN':
            self.norm = ConvLayerNorm(dim, eps=eps)
            self.ffn_norm = ConvLayerNorm(dim, eps=eps)
        elif layernorm == 'RMSNorm':
            self.norm = ConvRMSNorm(dim, eps=eps)
            self.ffn_norm = ConvRMSNorm(dim, eps=eps)
        else:
            raise ValueError(f"Unsupported norm type: {layernorm}")

        # Token mixer: both variants share every argument except `groups`.
        if mixer_layer == 'conv':
            groups = kwargs.get('groups', 1)
        elif mixer_layer == 'depthwise_conv':
            groups = dim
        else:
            raise ValueError(f"Unsupported mixer layer: {mixer_layer}")
        self.mixer = Convlayer(dim, dim, groups=groups,
                               kernel_size=kernel_size,
                               pad_mode=kwargs.get('pad_mode', 'reflect'),
                               norm=kwargs.get('norm', 'none'),
                               causal=kwargs.get('causal', True),
                               bias=kwargs.get('bias', True),
                               )

        self.ffn = FFN(
            dim,
            kwargs.get('ffn_expansion', 4) * dim,
            bias=kwargs.get('bias', False),
        )
        # NOTE(review): torch.nn has no DropPath module, so drop_path > 0
        # raises AttributeError here. Stochastic depth would need e.g.
        # timm's DropPath — confirm the intended source before enabling it.
        self.drop_path = nn.Identity() if drop_path <= 0. else nn.modules.DropPath(drop_path)

        # LayerScale: learnable per-channel residual scaling (one scale per
        # branch), disabled when the init value is non-positive.
        if layer_scale_init_value > 0:
            self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            self.ffn_gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma = None
            self.ffn_gamma = None

    def forward(self, x):
        """Apply mixer and FFN sub-blocks, each with a residual connection.

        Args:
            x: Tensor of shape (batch, dim, time).

        Returns:
            Tensor of the same shape.
        """
        # mixer branch (channel-first layout)
        residual = x
        x = self.norm(x)
        x = self.mixer(x)
        if self.gamma is not None:
            x = x * self.gamma.unsqueeze(-1)
        x = residual + self.drop_path(x)

        # ffn branch — linear layers expect channel-last, hence the permutes
        residual = x
        x = self.ffn_norm(x)
        x = x.permute(0, 2, 1)
        x = self.ffn(x)
        x = x.permute(0, 2, 1)
        if self.ffn_gamma is not None:
            x = x * self.ffn_gamma.unsqueeze(-1)
        x = residual + self.drop_path(x)

        return x
687
+
688
+
689
class TokenizerEncoder(nn.Module):
    """
    Encoder component for the KugelAudio tokenizer that converts audio to latent representations.

    Structure: a stem convolution, then alternating (downsample conv, stage of
    Block1D layers), a final norm, and a head conv projecting to `dimension`.

    Args:
        config: Configuration object with model parameters
    """
    def __init__(self, config):
        super().__init__()

        # Extract parameters from config
        self.channels = config.channels
        self.dimension = config.dimension
        self.n_filters = config.n_filters
        # Ratios are reversed here; the decoder consumes them unreversed, so
        # encoder downsampling mirrors decoder upsampling order.
        self.ratios = list(reversed(config.ratios))
        self.depths = config.depths
        self.n_residual_layers = getattr(config, "n_residual_layers", 1)
        # Product of all stride ratios (overall temporal reduction factor).
        self.hop_length = np.prod(self.ratios)
        self.causal = config.causal

        # Additional config parameters with defaults
        kernel_size = getattr(config, "kernel_size", 7)
        last_kernel_size = getattr(config, "last_kernel_size", 7)
        norm = getattr(config, "norm", "none")
        norm_params = getattr(config, "norm_params", {})
        pad_mode = getattr(config, "pad_mode", "reflect")
        bias = getattr(config, "bias", True)
        layernorm = getattr(config, "layernorm", "LN")
        layernorm_eps = getattr(config, "layernorm_eps", 1e-6)
        layernorm_elementwise_affine = getattr(config, "layernorm_elementwise_affine", True)
        drop_path_rate = getattr(config, "drop_path_rate", 0.0)
        mixer_layer = getattr(config, "mixer_layer", "conv")
        layer_scale_init_value = getattr(config, "layer_scale_init_value", 0)
        disable_last_norm = getattr(config, "disable_last_norm", False)

        # determine the norm type based on layernorm
        if layernorm == 'LN':
            norm_type = ConvLayerNorm
        elif layernorm == 'RMSNorm':
            norm_type = partial(ConvRMSNorm, elementwise_affine=layernorm_elementwise_affine)
        else:
            raise ValueError(f"Unsupported norm type: {layernorm}")

        # stem and intermediate downsampling conv layers
        stem = nn.Sequential(
            SConv1d(self.channels, self.n_filters, kernel_size, norm=norm, norm_kwargs=norm_params, causal=self.causal, pad_mode=pad_mode, bias=bias),
        )

        # downsample_layers holds len(ratios) + 1 entries (stem + one per
        # ratio); forward_features indexes it by stage, so len(depths) is
        # presumably len(ratios) + 1 — TODO confirm against configs.
        self.downsample_layers = nn.ModuleList()
        self.downsample_layers.append(stem)
        for i in range(len(self.ratios)):
            in_ch = self.n_filters * (2 ** i)   # channels double at each level
            out_ch = self.n_filters * (2 ** (i + 1))
            downsample_layer = nn.Sequential(
                SConv1d(in_ch, out_ch, kernel_size=self.ratios[i] * 2, stride=self.ratios[i], causal=self.causal, pad_mode=pad_mode, norm=norm, bias=bias)
            )
            self.downsample_layers.append(downsample_layer)

        # configure the transformer blocks (all Block1D options pre-bound)
        layer_type = partial(
            Block1D,
            mixer_layer=mixer_layer,
            layernorm=layernorm,
            eps=layernorm_eps,
            causal=self.causal,
            pad_mode=pad_mode,
            norm=norm,
            bias=bias,
            layer_scale_init_value=layer_scale_init_value,
        )

        self.stages = nn.ModuleList()
        # Linearly increasing drop-path rates across all blocks.
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0

        for i in range(len(self.depths)):
            in_ch = self.n_filters * (2 ** i)
            stage = nn.Sequential(
                *[layer_type(dim=in_ch, drop_path=dp_rates[cur + j]) for j in range(self.depths[i])]
            )
            self.stages.append(stage)
            cur += self.depths[i]

        # Final norm over the deepest channel count (in_ch from the last
        # loop iteration), optionally disabled.
        if not disable_last_norm:
            self.norm = norm_type(in_ch, eps=layernorm_eps)
        else:
            self.norm = nn.Identity()
        self.head = SConv1d(in_ch, self.dimension, kernel_size=last_kernel_size, causal=self.causal, pad_mode=pad_mode, norm=norm, bias=bias)

    def forward_features(self, x, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Run downsampling + stages; threads the streaming cache into every
        SConv1d it can reach (unrolling Block1D.forward manually for that)."""
        for i in range(len(self.depths)):
            # Apply downsampling
            for layer in self.downsample_layers[i]:
                if isinstance(layer, SConv1d):
                    x = layer(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
                else:
                    x = layer(x)

            # Apply stage (Block1D contains Convlayer which contains SConv1d)
            for block in self.stages[i]:
                if hasattr(block, 'mixer') and hasattr(block.mixer, 'conv') and isinstance(block.mixer.conv, SConv1d):
                    # Block1D forward with cache support — this re-implements
                    # Block1D.forward inline (minus drop_path) so the cache
                    # kwargs can be passed to the inner SConv1d.
                    residual = x
                    x = block.norm(x)
                    x = block.mixer.conv(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
                    if block.gamma is not None:
                        x = x * block.gamma.unsqueeze(-1)
                    x = residual + x

                    # FFN part (channel-last for the linear layers)
                    residual = x
                    x = block.ffn_norm(x)
                    x = x.permute(0, 2, 1)
                    x = block.ffn(x)
                    x = x.permute(0, 2, 1)
                    if block.ffn_gamma is not None:
                        x = x * block.ffn_gamma.unsqueeze(-1)
                    x = residual + x
                else:
                    x = block(x)

        return self.norm(x)

    def forward(self, x, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Encode (batch, channels, time) audio to (batch, dimension, frames)."""
        x = self.forward_features(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        x = self.head(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return x
816
+
817
+
818
class TokenizerDecoder(nn.Module):
    """
    Decoder component for the KugelAudio tokenizer that converts latent representations back to audio.

    Mirror of TokenizerEncoder: a stem convolution, alternating (upsample
    transposed conv, stage of Block1D layers), a final norm, and a head conv
    projecting back to the waveform channel count.

    Args:
        config: Configuration object with model parameters
    """
    def __init__(self, config):
        super().__init__()

        # Extract parameters from config
        self.dimension = config.dimension
        self.channels = config.channels
        self.n_filters = config.n_filters
        self.ratios = config.ratios

        # IMPORTANT CHANGE: Don't reverse depths again since they're already reversed in KugelAudioAcousticTokenizerModel
        self.depths = config.depths  # Changed from list(reversed(config.depths))

        self.n_residual_layers = getattr(config, "n_residual_layers", 1)
        # Product of all stride ratios (overall temporal upsampling factor).
        self.hop_length = np.prod(self.ratios)
        self.causal = config.causal

        # Additional config parameters with defaults
        kernel_size = getattr(config, "kernel_size", 7)
        last_kernel_size = getattr(config, "last_kernel_size", 7)
        norm = getattr(config, "norm", "none")
        norm_params = getattr(config, "norm_params", {})
        pad_mode = getattr(config, "pad_mode", "reflect")
        bias = getattr(config, "bias", True)
        layernorm = getattr(config, "layernorm", "LN")
        layernorm_eps = getattr(config, "layernorm_eps", 1e-6)
        trim_right_ratio = getattr(config, "trim_right_ratio", 1.0)
        layernorm_elementwise_affine = getattr(config, "layernorm_elementwise_affine", True)
        drop_path_rate = getattr(config, "drop_path_rate", 0.0)
        mixer_layer = getattr(config, "mixer_layer", "conv")
        layer_scale_init_value = getattr(config, "layer_scale_init_value", 0)
        disable_last_norm = getattr(config, "disable_last_norm", False)

        # determine the norm type based on layernorm
        if layernorm == 'LN':
            norm_type = ConvLayerNorm
        elif layernorm == 'RMSNorm':
            norm_type = partial(ConvRMSNorm, elementwise_affine=layernorm_elementwise_affine)
        else:
            raise ValueError(f"Unsupported norm type: {layernorm}")

        # stem and upsampling layers — stem starts at the widest channel count
        # (n_filters * 2**(len(depths)-1)) and each level halves it.
        stem = nn.Sequential(
            SConv1d(self.dimension, self.n_filters * 2 ** (len(self.depths) - 1), kernel_size, norm=norm,
                    norm_kwargs=norm_params, causal=self.causal, pad_mode=pad_mode, bias=bias),
        )

        self.upsample_layers = nn.ModuleList()
        self.upsample_layers.append(stem)
        for i in range(len(self.ratios)):
            in_ch = self.n_filters * (2 ** (len(self.depths) - 1 - i))
            out_ch = self.n_filters * (2 ** (len(self.depths) - 1 - i - 1))
            upsample_layer = nn.Sequential(
                SConvTranspose1d(in_ch, out_ch,
                                 kernel_size=self.ratios[i] * 2, stride=self.ratios[i],
                                 norm=norm, norm_kwargs=norm_params, bias=bias,
                                 causal=self.causal, trim_right_ratio=trim_right_ratio),
            )
            self.upsample_layers.append(upsample_layer)

        # configure transformer blocks (all Block1D options pre-bound)
        layer_type = partial(
            Block1D,
            mixer_layer=mixer_layer,
            layernorm=layernorm,
            eps=layernorm_eps,
            causal=self.causal,
            pad_mode=pad_mode,
            norm=norm,
            bias=bias,
            layer_scale_init_value=layer_scale_init_value,
        )

        self.stages = nn.ModuleList()
        # Linearly increasing drop-path rates across all blocks.
        dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0

        # Create stages in the same order as the original model
        for i in range(len(self.depths)):
            in_ch = self.n_filters * (2 ** (len(self.depths) - 1 - i))
            stage = nn.Sequential(
                *[layer_type(dim=in_ch, drop_path=dp_rates[cur + j]) for j in range(self.depths[i])]
            )
            self.stages.append(stage)
            cur += self.depths[i]

        # Final norm over the narrowest channel count (in_ch from the last
        # loop iteration), optionally disabled.
        if not disable_last_norm:
            self.norm = norm_type(in_ch, eps=layernorm_eps)
        else:
            self.norm = nn.Identity()
        self.head = SConv1d(in_ch, self.channels, kernel_size=last_kernel_size, causal=self.causal, pad_mode=pad_mode, norm=norm, bias=bias)

    def forward_features(self, x, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Run upsampling + stages; threads the streaming cache into every
        SConv1d/SConvTranspose1d it can reach (unrolling Block1D.forward)."""
        for i in range(len(self.depths)):
            # Apply upsampling
            for layer in self.upsample_layers[i]:
                if isinstance(layer, (SConv1d, SConvTranspose1d)):
                    x = layer(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
                else:
                    x = layer(x)

            # Apply stage (Block1D contains Convlayer which contains SConv1d)
            for block in self.stages[i]:
                if hasattr(block, 'mixer') and hasattr(block.mixer, 'conv') and isinstance(block.mixer.conv, SConv1d):
                    # Block1D forward with cache support — re-implements
                    # Block1D.forward inline (minus drop_path) so the cache
                    # kwargs can be passed to the inner SConv1d.
                    residual = x
                    x = block.norm(x)
                    x = block.mixer.conv(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
                    if block.gamma is not None:
                        x = x * block.gamma.unsqueeze(-1)
                    x = residual + x

                    # FFN part (channel-last for the linear layers)
                    residual = x
                    x = block.ffn_norm(x)
                    x = x.permute(0, 2, 1)
                    x = block.ffn(x)
                    x = x.permute(0, 2, 1)
                    if block.ffn_gamma is not None:
                        x = x * block.ffn_gamma.unsqueeze(-1)
                    x = residual + x
                else:
                    x = block(x)

        return self.norm(x)

    def forward(self, x, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Decode (batch, dimension, frames) latents to (batch, channels, time) audio."""
        x = self.forward_features(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        x = self.head(x, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return x
954
+
955
+
956
@dataclass
class KugelAudioTokenizerEncoderOutput:
    """
    Output of KugelAudio tokenizer encoder, representing a Gaussian distribution with fixed variance.

    Args:
        mean (`torch.FloatTensor`): The mean parameters of the distribution.
        std (`float` or `torch.FloatTensor`): Fixed standard deviation value.
    """
    mean: torch.Tensor
    std: Optional[Union[float, torch.Tensor]] = None

    def sample(self, dist_type='fix'):
        """Draw a sample from the distribution.

        Args:
            dist_type (`str`): 'fix' uses the stored std directly; 'gaussian'
                first draws a per-batch-element std; any other value returns
                the mean unchanged.

        Returns:
            Tuple of (sample, std used for sampling).
        """
        if dist_type == 'fix':
            noise = torch.randn_like(self.mean)
            return self.mean + self.std * noise, self.std

        if dist_type == 'gaussian':
            # Randomize the std itself (one value per batch element, scaled
            # by std / 0.8), then sample with that std.
            scale = self.std / 0.8
            std = torch.randn(self.mean.size(0), device=self.mean.device, dtype=self.mean.dtype) * scale
            while std.dim() < self.mean.dim():
                std = std.unsqueeze(-1)
            noise = torch.randn_like(self.mean)
            return self.mean + std * noise, std

        # Fallback: deterministic — return the mean itself.
        return self.mean, self.std

    def kl(self):
        """KL surrogate vs. a standard normal: elementwise squared mean."""
        return F.mse_loss(self.mean, torch.zeros_like(self.mean), reduction='none')

    def mode(self):
        """Return the distribution mode (equal to the mean for a Gaussian)."""
        return self.mean
1003
+
1004
class KugelAudioAcousticTokenizerModel(PreTrainedModel):
    """KugelAudio speech tokenizer model combining encoder and decoder for acoustic tokens.

    Encodes waveforms to a Gaussian over VAE latents (fixed std), samples
    from it, and decodes samples back to audio. All public entry points run
    under ``torch.no_grad()``.
    """

    config_class = KugelAudioAcousticTokenizerConfig
    base_model_prefix = "kugelaudio_acoustic_tokenizer"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _no_split_modules = ["TokenizerEncoder", "TokenizerDecoder"]

    def __init__(self, config):
        super().__init__(config)

        # Fixed sampling std; non-persistent so it is not saved in checkpoints.
        self.register_buffer('fix_std', torch.tensor(config.fix_std), persistent=False)
        self.std_dist_type = getattr(config, "std_dist_type", "fix")

        # Parse encoder depths (either a "2-2-2" style string or a list)
        if isinstance(config.encoder_depths, str):
            encoder_depths = [int(d) for d in config.encoder_depths.split('-')]
        else:
            encoder_depths = config.encoder_depths

        # Parse decoder depths if provided
        if config.decoder_depths is not None and isinstance(config.decoder_depths, str):
            decoder_depths = [int(d) for d in config.decoder_depths.split('-')]
        else:
            # Default: use reversed encoder depths if decoder_depths is None
            decoder_depths = list(reversed(encoder_depths))

        # Create encoder config: a deep copy of the top-level config with
        # encoder-specific fields mapped onto the generic names that
        # TokenizerEncoder reads (dimension, n_filters, ratios, ...).
        encoder_config = copy.deepcopy(config)
        encoder_config.dimension = config.vae_dim
        encoder_config.n_filters = config.encoder_n_filters
        encoder_config.ratios = config.encoder_ratios
        encoder_config.depths = encoder_depths
        encoder_config.norm = config.conv_norm
        encoder_config.pad_mode = config.pad_mode
        encoder_config.bias = config.conv_bias
        encoder_config.layernorm_eps = config.layernorm_eps
        encoder_config.layernorm_elementwise_affine = config.layernorm_elementwise_affine
        encoder_config.mixer_layer = config.mixer_layer
        encoder_config.layer_scale_init_value = config.layer_scale_init_value
        encoder_config.disable_last_norm = config.disable_last_norm

        # Create decoder config (same mapping with the decoder-side fields)
        decoder_config = copy.deepcopy(config)
        decoder_config.dimension = config.vae_dim
        decoder_config.n_filters = config.decoder_n_filters
        decoder_config.ratios = config.decoder_ratios
        decoder_config.depths = decoder_depths
        decoder_config.norm = config.conv_norm
        decoder_config.pad_mode = config.pad_mode
        decoder_config.bias = config.conv_bias
        decoder_config.layernorm_eps = config.layernorm_eps
        decoder_config.layernorm_elementwise_affine = config.layernorm_elementwise_affine
        decoder_config.mixer_layer = config.mixer_layer
        decoder_config.layer_scale_init_value = config.layer_scale_init_value
        decoder_config.disable_last_norm = config.disable_last_norm

        # Initialize encoder and decoder
        self.encoder = TokenizerEncoder(encoder_config)
        self.decoder = TokenizerDecoder(decoder_config)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize weights: normal(0, weight_init_value) for Linear/Conv1d
        weights, ones/zeros for LayerNorm, zeros for all biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, std=self.config.weight_init_value)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Conv1d):
            nn.init.normal_(module.weight, std=self.config.weight_init_value)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    @torch.no_grad()
    def encode(self, audio, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Convert audio to latent representations.

        Returns a KugelAudioTokenizerEncoderOutput whose mean is permuted to
        (batch, frames, vae_dim) and whose std is the fixed buffer.
        """
        latents = self.encoder(audio, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return KugelAudioTokenizerEncoderOutput(mean=latents.permute(0, 2, 1), std=self.fix_std)

    @torch.no_grad()
    def sampling(self, encoder_output, dist_type=None):
        """Sample from the encoder output distribution.

        Args:
            encoder_output: KugelAudioTokenizerEncoderOutput from encode().
            dist_type: 'fix' or 'gaussian'; defaults to config's std_dist_type.
        """
        dist_type = dist_type or self.std_dist_type

        if dist_type == 'fix':
            return encoder_output.sample(dist_type='fix')
        elif dist_type == 'gaussian':
            return encoder_output.sample(dist_type='gaussian')
        else:
            raise ValueError(f"Unsupported dist_type: {dist_type}, expected 'fix' or 'gaussian'")

    @torch.no_grad()
    def decode(self, latents, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Convert latent representations back to audio.

        Accepts latents as (batch, vae_dim, frames) or (batch, frames,
        vae_dim); the latter is permuted to channel-first. NOTE(review): this
        check is ambiguous when frames == vae_dim — confirm callers' layout.
        """
        if latents.shape[1] == self.config.vae_dim:
            pass
        else:
            latents = latents.permute(0, 2, 1)

        audio = self.decoder(latents, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return audio

    def forward(self, audio, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Full forward pass: encode audio to latents, then decode back to audio.

        Returns:
            Tuple of (reconstructed audio, sampled latents).
        """
        encoder_output = self.encode(audio, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        sampled_latents, _ = self.sampling(encoder_output)
        reconstructed = self.decode(sampled_latents, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return reconstructed, sampled_latents
1118
+
1119
+
1120
class KugelAudioSemanticTokenizerModel(PreTrainedModel):
    """KugelAudio speech tokenizer model with only encoder for semantic tokens.

    Encoder-only variant: produces deterministic latents (no std, no decoder).
    """

    config_class = KugelAudioSemanticTokenizerConfig
    base_model_prefix = "kugelaudio_semantic_tokenizer"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _no_split_modules = ["TokenizerEncoder"]

    def __init__(self, config):
        super().__init__(config)

        # Parse encoder depths (either a "2-2-2" style string or a list)
        if isinstance(config.encoder_depths, str):
            encoder_depths = [int(d) for d in config.encoder_depths.split('-')]
        else:
            encoder_depths = config.encoder_depths

        # Create encoder config: a deep copy of the top-level config with
        # encoder-specific fields mapped onto the generic names that
        # TokenizerEncoder reads (dimension, n_filters, ratios, ...).
        encoder_config = copy.deepcopy(config)
        encoder_config.dimension = config.vae_dim
        encoder_config.n_filters = config.encoder_n_filters
        encoder_config.ratios = config.encoder_ratios
        encoder_config.depths = encoder_depths
        encoder_config.norm = config.conv_norm
        encoder_config.pad_mode = config.pad_mode
        encoder_config.bias = config.conv_bias
        encoder_config.layernorm_eps = config.layernorm_eps
        encoder_config.layernorm_elementwise_affine = config.layernorm_elementwise_affine
        encoder_config.mixer_layer = config.mixer_layer
        encoder_config.layer_scale_init_value = config.layer_scale_init_value
        encoder_config.disable_last_norm = config.disable_last_norm

        # Initialize encoder and decoder
        self.encoder = TokenizerEncoder(encoder_config)

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize weights: normal(0, weight_init_value) for Linear/Conv1d
        weights, ones/zeros for LayerNorm, zeros for all biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, std=self.config.weight_init_value)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Conv1d):
            nn.init.normal_(module.weight, std=self.config.weight_init_value)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    @torch.no_grad()
    def encode(self, audio, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Convert audio to latent representations (mean only, std left None)."""
        latents = self.encoder(audio, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        return KugelAudioTokenizerEncoderOutput(mean=latents.permute(0, 2, 1))

    @torch.no_grad()
    def sampling(self, encoder_output, dist_type=None):
        """Sample from the encoder output distribution.

        NOTE(review): the dist_type argument is ignored; sampling is always
        deterministic ('none' returns the mean unchanged).
        """
        return encoder_output.sample(dist_type='none')

    def forward(self, audio, cache=None, sample_indices=None, use_cache=False, debug=False):
        """Full forward pass: encode audio to latents.

        Returns:
            Tuple of (None, latents) — first slot mirrors the acoustic
            model's (reconstruction, latents) return shape.
        """
        encoder_output = self.encode(audio, cache=cache, sample_indices=sample_indices, use_cache=use_cache, debug=debug)
        sampled_latents, _ = self.sampling(encoder_output, dist_type='none')
        return None, sampled_latents
1189
+
1190
# Make both tokenizers loadable via transformers.AutoModel.from_pretrained
# when their config classes are encountered.
AutoModel.register(KugelAudioAcousticTokenizerConfig, KugelAudioAcousticTokenizerModel)
AutoModel.register(KugelAudioSemanticTokenizerConfig, KugelAudioSemanticTokenizerModel)

# Public API of this module.
__all__ = [
    "KugelAudioTokenizerStreamingCache",
    "KugelAudioAcousticTokenizerModel",
    "KugelAudioSemanticTokenizerModel",
]
kugelaudio_open/processors/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """Processors for KugelAudio text and audio handling."""
2
+
3
+ from kugelaudio_open.processors.audio_processor import AudioProcessor, AudioNormalizer
4
+ from kugelaudio_open.processors.kugelaudio_processor import KugelAudioProcessor
5
+
6
+ __all__ = [
7
+ "AudioProcessor",
8
+ "AudioNormalizer",
9
+ "KugelAudioProcessor",
10
+ ]
kugelaudio_open/processors/audio_processor.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio processing utilities for KugelAudio."""
2
+
3
+ import os
4
+ from typing import Optional, Union, List, Dict, Any
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from transformers.feature_extraction_utils import FeatureExtractionMixin
10
+ from transformers.utils import logging
11
+
12
+ logger = logging.get_logger(__name__)
13
+
14
+
15
class AudioNormalizer:
    """Loudness normalizer that brings audio to a target dB FS level.

    Scaling to a fixed RMS level gives the model consistent input levels;
    a final safety step rescales the signal if the result would clip.
    """

    def __init__(self, target_dB_FS: float = -25, eps: float = 1e-6):
        self.target_dB_FS = target_dB_FS
        self.eps = eps

    def normalize_db(self, audio: np.ndarray) -> tuple:
        """Scale audio so its RMS sits at the target dB FS level.

        Returns:
            (scaled_audio, rms, scalar) where scalar is the gain applied.
        """
        rms = np.sqrt(np.mean(audio ** 2))
        target_linear = 10 ** (self.target_dB_FS / 20)
        scalar = target_linear / (rms + self.eps)
        return audio * scalar, rms, scalar

    def avoid_clipping(self, audio: np.ndarray) -> tuple:
        """Rescale into [-1, 1] if any sample exceeds full scale.

        Returns:
            (audio, scalar) where scalar is the divisor used (1.0 if none).
        """
        peak = np.max(np.abs(audio))
        if peak > 1.0:
            divisor = peak + self.eps
            return audio / divisor, divisor
        return audio, 1.0

    def __call__(self, audio: np.ndarray) -> np.ndarray:
        """Normalize to the target level, then guard against clipping."""
        leveled, _, _ = self.normalize_db(audio)
        safe, _ = self.avoid_clipping(leveled)
        return safe
45
+
46
+
47
+ class AudioProcessor(FeatureExtractionMixin):
48
+ """Processor for audio preprocessing and postprocessing.
49
+
50
+ Handles:
51
+ - Audio format conversion (stereo to mono)
52
+ - Normalization
53
+ - Loading from various file formats
54
+ - Saving to WAV files
55
+
56
+ Example:
57
+ >>> processor = AudioProcessor(sampling_rate=24000)
58
+ >>> audio = processor("path/to/audio.wav")
59
+ >>> processor.save_audio(generated_audio, "output.wav")
60
+ """
61
+
62
+ model_input_names = ["input_features"]
63
+
64
+ def __init__(
65
+ self,
66
+ sampling_rate: int = 24000,
67
+ normalize_audio: bool = True,
68
+ target_dB_FS: float = -25,
69
+ eps: float = 1e-6,
70
+ **kwargs,
71
+ ):
72
+ super().__init__(**kwargs)
73
+
74
+ self.sampling_rate = sampling_rate
75
+ self.normalize_audio = normalize_audio
76
+ self.normalizer = AudioNormalizer(target_dB_FS, eps) if normalize_audio else None
77
+
78
+ self.feature_extractor_dict = {
79
+ "sampling_rate": sampling_rate,
80
+ "normalize_audio": normalize_audio,
81
+ "target_dB_FS": target_dB_FS,
82
+ "eps": eps,
83
+ }
84
+
85
+ def _ensure_mono(self, audio: np.ndarray) -> np.ndarray:
86
+ """Convert stereo to mono if needed."""
87
+ if len(audio.shape) == 1:
88
+ return audio
89
+ elif len(audio.shape) == 2:
90
+ if audio.shape[0] == 2:
91
+ return np.mean(audio, axis=0)
92
+ elif audio.shape[1] == 2:
93
+ return np.mean(audio, axis=1)
94
+ elif audio.shape[0] == 1:
95
+ return audio.squeeze(0)
96
+ elif audio.shape[1] == 1:
97
+ return audio.squeeze(1)
98
+ else:
99
+ raise ValueError(f"Unexpected audio shape: {audio.shape}")
100
+ else:
101
+ raise ValueError(f"Audio should be 1D or 2D, got shape: {audio.shape}")
102
+
103
+ def _process_single(self, audio: Union[np.ndarray, List[float]]) -> np.ndarray:
104
+ """Process a single audio array."""
105
+ if not isinstance(audio, np.ndarray):
106
+ audio = np.array(audio, dtype=np.float32)
107
+ else:
108
+ audio = audio.astype(np.float32)
109
+
110
+ audio = self._ensure_mono(audio)
111
+
112
+ if self.normalize_audio and self.normalizer:
113
+ audio = self.normalizer(audio)
114
+
115
+ return audio
116
+
117
+ def _load_from_path(self, audio_path: str) -> np.ndarray:
118
+ """Load audio from file path."""
119
+ ext = os.path.splitext(audio_path)[1].lower()
120
+
121
+ if ext in [".wav", ".mp3", ".flac", ".m4a", ".ogg"]:
122
+ import librosa
123
+ audio, _ = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
124
+ return audio
125
+ elif ext == ".pt":
126
+ tensor = torch.load(audio_path, map_location="cpu", weights_only=True).squeeze()
127
+ return tensor.numpy().astype(np.float32)
128
+ elif ext == ".npy":
129
+ return np.load(audio_path).astype(np.float32)
130
+ else:
131
+ raise ValueError(f"Unsupported format: {ext}")
132
+
133
    def __call__(
        self,
        audio: Union[str, np.ndarray, List[float], List[np.ndarray], List[str]] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Process audio input(s).

        Args:
            audio: Audio input - path, array, or list of either
            sampling_rate: Input sampling rate (for validation only; no
                resampling is performed here)
            return_tensors: Return format ("pt" for PyTorch, "np" for NumPy)

        Returns:
            Dictionary with key "audio". For "pt"/"np" the value has shape
            (batch, 1, samples); otherwise it is the processed array itself
            (or a list of arrays for batched input).

        Raises:
            ValueError: If `audio` is None.
        """
        if audio is None:
            raise ValueError("Audio input is required")

        # Mismatched rates only produce a warning — callers must resample.
        if sampling_rate is not None and sampling_rate != self.sampling_rate:
            logger.warning(
                f"Input sampling rate ({sampling_rate}) differs from expected ({self.sampling_rate}). "
                "Please resample your audio."
            )

        # Handle different input types
        if isinstance(audio, str):
            audio = self._load_from_path(audio)
            is_batched = False
        elif isinstance(audio, list):
            if all(isinstance(item, str) for item in audio):
                audio = [self._load_from_path(p) for p in audio]
                is_batched = True
            else:
                # A list whose first element is array-like is a batch;
                # a flat list of floats counts as one sample.
                is_batched = isinstance(audio[0], (np.ndarray, list))
        else:
            is_batched = False

        # Process: each sample becomes a mono float32 array (normalized
        # when the processor was built with normalize_audio=True).
        if is_batched:
            processed = [self._process_single(a) for a in audio]
        else:
            processed = [self._process_single(audio)]

        # Convert to tensors with an explicit channel dim: (batch, 1, samples).
        # NOTE(review): stacking batched inputs assumes equal lengths — no
        # padding is applied here; confirm callers pre-pad if necessary.
        if return_tensors == "pt":
            if len(processed) == 1:
                features = torch.from_numpy(processed[0]).unsqueeze(0).unsqueeze(1)
            else:
                features = torch.stack([torch.from_numpy(a) for a in processed]).unsqueeze(1)
        elif return_tensors == "np":
            if len(processed) == 1:
                features = processed[0][np.newaxis, np.newaxis, :]
            else:
                features = np.stack(processed)[:, np.newaxis, :]
        else:
            features = processed[0] if len(processed) == 1 else processed

        return {"audio": features}
193
+
194
    def save_audio(
        self,
        audio: Union[torch.Tensor, np.ndarray, List],
        output_path: str = "output.wav",
        sampling_rate: Optional[int] = None,
        normalize: bool = False,
        batch_prefix: str = "audio_",
    ) -> List[str]:
        """Save audio to WAV file(s).

        Args:
            audio: Audio data to save (tensor, array, or list of either).
                Lists and 3D+ arrays with a leading batch dim > 1 are treated
                as batches; `output_path` is then used as a directory.
            output_path: Output file path (or directory for batched audio)
            sampling_rate: Sampling rate (defaults to processor's rate)
            normalize: Whether to peak-normalize each item before saving
            batch_prefix: Filename prefix for batch files

        Returns:
            List of saved file paths
        """
        # Imported lazily so soundfile is only required when saving.
        import soundfile as sf

        if sampling_rate is None:
            sampling_rate = self.sampling_rate

        # Convert to numpy; tensors may be on GPU or in half precision.
        if isinstance(audio, torch.Tensor):
            audio_np = audio.float().detach().cpu().numpy()
        elif isinstance(audio, list):
            if all(isinstance(a, torch.Tensor) for a in audio):
                audio_np = [a.float().detach().cpu().numpy() for a in audio]
            else:
                # NOTE(review): a mixed list (tensors + arrays) passes through
                # unchanged and any tensors in it reach sf.write as-is.
                audio_np = audio
        else:
            audio_np = audio

        saved_paths = []

        if isinstance(audio_np, list):
            # Batched list input: one file per item, inside output_path dir.
            os.makedirs(output_path, exist_ok=True)
            for i, item in enumerate(audio_np):
                item = self._prepare_for_save(item, normalize)
                path = os.path.join(output_path, f"{batch_prefix}{i}.wav")
                sf.write(path, item, sampling_rate)
                saved_paths.append(path)
        elif len(audio_np.shape) >= 3 and audio_np.shape[0] > 1:
            # Batched array input, e.g. (batch, 1, samples) with batch > 1.
            os.makedirs(output_path, exist_ok=True)
            for i in range(audio_np.shape[0]):
                item = audio_np[i].squeeze()
                item = self._prepare_for_save(item, normalize)
                path = os.path.join(output_path, f"{batch_prefix}{i}.wav")
                sf.write(path, item, sampling_rate)
                saved_paths.append(path)
        else:
            # Single clip: output_path is the target filename itself.
            item = self._prepare_for_save(audio_np.squeeze(), normalize)
            sf.write(output_path, item, sampling_rate)
            saved_paths.append(output_path)

        return saved_paths
253
+
254
+ def _prepare_for_save(self, audio: np.ndarray, normalize: bool) -> np.ndarray:
255
+ """Prepare audio for saving."""
256
+ if len(audio.shape) > 1 and audio.shape[0] == 1:
257
+ audio = audio.squeeze(0)
258
+
259
+ if normalize:
260
+ max_val = np.abs(audio).max()
261
+ if max_val > 0:
262
+ audio = audio / max_val
263
+
264
+ return audio
265
+
266
    def to_dict(self) -> Dict[str, Any]:
        """Return the serializable config captured at construction time.

        NOTE: this returns the internal dict itself, not a copy — callers
        that mutate it will change the processor's recorded configuration.
        """
        return self.feature_extractor_dict
kugelaudio_open/processors/kugelaudio_processor.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main processor for KugelAudio combining text and audio processing."""
2
+
3
+ import json
4
+ import math
5
+ import os
6
+ from typing import Any, Dict, List, Optional, Union
7
+
8
+ import numpy as np
9
+ import torch
10
+ from kugelaudio_open.processors.audio_processor import AudioNormalizer, AudioProcessor
11
+ from transformers.tokenization_utils_base import (
12
+ BatchEncoding,
13
+ PaddingStrategy,
14
+ TruncationStrategy,
15
+ )
16
+ from transformers.utils import TensorType, cached_file, logging
17
+
18
+ logger = logging.get_logger(__name__)
19
+
20
+
21
class KugelAudioProcessor:
    """Combined processor for KugelAudio text and audio.

    Wraps a text tokenizer and audio processor into a single interface
    for preparing inputs for KugelAudio models.

    Example:
        >>> processor = KugelAudioProcessor.from_pretrained("kugelaudio/kugelaudio-0-open")
        >>> inputs = processor(text="Hello world", voice_prompt=voice_audio)
    """

    def __init__(
        self,
        tokenizer=None,
        audio_processor: Optional[AudioProcessor] = None,
        speech_compression_ratio: int = 3200,
        db_normalize: bool = True,
        **kwargs,
    ):
        """Create a processor from its two sub-components.

        Args:
            tokenizer: Text tokenizer (expected to expose `encode`,
                `decode`, `batch_decode` and `pad_id`).
            audio_processor: Audio feature extractor; a default
                AudioProcessor is created when None.
            speech_compression_ratio: Raw audio samples represented by one
                speech token (3200 samples per token at 24 kHz).
            db_normalize: Whether to dB-FS-normalize voice prompts.
        """
        self.tokenizer = tokenizer
        self.audio_processor = audio_processor or AudioProcessor()
        self.speech_compression_ratio = speech_compression_ratio
        self.db_normalize = db_normalize
        self.audio_normalizer = AudioNormalizer() if db_normalize else None

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """Load processor from pretrained model.

        Args:
            pretrained_model_name_or_path: Model ID or local path

        Returns:
            KugelAudioProcessor instance
        """
        from kugelaudio_open.processors.text_tokenizer import KugelAudioTextTokenizer

        # Try to load config: local path first, then hub download.
        config_path = os.path.join(pretrained_model_name_or_path, "preprocessor_config.json")
        config = None

        if os.path.exists(config_path):
            with open(config_path, "r") as f:
                config = json.load(f)
        else:
            try:
                config_file = cached_file(
                    pretrained_model_name_or_path, "preprocessor_config.json", **kwargs
                )
                with open(config_file, "r") as f:
                    config = json.load(f)
            except Exception as e:
                # Best-effort fallback to defaults when no config is found.
                logger.warning(f"Could not load config: {e}. Using defaults.")
                config = {
                    "speech_compression_ratio": 3200,
                    "db_normalize": True,
                }

        # Extract parameters
        speech_compression_ratio = config.get("speech_compression_ratio", 3200)
        db_normalize = config.get("db_normalize", True)

        # Load tokenizer; config value takes precedence over the kwarg.
        # NOTE(review): when the config provides the name, the kwarg is not
        # popped and would still be forwarded to the tokenizer below.
        lm_name = config.get("language_model_pretrained_name") or kwargs.pop(
            "language_model_pretrained_name", "Qwen/Qwen2.5-1.5B"
        )
        logger.info(f"Loading tokenizer from {lm_name}")
        tokenizer = KugelAudioTextTokenizer.from_pretrained(lm_name, **kwargs)

        # Load audio processor
        if "audio_processor" in config:
            audio_config = config["audio_processor"]
            audio_processor = AudioProcessor(
                sampling_rate=audio_config.get("sampling_rate", 24000),
                normalize_audio=audio_config.get("normalize_audio", True),
                target_dB_FS=audio_config.get("target_dB_FS", -25),
            )
        else:
            audio_processor = AudioProcessor()

        return cls(
            tokenizer=tokenizer,
            audio_processor=audio_processor,
            speech_compression_ratio=speech_compression_ratio,
            db_normalize=db_normalize,
        )

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        """Save processor configuration as preprocessor_config.json.

        Only the processor config is written; the tokenizer is not saved here.
        """
        os.makedirs(save_directory, exist_ok=True)

        config = {
            "processor_class": "KugelAudioProcessor",
            "speech_compression_ratio": self.speech_compression_ratio,
            "db_normalize": self.db_normalize,
            "audio_processor": {
                "feature_extractor_type": "AudioProcessor",
                # getattr defaults guard against audio processors that do not
                # expose these attributes.
                "sampling_rate": getattr(self.audio_processor, "sampling_rate", 24000),
                "normalize_audio": getattr(self.audio_processor, "normalize_audio", True),
                "target_dB_FS": getattr(self.audio_processor, "target_dB_FS", -25),
            },
        }

        config_path = os.path.join(save_directory, "preprocessor_config.json")
        with open(config_path, "w") as f:
            json.dump(config, f, indent=2)

        logger.info(f"Processor saved to {config_path}")

    def __call__(
        self,
        text: Optional[str] = None,
        voice_prompt: Optional[Union[np.ndarray, torch.Tensor, str]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """Process text and optional voice prompt.

        Args:
            text: Input text to synthesize
            voice_prompt: Voice prompt audio for speaker identity (raw audio tensor or path)
            padding: Padding strategy (NOTE(review): accepted but currently
                not applied in this method)
            truncation: Truncation strategy (currently not applied)
            max_length: Maximum sequence length (currently not applied)
            return_tensors: Return format

        Returns:
            BatchEncoding with processed inputs including speech_input_mask for voice cloning
        """
        if text is None:
            raise ValueError("Text input is required")

        # Special token IDs — hard-coded Qwen2.5 vocab IDs; these must stay
        # in sync with KugelAudioTextTokenizer's special tokens.
        speech_start_id = 151652  # <|vision_start|> repurposed for speech
        speech_diffusion_id = 151654  # VAE token used as placeholder

        # Format text with proper template
        # Add speaker prefix if not present (use Speaker 0 to match training format)
        formatted_text = text.strip()
        if not formatted_text.startswith("Speaker"):
            formatted_text = f"Speaker 0: {formatted_text}"

        # Build the full prompt template matching the training format
        system_prompt = " Transform the text provided by various speakers into speech output, utilizing the distinct voice of each respective speaker.\n"

        # Start building tokens and speech_input_mask. speech_input_mask is
        # True at positions whose embeddings will be replaced by speech
        # embeddings (the placeholder VAE tokens), False for plain text.
        full_tokens = []
        speech_input_mask = []
        voice_audio = None

        # System prompt tokens
        system_tokens = self.tokenizer.encode(system_prompt, add_special_tokens=False)
        full_tokens.extend(system_tokens)
        speech_input_mask.extend([False] * len(system_tokens))

        # Process voice prompt if provided
        if voice_prompt is not None:
            # Load audio if it's a path
            if isinstance(voice_prompt, str):
                voice_audio = self.audio_processor._load_from_path(voice_prompt)
                if self.db_normalize and self.audio_normalizer:
                    voice_audio = self.audio_normalizer(voice_audio)
            elif isinstance(voice_prompt, np.ndarray):
                voice_audio = voice_prompt.astype(np.float32)
            elif isinstance(voice_prompt, torch.Tensor):
                voice_audio = voice_prompt.cpu().numpy()
                if voice_audio.ndim > 1:
                    voice_audio = voice_audio.squeeze()
                voice_audio = voice_audio.astype(np.float32)

            # Voice input section with placeholder tokens
            voice_input_tokens = self.tokenizer.encode(" Voice input:\n", add_special_tokens=False)
            full_tokens.extend(voice_input_tokens)
            speech_input_mask.extend([False] * len(voice_input_tokens))

            # Speaker prefix for voice
            speaker_prefix = self.tokenizer.encode(" Speaker 0:", add_special_tokens=False)
            full_tokens.extend(speaker_prefix)
            speech_input_mask.extend([False] * len(speaker_prefix))

            # Calculate number of VAE tokens needed based on audio length
            # compression ratio is typically 3200 samples per token at 24kHz
            num_voice_tokens = math.ceil(len(voice_audio) / self.speech_compression_ratio)

            # Add placeholder VAE tokens that will be replaced with speech embeddings
            full_tokens.extend([speech_diffusion_id] * num_voice_tokens)
            speech_input_mask.extend([True] * num_voice_tokens)  # These positions get speech embeddings

            # Newline after voice
            newline_tokens = self.tokenizer.encode("\n", add_special_tokens=False)
            full_tokens.extend(newline_tokens)
            speech_input_mask.extend([False] * len(newline_tokens))

        # Text input section
        text_input_tokens = self.tokenizer.encode(" Text input:\n", add_special_tokens=False)
        full_tokens.extend(text_input_tokens)
        speech_input_mask.extend([False] * len(text_input_tokens))

        # Speaker text
        speaker_text_tokens = self.tokenizer.encode(f" {formatted_text}\n", add_special_tokens=False)
        full_tokens.extend(speaker_text_tokens)
        speech_input_mask.extend([False] * len(speaker_text_tokens))

        # Speech output section
        speech_output_tokens = self.tokenizer.encode(" Speech output:\n", add_special_tokens=False)
        full_tokens.extend(speech_output_tokens)
        speech_input_mask.extend([False] * len(speech_output_tokens))

        # Add speech_start token
        full_tokens.append(speech_start_id)
        speech_input_mask.append(False)

        result = BatchEncoding()
        result["text_ids"] = full_tokens
        result["speech_input_mask"] = speech_input_mask

        if return_tensors == "pt":
            result["text_ids"] = torch.tensor([full_tokens], dtype=torch.long)
            result["speech_input_mask"] = torch.tensor([speech_input_mask], dtype=torch.bool)

        # Include processed voice audio for the model to encode
        if voice_audio is not None:
            if return_tensors == "pt":
                # Shape (1, 1, samples): batch and channel dims added.
                result["speech_tensors"] = torch.tensor(voice_audio, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
                # Create speech_masks (all True for the voice frames)
                num_frames = math.ceil(len(voice_audio) / self.speech_compression_ratio)
                result["speech_masks"] = torch.ones(1, num_frames, dtype=torch.bool)
            else:
                result["speech_tensors"] = voice_audio
                num_frames = math.ceil(len(voice_audio) / self.speech_compression_ratio)
                result["speech_masks"] = [True] * num_frames

        return result

    def process_with_cached_prompt(
        self,
        text: str,
        cached_prompt: Dict[str, Any],
        return_tensors: Optional[Union[str, TensorType]] = "pt",
        **kwargs,
    ) -> BatchEncoding:
        """Process text with pre-computed voice prompt cache.

        Args:
            text: Input text to synthesize
            cached_prompt: Pre-computed KV cache from voice prompt; must
                contain "lm" and "tts_lm" entries with "last_hidden_state"
            return_tensors: Return format

        Returns:
            BatchEncoding ready for generation
        """
        script_tokens = self.tokenizer.encode(text.strip() + "\n", add_special_tokens=False)

        # Sequence lengths already encoded in the cache; the pseudo inputs
        # below only need to match these lengths.
        lm_length = cached_prompt["lm"]["last_hidden_state"].size(1)
        tts_lm_length = cached_prompt["tts_lm"]["last_hidden_state"].size(1)

        # Create pseudo input IDs (pad tokens act as placeholders — the real
        # context comes from the cached hidden states).
        input_ids = [self.tokenizer.pad_id] * lm_length
        tts_lm_input_ids = [self.tokenizer.pad_id] * tts_lm_length
        speech_input_mask = [False] * tts_lm_length

        result = BatchEncoding()

        if return_tensors == "pt":
            result["input_ids"] = torch.tensor([input_ids], dtype=torch.long)
            result["tts_lm_input_ids"] = torch.tensor([tts_lm_input_ids], dtype=torch.long)
            result["tts_text_ids"] = torch.tensor([script_tokens], dtype=torch.long)
            result["attention_mask"] = torch.ones(1, lm_length, dtype=torch.long)
            result["tts_lm_attention_mask"] = torch.ones(1, tts_lm_length, dtype=torch.long)
            result["speech_input_mask"] = torch.tensor([speech_input_mask], dtype=torch.bool)
        else:
            result["input_ids"] = [input_ids]
            result["tts_lm_input_ids"] = [tts_lm_input_ids]
            result["tts_text_ids"] = [script_tokens]
            result["attention_mask"] = [[1] * lm_length]
            result["tts_lm_attention_mask"] = [[1] * tts_lm_length]
            result["speech_input_mask"] = [speech_input_mask]

        return result

    def prepare_speech_inputs(
        self,
        speech_inputs: List[np.ndarray],
        return_tensors: Optional[Union[str, TensorType]] = None,
        device: Optional[Union[str, torch.device]] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> Dict[str, Any]:
        """Prepare speech inputs for model.

        Args:
            speech_inputs: List of speech arrays
            return_tensors: Return format
            device: Device to place tensors
            dtype: Data type for tensors

        Returns:
            Dictionary with padded speeches and masks
        """
        if not speech_inputs:
            return {"padded_speeches": None, "speech_masks": None}

        # Calculate sequence lengths (in speech tokens, not raw samples).
        seq_lens = [math.ceil(s.shape[0] / self.speech_compression_ratio) for s in speech_inputs]
        max_speech_len = max(s.shape[0] for s in speech_inputs)

        # Pad speeches: zero-pad raw audio to the longest clip; masks mark
        # valid token frames per clip.
        padded = np.zeros((len(speech_inputs), max_speech_len), dtype=np.float32)
        masks = np.zeros((len(speech_inputs), max(seq_lens)), dtype=np.bool_)

        for i, (speech, seq_len) in enumerate(zip(speech_inputs, seq_lens)):
            padded[i, : len(speech)] = speech
            masks[i, :seq_len] = True

        result = {"padded_speeches": padded, "speech_masks": masks}

        if return_tensors == "pt":
            result["padded_speeches"] = torch.tensor(
                padded, device=device, dtype=dtype or torch.float32
            )
            result["speech_masks"] = torch.tensor(masks, device=device, dtype=torch.bool)

        return result

    def batch_decode(self, *args, **kwargs):
        """Decode token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Decode token IDs to text (delegates to the tokenizer)."""
        return self.tokenizer.decode(*args, **kwargs)

    def save_audio(self, audio, output_path: str = "output.wav", **kwargs) -> List[str]:
        """Save generated audio to file (delegates to the audio processor)."""
        return self.audio_processor.save_audio(audio, output_path, **kwargs)

    @property
    def model_input_names(self) -> List[str]:
        """Return list of model input names (deduplicated, order-preserving)."""
        tokenizer_names = getattr(self.tokenizer, "model_input_names", [])
        audio_names = getattr(self.audio_processor, "model_input_names", [])
        return list(
            dict.fromkeys(tokenizer_names + audio_names + ["speech_inputs", "speech_input_mask"])
        )
+ )
kugelaudio_open/processors/text_tokenizer.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Text tokenizer for KugelAudio based on Qwen2."""
2
+
3
+ from typing import List, Optional
4
+
5
+ from transformers.utils import logging
6
+ from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
7
+
8
+ logger = logging.get_logger(__name__)
9
+
10
+
11
class KugelAudioTextTokenizer(Qwen2TokenizerFast):
    """Text tokenizer for KugelAudio with speech special tokens.

    Based on Qwen2 tokenizer with additional tokens for speech synthesis:
    - speech_start: Marks the beginning of speech generation
    - speech_end: Marks the end of speech generation
    - speech_diffusion: Placeholder for diffusion tokens

    Example:
        >>> tokenizer = KugelAudioTextTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")
        >>> tokens = tokenizer.encode("Hello world")
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs,
    ):
        """Initialize the tokenizer and register the speech special tokens."""
        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        self._add_speech_special_tokens()

    def _add_speech_special_tokens(self):
        """Add KugelAudio-specific special tokens for speech."""
        special_tokens = {
            "additional_special_tokens": [
                "<|vision_start|>",  # Speech start (reusing vision tokens for compatibility)
                "<|vision_end|>",  # Speech end
                "<|vision_pad|>",  # Speech diffusion pad
            ]
        }
        self.add_special_tokens(special_tokens)

        # Cache special token IDs
        self._speech_start_id = self.convert_tokens_to_ids("<|vision_start|>")
        self._speech_end_id = self.convert_tokens_to_ids("<|vision_end|>")
        self._speech_diffusion_id = self.convert_tokens_to_ids("<|vision_pad|>")
        self._eos_id = self.eos_token_id
        # <|image_pad|> is not registered above — presumably it already
        # exists in the Qwen2.5 vocabulary; verify against the base model.
        self._pad_id = self.convert_tokens_to_ids("<|image_pad|>")

    @property
    def eos_id(self) -> int:
        """End of sequence token ID."""
        return self._eos_id

    @property
    def speech_start_id(self) -> int:
        """Speech start token ID."""
        return self._speech_start_id

    @property
    def speech_end_id(self) -> int:
        """Speech end token ID."""
        return self._speech_end_id

    @property
    def speech_diffusion_id(self) -> int:
        """Speech diffusion placeholder token ID."""
        return self._speech_diffusion_id

    @property
    def pad_id(self) -> int:
        """Padding token ID (the `<|image_pad|>` token's vocab ID)."""
        return self._pad_id
kugelaudio_open/schedule/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """KugelAudio scheduling components."""
2
+
3
+ from .dpm_solver import DPMSolverMultistepScheduler
4
+
5
+ __all__ = ["DPMSolverMultistepScheduler"]
kugelaudio_open/schedule/dpm_solver.py ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
16
+
17
+ import math
18
+ from typing import List, Optional, Tuple, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.utils import deprecate
25
+ from diffusers.utils.torch_utils import randn_tensor
26
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
27
+
28
+ def betas_for_alpha_bar(
29
+ num_diffusion_timesteps,
30
+ max_beta=0.999,
31
+ alpha_transform_type="cosine",
32
+ ):
33
+ """
34
+ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
35
+ (1-beta) over time from t = [0,1].
36
+
37
+ Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
38
+ to that part of the diffusion process.
39
+
40
+
41
+ Args:
42
+ num_diffusion_timesteps (`int`): the number of betas to produce.
43
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to
44
+ prevent singularities.
45
+ alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
46
+ Choose from `cosine` or `exp`
47
+
48
+ Returns:
49
+ betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
50
+ """
51
+ if alpha_transform_type == "cosine":
52
+
53
+ def alpha_bar_fn(t):
54
+ return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
55
+ # return math.cos(t * math.pi / 2 * 0.95) ** 2
56
+
57
+ elif alpha_transform_type == "exp":
58
+
59
+ def alpha_bar_fn(t):
60
+ return math.exp(t * -12.0)
61
+
62
+ elif alpha_transform_type == "cauchy":
63
+ # Β΅ + Ξ³ tan (Ο€ (0.5 - x)) Ξ³ = 1, Β΅ = 3
64
+ # alpha^2 = 1-1/(exp(Ξ»)+1)
65
+ def alpha_bar_fn(t, gamma=1, mu=3):
66
+ snr = mu + gamma * math.tan(math.pi * (0.5 - t) * 0.9)
67
+ return 1 - 1 / (math.exp(snr) + 1.1)
68
+
69
+ elif alpha_transform_type == "laplace":
70
+ # Β΅ βˆ’ bsgn(0.5 βˆ’ t) log(1 βˆ’ 2|t βˆ’ 0.5|) Β΅ = 0, b = 1
71
+ def alpha_bar_fn(t, mu=0, b=1):
72
+ snr = mu - b * math.copysign(1, 0.5 - t) * math.log(1 - 2 * abs(t - 0.5) * 0.98)
73
+ return 1 - 1 / (math.exp(snr) + 1.02)
74
+
75
+ else:
76
+ raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
77
+
78
+ betas = []
79
+ for i in range(num_diffusion_timesteps):
80
+ t1 = i / num_diffusion_timesteps
81
+ t2 = (i + 1) / num_diffusion_timesteps
82
+ betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
83
+ return torch.tensor(betas, dtype=torch.float32)
84
+
85
+
86
+ # Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
87
def rescale_zero_terminal_snr(betas):
    """
    Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1)


    Args:
        betas (`torch.Tensor`):
            the betas that the scheduler is being initialized with.

    Returns:
        `torch.Tensor`: rescaled betas with zero terminal SNR
    """
    # Work in sqrt(alpha_bar) space, where the rescale is shift-and-scale.
    sqrt_cumprod = torch.cumprod(1.0 - betas, dim=0).sqrt()

    # Remember the endpoints before adjusting.
    first = sqrt_cumprod[0].clone()
    last = sqrt_cumprod[-1].clone()

    # Shift so the terminal value is zero, then scale so the initial value
    # is restored to its original magnitude.
    adjusted = (sqrt_cumprod - last) * (first / (first - last))

    # Convert back: square to alpha_bar, undo the cumulative product to get
    # per-step alphas, and finally recover betas.
    alpha_bar = adjusted**2
    step_alphas = torch.cat([alpha_bar[0:1], alpha_bar[1:] / alpha_bar[:-1]])
    return 1 - step_alphas
121
+
122
+ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
123
+ """
124
+ `DPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
125
+
126
+ This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
127
+ methods the library implements for all schedulers such as loading and saving.
128
+
129
+ Args:
130
+ num_train_timesteps (`int`, defaults to 1000):
131
+ The number of diffusion steps to train the model.
132
+ beta_start (`float`, defaults to 0.0001):
133
+ The starting `beta` value of inference.
134
+ beta_end (`float`, defaults to 0.02):
135
+ The final `beta` value.
136
+ beta_schedule (`str`, defaults to `"linear"`):
137
+ The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
138
+ `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
139
+ trained_betas (`np.ndarray`, *optional*):
140
+ Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
141
+ solver_order (`int`, defaults to 2):
142
+ The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided
143
+ sampling, and `solver_order=3` for unconditional sampling.
144
+ prediction_type (`str`, defaults to `epsilon`, *optional*):
145
+ Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
146
+ `sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
147
+ Video](https://imagen.research.google/video/paper.pdf) paper).
148
+ thresholding (`bool`, defaults to `False`):
149
+ Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
150
+ as Stable Diffusion.
151
+ dynamic_thresholding_ratio (`float`, defaults to 0.995):
152
+ The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
153
+ sample_max_value (`float`, defaults to 1.0):
154
+ The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
155
+ `algorithm_type="dpmsolver++"`.
156
+ algorithm_type (`str`, defaults to `dpmsolver++`):
157
+ Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
158
+ `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
159
+ paper, and the `dpmsolver++` type implements the algorithms in the
160
+ [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
161
+ `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
162
+ solver_type (`str`, defaults to `midpoint`):
163
+ Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
164
+ sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
165
+ lower_order_final (`bool`, defaults to `True`):
166
+ Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
167
+ stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
168
+ euler_at_final (`bool`, defaults to `False`):
169
+ Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
170
+ richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
171
+ steps, but sometimes may result in blurring.
172
+ use_karras_sigmas (`bool`, *optional*, defaults to `False`):
173
+ Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
174
+ the sigmas are determined according to a sequence of noise levels {Οƒi}.
175
+ use_lu_lambdas (`bool`, *optional*, defaults to `False`):
176
+ Whether to use the uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the noise schedule during
177
+ the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of
178
+ `lambda(t)`.
179
+ final_sigmas_type (`str`, defaults to `"zero"`):
180
+ The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
181
+ sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
182
+ lambda_min_clipped (`float`, defaults to `-inf`):
183
+ Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
184
+ cosine (`squaredcos_cap_v2`) noise schedule.
185
+ variance_type (`str`, *optional*):
186
+ Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
187
+ contains the predicted Gaussian variance.
188
+ timestep_spacing (`str`, defaults to `"linspace"`):
189
+ The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
190
+ Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
191
+ steps_offset (`int`, defaults to 0):
192
+ An offset added to the inference steps, as required by some model families.
193
+ rescale_betas_zero_snr (`bool`, defaults to `False`):
194
+ Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
195
+ dark samples instead of limiting it to samples with medium brightness. Loosely related to
196
+ [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
197
+ """
198
+
199
+ _compatibles = [e.name for e in KarrasDiffusionSchedulers]
200
+ order = 1
201
+
202
    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        beta_start: float = 0.0001,
        beta_end: float = 0.02,
        beta_schedule: str = "linear",
        trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
        solver_order: int = 2,
        prediction_type: str = "epsilon",
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        sample_max_value: float = 1.0,
        algorithm_type: str = "dpmsolver++",
        solver_type: str = "midpoint",
        lower_order_final: bool = True,
        euler_at_final: bool = False,
        use_karras_sigmas: Optional[bool] = False,
        use_lu_lambdas: Optional[bool] = False,
        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
        lambda_min_clipped: float = -float("inf"),
        variance_type: Optional[str] = None,
        timestep_spacing: str = "linspace",
        steps_offset: int = 0,
        rescale_betas_zero_snr: bool = False,
    ):
        """Build the training noise schedule and validate the solver configuration.

        All arguments are recorded on ``self.config`` via ``@register_to_config``;
        see the class docstring for their meaning.
        """
        # The plain (non-++) solver variants are kept for compatibility but deprecated.
        if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
            deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
            deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message)

        # Select the training beta schedule; an explicit `trained_betas` array overrides it.
        if trained_betas is not None:
            self.betas = torch.tensor(trained_betas, dtype=torch.float32)
        elif beta_schedule == "linear":
            self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
        elif beta_schedule == "scaled_linear":
            # this schedule is very specific to the latent diffusion model.
            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
        elif beta_schedule == "squaredcos_cap_v2" or beta_schedule == "cosine":
            # Glide cosine schedule
            self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine")
        elif beta_schedule == "cauchy":
            self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cauchy")
        elif beta_schedule == "laplace":
            self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="laplace")
        else:
            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")

        # Optionally rescale so the terminal SNR is exactly zero (https://huggingface.co/papers/2305.08891).
        if rescale_betas_zero_snr:
            self.betas = rescale_zero_terminal_snr(self.betas)

        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)

        if rescale_betas_zero_snr:
            # Close to 0 without being 0 so first sigma is not inf
            # FP16 smallest positive subnormal works well here
            self.alphas_cumprod[-1] = 2**-24

        # Currently we only support VP-type noise schedule
        self.alpha_t = torch.sqrt(self.alphas_cumprod)
        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
        # log-SNR/2; used by `set_timesteps` for `lambda_min_clipped` clipping.
        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5

        # standard deviation of the initial noise distribution
        self.init_noise_sigma = 1.0

        # settings for DPM-Solver: unknown algorithm/solver names either map to a
        # supported equivalent (via register_to_config) or raise.
        if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]:
            if algorithm_type == "deis":
                self.register_to_config(algorithm_type="dpmsolver++")
            else:
                raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}")

        if solver_type not in ["midpoint", "heun"]:
            if solver_type in ["logrho", "bh1", "bh2"]:
                self.register_to_config(solver_type="midpoint")
            else:
                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")

        # A zero final sigma is only well-defined for the data-prediction (++) variants.
        if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero":
            raise ValueError(
                f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
            )

        # setable values (overwritten by `set_timesteps` before inference)
        self.num_inference_steps = None
        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
        self.timesteps = torch.from_numpy(timesteps)
        self.model_outputs = [None] * solver_order
        self.lower_order_nums = 0
        self._step_index = None
        self._begin_index = None
        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
296
+
297
+ @property
298
+ def step_index(self):
299
+ """
300
+ The index counter for current timestep. It will increase 1 after each scheduler step.
301
+ """
302
+ return self._step_index
303
+
304
+ @property
305
+ def begin_index(self):
306
+ """
307
+ The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
308
+ """
309
+ return self._begin_index
310
+
311
+ def set_begin_index(self, begin_index: int = 0):
312
+ """
313
+ Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
314
+
315
+ Args:
316
+ begin_index (`int`):
317
+ The begin index for the scheduler.
318
+ """
319
+ self._begin_index = begin_index
320
+
321
    def set_timesteps(
        self,
        num_inference_steps: int = None,
        device: Union[str, torch.device] = None,
        timesteps: Optional[List[int]] = None,
    ):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).

        Args:
            num_inference_steps (`int`):
                The number of diffusion steps used when generating samples with a pre-trained model.
            device (`str` or `torch.device`, *optional*):
                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
            timesteps (`List[int]`, *optional*):
                Custom timesteps used to support arbitrary timesteps schedule. If `None`, timesteps will be generated
                based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` must be
                `None`, and the `timestep_spacing` attribute will be ignored.
        """
        # Exactly one of `num_inference_steps` / `timesteps` must be given; custom
        # timesteps are incompatible with the Karras / Lu sigma re-spacing options.
        if num_inference_steps is None and timesteps is None:
            raise ValueError("Must pass exactly one of `num_inference_steps` or `timesteps`.")
        if num_inference_steps is not None and timesteps is not None:
            raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.")
        if timesteps is not None and self.config.use_karras_sigmas:
            raise ValueError("Cannot use `timesteps` with `config.use_karras_sigmas = True`")
        if timesteps is not None and self.config.use_lu_lambdas:
            raise ValueError("Cannot use `timesteps` with `config.use_lu_lambdas = True`")

        if timesteps is not None:
            timesteps = np.array(timesteps).astype(np.int64)
        else:
            # Clipping the minimum of all lambda(t) for numerical stability.
            # This is critical for cosine (squaredcos_cap_v2) noise schedule.
            clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped)
            last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item()

            # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
            if self.config.timestep_spacing == "linspace":
                timesteps = (
                    np.linspace(0, last_timestep - 1, num_inference_steps + 1)
                    .round()[::-1][:-1]
                    .copy()
                    .astype(np.int64)
                )
            elif self.config.timestep_spacing == "leading":
                step_ratio = last_timestep // (num_inference_steps + 1)
                # creates integer timesteps by multiplying by ratio
                # casting to int to avoid issues when num_inference_step is power of 3
                timesteps = (
                    (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64)
                )
                timesteps += self.config.steps_offset
            elif self.config.timestep_spacing == "trailing":
                step_ratio = self.config.num_train_timesteps / num_inference_steps
                # creates integer timesteps by multiplying by ratio
                # casting to int to avoid issues when num_inference_step is power of 3
                timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64)
                timesteps -= 1
            else:
                raise ValueError(
                    f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
                )

        # Per-timestep sigmas of the training schedule (VP parameterization).
        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
        log_sigmas = np.log(sigmas)

        if self.config.use_karras_sigmas:
            # Re-space sigmas per Karras et al. (2022), then map them back to timesteps.
            sigmas = np.flip(sigmas).copy()
            sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps)
            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
        elif self.config.use_lu_lambdas:
            # Uniform-logSNR spacing from Lu's DPM-Solver.
            lambdas = np.flip(log_sigmas.copy())
            lambdas = self._convert_to_lu(in_lambdas=lambdas, num_inference_steps=num_inference_steps)
            sigmas = np.exp(lambdas)
            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round()
        else:
            # Default: sample the training sigmas at the selected (possibly fractional) timesteps.
            sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)

        # Terminal sigma appended after the schedule: 0 or the smallest training sigma.
        if self.config.final_sigmas_type == "sigma_min":
            sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
        elif self.config.final_sigmas_type == "zero":
            sigma_last = 0
        else:
            raise ValueError(
                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
            )

        sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32)

        self.sigmas = torch.from_numpy(sigmas)
        self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)

        self.num_inference_steps = len(timesteps)

        # Reset the multistep history so no stale model outputs leak across runs.
        self.model_outputs = [
            None,
        ] * self.config.solver_order
        self.lower_order_nums = 0

        # add an index counter for schedulers that allow duplicated timesteps
        self._step_index = None
        self._begin_index = None
        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
424
+
425
+ # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
426
+ def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
427
+ """
428
+ "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
429
+ prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
430
+ s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
431
+ pixels from saturation at each step. We find that dynamic thresholding results in significantly better
432
+ photorealism as well as better image-text alignment, especially when using very large guidance weights."
433
+
434
+ https://arxiv.org/abs/2205.11487
435
+ """
436
+ dtype = sample.dtype
437
+ batch_size, channels, *remaining_dims = sample.shape
438
+
439
+ if dtype not in (torch.float32, torch.float64):
440
+ sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half
441
+
442
+ # Flatten sample for doing quantile calculation along each image
443
+ sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
444
+
445
+ abs_sample = sample.abs() # "a certain percentile absolute pixel value"
446
+
447
+ s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
448
+ s = torch.clamp(
449
+ s, min=1, max=self.config.sample_max_value
450
+ ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
451
+ s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0
452
+ sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
453
+
454
+ sample = sample.reshape(batch_size, channels, *remaining_dims)
455
+ sample = sample.to(dtype)
456
+
457
+ return sample
458
+
459
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
460
+ def _sigma_to_t(self, sigma, log_sigmas):
461
+ # get log sigma
462
+ log_sigma = np.log(np.maximum(sigma, 1e-10))
463
+
464
+ # get distribution
465
+ dists = log_sigma - log_sigmas[:, np.newaxis]
466
+
467
+ # get sigmas range
468
+ low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
469
+ high_idx = low_idx + 1
470
+
471
+ low = log_sigmas[low_idx]
472
+ high = log_sigmas[high_idx]
473
+
474
+ # interpolate sigmas
475
+ w = (low - log_sigma) / (low - high)
476
+ w = np.clip(w, 0, 1)
477
+
478
+ # transform interpolation to time range
479
+ t = (1 - w) * low_idx + w * high_idx
480
+ t = t.reshape(sigma.shape)
481
+ return t
482
+
483
+ def _sigma_to_alpha_sigma_t(self, sigma):
484
+ alpha_t = 1 / ((sigma**2 + 1) ** 0.5)
485
+ sigma_t = sigma * alpha_t
486
+
487
+ return alpha_t, sigma_t
488
+
489
+ # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
490
+ def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
491
+ """Constructs the noise schedule of Karras et al. (2022)."""
492
+
493
+ # Hack to make sure that other schedulers which copy this function don't break
494
+ # TODO: Add this logic to the other schedulers
495
+ if hasattr(self.config, "sigma_min"):
496
+ sigma_min = self.config.sigma_min
497
+ else:
498
+ sigma_min = None
499
+
500
+ if hasattr(self.config, "sigma_max"):
501
+ sigma_max = self.config.sigma_max
502
+ else:
503
+ sigma_max = None
504
+
505
+ sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item()
506
+ sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item()
507
+
508
+ rho = 7.0 # 7.0 is the value used in the paper
509
+ ramp = np.linspace(0, 1, num_inference_steps)
510
+ min_inv_rho = sigma_min ** (1 / rho)
511
+ max_inv_rho = sigma_max ** (1 / rho)
512
+ sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
513
+ return sigmas
514
+
515
+ def _convert_to_lu(self, in_lambdas: torch.Tensor, num_inference_steps) -> torch.Tensor:
516
+ """Constructs the noise schedule of Lu et al. (2022)."""
517
+
518
+ lambda_min: float = in_lambdas[-1].item()
519
+ lambda_max: float = in_lambdas[0].item()
520
+
521
+ rho = 1.0 # 1.0 is the value used in the paper
522
+ ramp = np.linspace(0, 1, num_inference_steps)
523
+ min_inv_rho = lambda_min ** (1 / rho)
524
+ max_inv_rho = lambda_max ** (1 / rho)
525
+ lambdas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
526
+ return lambdas
527
+
528
+ def convert_model_output(
529
+ self,
530
+ model_output: torch.Tensor,
531
+ *args,
532
+ sample: torch.Tensor = None,
533
+ **kwargs,
534
+ ) -> torch.Tensor:
535
+ """
536
+ Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
537
+ designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
538
+ integral of the data prediction model.
539
+
540
+ <Tip>
541
+
542
+ The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
543
+ prediction and data prediction models.
544
+
545
+ </Tip>
546
+
547
+ Args:
548
+ model_output (`torch.Tensor`):
549
+ The direct output from the learned diffusion model.
550
+ sample (`torch.Tensor`):
551
+ A current instance of a sample created by the diffusion process.
552
+
553
+ Returns:
554
+ `torch.Tensor`:
555
+ The converted model output.
556
+ """
557
+ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
558
+ if sample is None:
559
+ if len(args) > 1:
560
+ sample = args[1]
561
+ else:
562
+ raise ValueError("missing `sample` as a required keyward argument")
563
+ if timestep is not None:
564
+ deprecate(
565
+ "timesteps",
566
+ "1.0.0",
567
+ "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
568
+ )
569
+
570
+ # Guard against out-of-bounds access (can occur in concurrent scenarios)
571
+ safe_step_index = min(self.step_index, len(self.sigmas) - 1)
572
+
573
+ # DPM-Solver++ needs to solve an integral of the data prediction model.
574
+ if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
575
+ if self.config.prediction_type == "epsilon":
576
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
577
+ if self.config.variance_type in ["learned", "learned_range"]:
578
+ model_output = model_output[:, :3]
579
+ sigma = self.sigmas[safe_step_index]
580
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
581
+ x0_pred = (sample - sigma_t * model_output) / alpha_t
582
+ elif self.config.prediction_type == "sample":
583
+ x0_pred = model_output
584
+ elif self.config.prediction_type == "v_prediction":
585
+ sigma = self.sigmas[safe_step_index]
586
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
587
+ x0_pred = alpha_t * sample - sigma_t * model_output
588
+ else:
589
+ raise ValueError(
590
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
591
+ " `v_prediction` for the DPMSolverMultistepScheduler."
592
+ )
593
+
594
+ if self.config.thresholding:
595
+ x0_pred = self._threshold_sample(x0_pred)
596
+
597
+ return x0_pred
598
+
599
+ # DPM-Solver needs to solve an integral of the noise prediction model.
600
+ elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
601
+ if self.config.prediction_type == "epsilon":
602
+ # DPM-Solver and DPM-Solver++ only need the "mean" output.
603
+ if self.config.variance_type in ["learned", "learned_range"]:
604
+ epsilon = model_output[:, :3]
605
+ else:
606
+ epsilon = model_output
607
+ elif self.config.prediction_type == "sample":
608
+ sigma = self.sigmas[safe_step_index]
609
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
610
+ epsilon = (sample - alpha_t * model_output) / sigma_t
611
+ elif self.config.prediction_type == "v_prediction":
612
+ sigma = self.sigmas[safe_step_index]
613
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
614
+ epsilon = alpha_t * model_output + sigma_t * sample
615
+ else:
616
+ raise ValueError(
617
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
618
+ " `v_prediction` for the DPMSolverMultistepScheduler."
619
+ )
620
+
621
+ if self.config.thresholding:
622
+ sigma = self.sigmas[safe_step_index]
623
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
624
+ x0_pred = (sample - sigma_t * epsilon) / alpha_t
625
+ x0_pred = self._threshold_sample(x0_pred)
626
+ epsilon = (sample - alpha_t * x0_pred) / sigma_t
627
+
628
+ return epsilon
629
+
630
+ def dpm_solver_first_order_update(
631
+ self,
632
+ model_output: torch.Tensor,
633
+ *args,
634
+ sample: torch.Tensor = None,
635
+ noise: Optional[torch.Tensor] = None,
636
+ **kwargs,
637
+ ) -> torch.Tensor:
638
+ """
639
+ One step for the first-order DPMSolver (equivalent to DDIM).
640
+
641
+ Args:
642
+ model_output (`torch.Tensor`):
643
+ The direct output from the learned diffusion model.
644
+ sample (`torch.Tensor`):
645
+ A current instance of a sample created by the diffusion process.
646
+
647
+ Returns:
648
+ `torch.Tensor`:
649
+ The sample tensor at the previous timestep.
650
+ """
651
+ timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
652
+ prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
653
+ if sample is None:
654
+ if len(args) > 2:
655
+ sample = args[2]
656
+ else:
657
+ raise ValueError(" missing `sample` as a required keyward argument")
658
+ if timestep is not None:
659
+ deprecate(
660
+ "timesteps",
661
+ "1.0.0",
662
+ "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
663
+ )
664
+
665
+ if prev_timestep is not None:
666
+ deprecate(
667
+ "prev_timestep",
668
+ "1.0.0",
669
+ "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
670
+ )
671
+
672
+ # Guard against out-of-bounds access (can occur in concurrent scenarios)
673
+ current_index = min(self.step_index, len(self.sigmas) - 1)
674
+ next_index = min(self.step_index + 1, len(self.sigmas) - 1)
675
+ sigma_t, sigma_s = self.sigmas[next_index], self.sigmas[current_index]
676
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
677
+ alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
678
+ lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
679
+ lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
680
+
681
+ h = lambda_t - lambda_s
682
+ if self.config.algorithm_type == "dpmsolver++":
683
+ x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output
684
+ elif self.config.algorithm_type == "dpmsolver":
685
+ x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output
686
+ elif self.config.algorithm_type == "sde-dpmsolver++":
687
+ assert noise is not None
688
+ x_t = (
689
+ (sigma_t / sigma_s * torch.exp(-h)) * sample
690
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
691
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
692
+ )
693
+ elif self.config.algorithm_type == "sde-dpmsolver":
694
+ assert noise is not None
695
+ x_t = (
696
+ (alpha_t / alpha_s) * sample
697
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output
698
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
699
+ )
700
+ return x_t
701
+
702
+ def multistep_dpm_solver_second_order_update(
703
+ self,
704
+ model_output_list: List[torch.Tensor],
705
+ *args,
706
+ sample: torch.Tensor = None,
707
+ noise: Optional[torch.Tensor] = None,
708
+ **kwargs,
709
+ ) -> torch.Tensor:
710
+ """
711
+ One step for the second-order multistep DPMSolver.
712
+
713
+ Args:
714
+ model_output_list (`List[torch.Tensor]`):
715
+ The direct outputs from learned diffusion model at current and latter timesteps.
716
+ sample (`torch.Tensor`):
717
+ A current instance of a sample created by the diffusion process.
718
+
719
+ Returns:
720
+ `torch.Tensor`:
721
+ The sample tensor at the previous timestep.
722
+ """
723
+ timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
724
+ prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
725
+ if sample is None:
726
+ if len(args) > 2:
727
+ sample = args[2]
728
+ else:
729
+ raise ValueError(" missing `sample` as a required keyward argument")
730
+ if timestep_list is not None:
731
+ deprecate(
732
+ "timestep_list",
733
+ "1.0.0",
734
+ "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
735
+ )
736
+
737
+ if prev_timestep is not None:
738
+ deprecate(
739
+ "prev_timestep",
740
+ "1.0.0",
741
+ "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
742
+ )
743
+
744
+ # Guard against out-of-bounds access (can occur in concurrent scenarios)
745
+ current_index = min(self.step_index, len(self.sigmas) - 1)
746
+ next_index = min(self.step_index + 1, len(self.sigmas) - 1)
747
+ prev_index = max(self.step_index - 1, 0)
748
+ sigma_t, sigma_s0, sigma_s1 = (
749
+ self.sigmas[next_index],
750
+ self.sigmas[current_index],
751
+ self.sigmas[prev_index],
752
+ )
753
+
754
+ alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
755
+ alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
756
+ alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
757
+
758
+ lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
759
+ lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
760
+ lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
761
+
762
+ m0, m1 = model_output_list[-1], model_output_list[-2]
763
+
764
+ h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
765
+ r0 = h_0 / h
766
+ D0, D1 = m0, (1.0 / r0) * (m0 - m1)
767
+ if self.config.algorithm_type == "dpmsolver++":
768
+ # See https://arxiv.org/abs/2211.01095 for detailed derivations
769
+ if self.config.solver_type == "midpoint":
770
+ x_t = (
771
+ (sigma_t / sigma_s0) * sample
772
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
773
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1
774
+ )
775
+ elif self.config.solver_type == "heun":
776
+ x_t = (
777
+ (sigma_t / sigma_s0) * sample
778
+ - (alpha_t * (torch.exp(-h) - 1.0)) * D0
779
+ + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
780
+ )
781
+ elif self.config.algorithm_type == "dpmsolver":
782
+ # See https://arxiv.org/abs/2206.00927 for detailed derivations
783
+ if self.config.solver_type == "midpoint":
784
+ x_t = (
785
+ (alpha_t / alpha_s0) * sample
786
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
787
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1
788
+ )
789
+ elif self.config.solver_type == "heun":
790
+ x_t = (
791
+ (alpha_t / alpha_s0) * sample
792
+ - (sigma_t * (torch.exp(h) - 1.0)) * D0
793
+ - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
794
+ )
795
+ elif self.config.algorithm_type == "sde-dpmsolver++":
796
+ assert noise is not None
797
+ if self.config.solver_type == "midpoint":
798
+ x_t = (
799
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
800
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
801
+ + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
802
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
803
+ )
804
+ elif self.config.solver_type == "heun":
805
+ x_t = (
806
+ (sigma_t / sigma_s0 * torch.exp(-h)) * sample
807
+ + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
808
+ + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
809
+ + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
810
+ )
811
+ elif self.config.algorithm_type == "sde-dpmsolver":
812
+ assert noise is not None
813
+ if self.config.solver_type == "midpoint":
814
+ x_t = (
815
+ (alpha_t / alpha_s0) * sample
816
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
817
+ - (sigma_t * (torch.exp(h) - 1.0)) * D1
818
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
819
+ )
820
+ elif self.config.solver_type == "heun":
821
+ x_t = (
822
+ (alpha_t / alpha_s0) * sample
823
+ - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0
824
+ - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
825
+ + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise
826
+ )
827
+ return x_t
828
+
829
def multistep_dpm_solver_third_order_update(
    self,
    model_output_list: List[torch.Tensor],
    *args,
    sample: torch.Tensor = None,
    **kwargs,
) -> torch.Tensor:
    """
    One step for the third-order multistep DPMSolver.

    Args:
        model_output_list (`List[torch.Tensor]`):
            The direct outputs from learned diffusion model at current and latter timesteps.
        sample (`torch.Tensor`):
            A current instance of a sample created by diffusion process.

    Returns:
        `torch.Tensor`:
            The sample tensor at the previous timestep.
    """

    # Legacy positional arguments (`timestep_list`, `prev_timestep`) are still
    # accepted but deprecated; scheduling state now lives in `self.step_index`.
    timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None)
    prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None)
    if sample is None:
        if len(args) > 2:
            sample = args[2]
        else:
            raise ValueError(" missing`sample` as a required keyward argument")
    if timestep_list is not None:
        deprecate(
            "timestep_list",
            "1.0.0",
            "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
        )

    if prev_timestep is not None:
        deprecate(
            "prev_timestep",
            "1.0.0",
            "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
        )

    # Guard against out-of-bounds access (can occur in concurrent scenarios)
    current_index = min(self.step_index, len(self.sigmas) - 1)
    next_index = min(self.step_index + 1, len(self.sigmas) - 1)
    prev_index_1 = max(self.step_index - 1, 0)
    prev_index_2 = max(self.step_index - 2, 0)
    # sigma_t is the target (next) sigma; sigma_s0..s2 are the three most
    # recent source sigmas consumed by the third-order multistep formula.
    sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
        self.sigmas[next_index],
        self.sigmas[current_index],
        self.sigmas[prev_index_1],
        self.sigmas[prev_index_2],
    )

    alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
    alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
    alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
    alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)

    # lambda = log(alpha) - log(sigma): the half-log-SNR coordinate in which
    # DPM-Solver step sizes are measured.
    lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
    lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
    lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
    lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)

    # Last three converted model outputs (newest first).
    m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]

    # Step sizes in lambda-space, and finite-difference estimates D0/D1/D2 of
    # the model output and its first two derivatives.
    h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
    r0, r1 = h_0 / h, h_1 / h
    D0 = m0
    D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
    D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
    D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
    if self.config.algorithm_type == "dpmsolver++":
        # See https://arxiv.org/abs/2206.00927 for detailed derivations
        x_t = (
            (sigma_t / sigma_s0) * sample
            - (alpha_t * (torch.exp(-h) - 1.0)) * D0
            + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1
            - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2
        )
    elif self.config.algorithm_type == "dpmsolver":
        # See https://arxiv.org/abs/2206.00927 for detailed derivations
        x_t = (
            (alpha_t / alpha_s0) * sample
            - (sigma_t * (torch.exp(h) - 1.0)) * D0
            - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1
            - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2
        )
    # NOTE(review): `x_t` is never assigned for the "sde-dpmsolver" /
    # "sde-dpmsolver++" algorithm types, which would raise NameError here.
    # Presumably `step()` never routes those types to third order — confirm.
    return x_t
918
+
919
def index_for_timestep(self, timestep, schedule_timesteps=None):
    """Return the position of `timestep` within the timestep schedule.

    Falls back to `self.timesteps` when no explicit schedule is given.
    If the timestep is absent, the last schedule index is returned.
    """
    timesteps = self.timesteps if schedule_timesteps is None else schedule_timesteps

    matches = (timesteps == timestep).nonzero()
    if len(matches) == 0:
        # Timestep not in the schedule: clamp to the final index.
        return len(self.timesteps) - 1

    # The sigma index that is taken for the **very** first `step`
    # is always the second index (or the last index if there is only 1)
    # This way we can ensure we don't accidentally skip a sigma in
    # case we start in the middle of the denoising schedule (e.g. for image-to-image)
    pick = 1 if len(matches) > 1 else 0
    return matches[pick].item()
937
+
938
+ def _init_step_index(self, timestep):
939
+ """
940
+ Initialize the step_index counter for the scheduler.
941
+ """
942
+
943
+ if self.begin_index is None:
944
+ if isinstance(timestep, torch.Tensor):
945
+ timestep = timestep.to(self.timesteps.device)
946
+ self._step_index = self.index_for_timestep(timestep)
947
+ else:
948
+ self._step_index = self._begin_index
949
+
950
def step(
    self,
    model_output: torch.Tensor,
    timestep: int,
    sample: torch.Tensor,
    generator=None,
    variance_noise: Optional[torch.Tensor] = None,
    return_dict: bool = True,
) -> Union[SchedulerOutput, Tuple]:
    """
    Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
    the multistep DPMSolver.

    Args:
        model_output (`torch.Tensor`):
            The direct output from learned diffusion model.
        timestep (`int`):
            The current discrete timestep in the diffusion chain.
        sample (`torch.Tensor`):
            A current instance of a sample created by the diffusion process.
        generator (`torch.Generator`, *optional*):
            A random number generator.
        variance_noise (`torch.Tensor`):
            Alternative to generating noise with `generator` by directly providing the noise for the variance
            itself. Useful for methods such as [`LEdits++`].
        return_dict (`bool`):
            Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

    Returns:
        [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
            If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
            tuple is returned where the first element is the sample tensor.

    """
    if self.num_inference_steps is None:
        raise ValueError(
            "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
        )

    # Lazily resolve the internal step counter from the given timestep.
    if self.step_index is None:
        self._init_step_index(timestep)

    # Improve numerical stability for small number of steps
    # Also guard against out-of-bounds access: if step_index >= len(timesteps) - 1,
    # we must use first-order to avoid accessing sigmas[step_index + 1] out of bounds
    is_last_or_past = self.step_index >= len(self.timesteps) - 1
    lower_order_final = is_last_or_past and (
        self.config.euler_at_final
        or (self.config.lower_order_final and len(self.timesteps) < 15)
        or self.config.final_sigmas_type == "zero"
        or self.step_index >= len(self.sigmas) - 1  # Safety: prevent OOB access
    )
    lower_order_second = (
        (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
    )

    # Shift the model-output history window and append the newest output.
    model_output = self.convert_model_output(model_output, sample=sample)
    for i in range(self.config.solver_order - 1):
        self.model_outputs[i] = self.model_outputs[i + 1]
    self.model_outputs[-1] = model_output

    # Upcast to avoid precision issues when computing prev_sample
    sample = sample.to(torch.float32)
    # SDE variants need a noise term; either generate it or use caller-provided noise.
    if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None:
        noise = randn_tensor(
            model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32
        )
    elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
        noise = variance_noise.to(device=model_output.device, dtype=torch.float32)
    else:
        noise = None

    # Dispatch to first/second/third order depending on configured order,
    # warm-up progress (`lower_order_nums`), and end-of-schedule guards.
    if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
        prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
    elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
        prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
    else:
        prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample)

    if self.lower_order_nums < self.config.solver_order:
        self.lower_order_nums += 1

    # Cast sample back to expected dtype
    prev_sample = prev_sample.to(model_output.dtype)

    # upon completion increase step index by one
    self._step_index += 1

    if not return_dict:
        return (prev_sample,)

    return SchedulerOutput(prev_sample=prev_sample)
1042
+
1043
def add_noise(
    self,
    original_samples: torch.Tensor,
    noise: torch.Tensor,
    timesteps: torch.IntTensor,
) -> torch.Tensor:
    """Forward-diffuse `original_samples` to the given `timesteps`.

    Computes ``alpha_t * x_0 + sigma_t * noise`` with the per-timestep
    coefficients broadcast over all trailing sample dimensions.
    """
    # Move coefficient tables onto the samples' device/dtype before indexing.
    alpha_table = self.alpha_t.to(original_samples.device).to(original_samples.dtype)
    sigma_table = self.sigma_t.to(original_samples.device).to(original_samples.dtype)
    timesteps = timesteps.to(original_samples.device)

    alpha = alpha_table[timesteps].flatten()
    sigma = sigma_table[timesteps].flatten()

    # Append singleton dims so the 1-D coefficients broadcast over samples.
    trailing = original_samples.ndim - alpha.ndim
    alpha = alpha.reshape(alpha.shape + (1,) * trailing)
    sigma = sigma.reshape(sigma.shape + (1,) * trailing)

    return alpha * original_samples + sigma * noise
1064
+
1065
+ def get_velocity(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
1066
+ # alpha_t = self.alpha_t.to(device=original_samples.device, dtype=original_samples.dtype)
1067
+ # sigma_t = self.sigma_t.to(device=original_samples.device, dtype=original_samples.dtype)
1068
+ alpha_t = self.alpha_t.to(original_samples.device).to(original_samples.dtype)
1069
+ sigma_t = self.sigma_t.to(original_samples.device).to(original_samples.dtype)
1070
+
1071
+ timesteps = timesteps.to(original_samples.device)
1072
+ alpha_t = alpha_t[timesteps].flatten()
1073
+ while len(alpha_t.shape) < len(original_samples.shape):
1074
+ alpha_t = alpha_t.unsqueeze(-1)
1075
+
1076
+ sigma_t = sigma_t[timesteps].flatten()
1077
+ while len(sigma_t.shape) < len(original_samples.shape):
1078
+ sigma_t = sigma_t.unsqueeze(-1)
1079
+
1080
+ velocity = alpha_t * noise - sigma_t * original_samples
1081
+ return velocity
1082
+
1083
def __len__(self):
    # The scheduler's length is the number of *training* timesteps it was
    # configured with, not the number of inference steps currently set.
    return self.config.num_train_timesteps
kugelaudio_open/ui/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Gradio web interface for KugelAudio."""
2
+
3
+ from kugelaudio_open.ui.app import create_app, launch_app
4
+
5
+ __all__ = ["create_app", "launch_app"]
kugelaudio_open/ui/__main__.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI entry point for KugelAudio UI."""
2
+
3
+ import argparse
4
+
5
+ from kugelaudio_open.ui import launch_app
6
+
7
+
8
def main():
    """Parse command-line options and start the KugelAudio Gradio UI."""
    parser = argparse.ArgumentParser(description="Launch KugelAudio Gradio UI")
    parser.add_argument("--share", action="store_true",
                        help="Create a public Gradio share link")
    parser.add_argument("--host", default="127.0.0.1",
                        help="Server hostname (default: 127.0.0.1, use 0.0.0.0 for network access)")
    parser.add_argument("--port", type=int, default=7860,
                        help="Server port (default: 7860)")
    opts = parser.parse_args()

    print(f"πŸŽ™οΈ Starting KugelAudio UI on {opts.host}:{opts.port}")
    if opts.share:
        print("πŸ“‘ Creating public share link...")

    launch_app(share=opts.share, server_name=opts.host, server_port=opts.port)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
kugelaudio_open/ui/app.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio web interface for KugelAudio text-to-speech."""
2
+
3
+ import os
4
+ import tempfile
5
+ import warnings
6
+ from typing import Optional, Tuple
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ try:
12
+ import gradio as gr
13
+
14
+ GRADIO_AVAILABLE = True
15
+ except ImportError:
16
+ GRADIO_AVAILABLE = False
17
+ warnings.warn("Gradio not installed. Install with: pip install gradio")
18
+
19
+
20
+ # Global model instances (lazy loaded)
21
+ _model = None
22
+ _processor = None
23
+ _watermark = None
24
+ _current_model_id = None # Track which model is loaded
25
+
26
+
27
def get_device():
    """Pick the most capable torch backend available on this machine."""
    if torch.cuda.is_available():
        return "cuda"
    # Apple Silicon: `mps` backend may not exist on older torch builds.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
34
+
35
+
36
def _warmup_model(model, processor=None):
    """Warmup model components to eliminate CUDA kernel compilation overhead on first generation.

    This runs dummy data through all model components (acoustic decoder, semantic encoder,
    diffusion head, language model) to trigger JIT compilation before actual inference.
    """
    # Infer placement from the model itself; `device` is a torch.device here
    # (hence `.type` below), unlike the string used elsewhere in this module.
    device = next(model.parameters()).device
    dtype = next(model.parameters()).dtype

    with torch.no_grad():
        # 1. Warmup acoustic decoder (biggest impact - saves ~190ms on first call)
        latent_dim = model.config.acoustic_vae_dim
        dummy_latent = torch.randn(1, latent_dim, 1, device=device, dtype=dtype)
        _ = model.acoustic_tokenizer.decode(dummy_latent)

        # 2. Warmup semantic encoder
        # NOTE(review): 3200 samples presumably matches the encoder's minimum
        # frame size — confirm against the tokenizer config.
        dummy_audio = torch.randn(1, 1, 3200, device=device, dtype=dtype)
        _ = model.semantic_tokenizer.encode(dummy_audio)

        # 3. Warmup diffusion/prediction head
        hidden_size = model.config.decoder_config.hidden_size
        model.noise_scheduler.set_timesteps(model.ddpm_inference_steps)

        # Batch of 2 mirrors the CFG (conditional + unconditional) path.
        dummy_condition = torch.randn(2, hidden_size, device=device, dtype=dtype)
        dummy_speech = torch.randn(2, latent_dim, device=device, dtype=dtype)

        for t in model.noise_scheduler.timesteps:
            half = dummy_speech[:1]
            combined = torch.cat([half, half], dim=0)
            _ = model.prediction_head(
                combined,
                t.repeat(combined.shape[0]).to(combined),
                condition=dummy_condition,
            )
            dummy_eps = torch.randn_like(dummy_speech)
            dummy_speech = model.noise_scheduler.step(dummy_eps, t, dummy_speech).prev_sample

        # 4. Warmup language model with KV cache path
        dummy_ids = torch.randint(0, 32000, (1, 64), device=device)
        dummy_mask = torch.ones_like(dummy_ids)
        _ = model.model.language_model(input_ids=dummy_ids, attention_mask=dummy_mask, use_cache=True)

        # 5. Warmup acoustic encoder (for voice prompts)
        dummy_voice = torch.randn(1, 1, 24000, device=device, dtype=dtype)
        _ = model.acoustic_tokenizer.encode(dummy_voice)

        # 6. Run a minimal generation to warmup the full generation path
        if processor is not None:
            dummy_inputs = processor(text="Hi.", return_tensors="pt")
            dummy_inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in dummy_inputs.items()}
            _ = model.generate(**dummy_inputs, cfg_scale=3.0, max_new_tokens=10, show_progress=False)

    # Clear memory
    if device.type == "cuda":
        torch.cuda.empty_cache()
91
+
92
+
93
def load_models(model_id: str = "kugelaudio/kugelaudio-0-open"):
    """Load model and processor. Switches model if a different model_id is requested.

    Returns:
        Tuple of (model, processor, watermark). Results are cached in module
        globals; calling again with the same `model_id` is cheap.
    """
    global _model, _processor, _watermark, _current_model_id

    # Imported lazily so the UI module can be imported without the heavy deps.
    from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
    from kugelaudio_open.processors import KugelAudioProcessor
    from kugelaudio_open.watermark import AudioWatermark

    device = get_device()
    # bf16 only on CUDA; CPU/MPS fall back to fp32.
    dtype = torch.bfloat16 if device == "cuda" else torch.float32

    # Check if we need to load a different model
    if _model is None or _current_model_id != model_id:
        # Clean up old model if switching
        if _model is not None and _current_model_id != model_id:
            print(f"Switching model from {_current_model_id} to {model_id}...")
            del _model
            del _processor
            _model = None
            _processor = None
            # Clear CUDA cache to free memory
            if device == "cuda":
                torch.cuda.empty_cache()

        print(f"Loading model {model_id} on {device}...")
        # Prefer flash attention on CUDA; fall back to default attention if
        # the flash-attn package is unavailable (any load error retries).
        try:
            _model = KugelAudioForConditionalGenerationInference.from_pretrained(
                model_id,
                torch_dtype=dtype,
                attn_implementation="flash_attention_2" if device == "cuda" else "sdpa",
            ).to(device)
        except Exception:
            _model = KugelAudioForConditionalGenerationInference.from_pretrained(
                model_id,
                torch_dtype=dtype,
            ).to(device)
        _model.eval()
        _current_model_id = model_id
        print(f"Model {model_id} loaded!")

    # NOTE(review): the processor is only loaded once and is NOT refreshed on
    # model switch (it is deleted above but only re-created if None) — with a
    # single supported model this is fine; confirm if more models are added.
    if _processor is None:
        _processor = KugelAudioProcessor.from_pretrained(model_id)

    # Warmup to eliminate first-generation slowness from CUDA kernel compilation
    # Do this after processor is loaded so we can run a mini-generation
    if device == "cuda" and _model is not None:
        # Check if we need to warmup (only on first load)
        if not getattr(_model, "_warmed_up", False):
            print("Warming up model (this may take a moment)...")
            _warmup_model(_model, _processor)
            _model._warmed_up = True
            print("Warmup complete!")

    if _watermark is None:
        _watermark = AudioWatermark(device=device)

    return _model, _processor, _watermark
150
+
151
+
152
def generate_speech(
    text: str,
    reference_audio: Optional[Tuple[int, np.ndarray]] = None,
    model_choice: str = "kugelaudio-0-open",
    cfg_scale: float = 3.0,
    max_tokens: int = 2048,
) -> Tuple[int, np.ndarray]:
    """Generate speech from text.

    Args:
        text: Text to synthesize
        reference_audio: Optional (sample_rate, audio_array) for voice cloning
        model_choice: Model variant to use
        cfg_scale: Classifier-free guidance scale
        max_tokens: Maximum generation tokens

    Returns:
        Tuple of (sample_rate, audio_array)

    Note:
        All generated audio is automatically watermarked for identification.
    """
    if not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    model_id = f"kugelaudio/{model_choice}"
    # NOTE(review): `watermark` is retrieved but unused here — the model's
    # generate() already embeds the watermark (see comment further down).
    model, processor, watermark = load_models(model_id)
    device = next(model.parameters()).device

    # Process reference audio if provided
    voice_audio = None
    if reference_audio is not None:
        ref_sr, ref_audio = reference_audio
        print(f"[Voice Cloning] Input audio: sr={ref_sr}, shape={ref_audio.shape}, dtype={ref_audio.dtype}")

        # Convert to float32 and normalize based on dtype
        if ref_audio.dtype == np.int16:
            ref_audio = ref_audio.astype(np.float32) / 32768.0
        elif ref_audio.dtype == np.int32:
            ref_audio = ref_audio.astype(np.float32) / 2147483648.0
        elif ref_audio.dtype == np.float64:
            ref_audio = ref_audio.astype(np.float32)
        elif ref_audio.dtype != np.float32:
            ref_audio = ref_audio.astype(np.float32)

        # Ensure mono BEFORE resampling (important for stereo files)
        if ref_audio.ndim > 1:
            if ref_audio.shape[0] == 2:  # [2, samples] format (channels first)
                ref_audio = ref_audio.mean(axis=0)
            elif ref_audio.shape[-1] == 2:  # [samples, 2] format (channels last)
                ref_audio = ref_audio.mean(axis=-1)
            elif ref_audio.shape[0] < ref_audio.shape[-1]:  # Likely [channels, samples]
                ref_audio = ref_audio.mean(axis=0)
            else:  # Likely [samples, channels]
                ref_audio = ref_audio.mean(axis=-1)

        # Ensure 1D
        ref_audio = ref_audio.squeeze()

        print(f"[Voice Cloning] After mono conversion: shape={ref_audio.shape}, dtype={ref_audio.dtype}")

        # Resample to 24kHz if needed - this is CRITICAL for voice cloning
        if ref_sr != 24000:
            import librosa
            print(f"[Voice Cloning] Resampling from {ref_sr}Hz to 24000Hz (ratio: {ref_sr/24000:.4f})")
            ref_audio = librosa.resample(ref_audio, orig_sr=ref_sr, target_sr=24000)
            print(f"[Voice Cloning] After resampling: shape={ref_audio.shape}, duration={len(ref_audio)/24000:.2f}s")
        else:
            print(f"[Voice Cloning] No resampling needed, already at 24kHz")

        # Normalize audio to reasonable range
        max_val = np.abs(ref_audio).max()
        if max_val > 0:
            ref_audio = ref_audio / max_val * 0.95

        voice_audio = ref_audio
        print(f"[Voice Cloning] Final voice audio: shape={voice_audio.shape}, min={voice_audio.min():.4f}, max={voice_audio.max():.4f}, std={voice_audio.std():.4f}")

    # Process text input with optional voice prompt
    inputs = processor(text=text.strip(), voice_prompt=voice_audio, return_tensors="pt")
    inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

    print(f"[Generation] Using model: {model_id}, cfg_scale={cfg_scale}, max_tokens={max_tokens}")

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            cfg_scale=cfg_scale,
            max_new_tokens=max_tokens,
        )

    if not outputs.speech_outputs or outputs.speech_outputs[0] is None:
        raise gr.Error("Generation failed. Please try again with different settings.")

    # Audio is already watermarked by the model's generate method
    audio = outputs.speech_outputs[0]
    print(f"[Generation] Raw output: shape={audio.shape}, dtype={audio.dtype}")

    # Convert to numpy (convert to float32 first since numpy doesn't support bfloat16)
    if isinstance(audio, torch.Tensor):
        audio = audio.cpu().float().numpy()

    # Ensure correct shape (1D array)
    audio = audio.squeeze()

    # Normalize to prevent clipping (important for Gradio playback)
    max_val = np.abs(audio).max()
    if max_val > 1.0:
        audio = audio / max_val * 0.95

    print(f"[Generation] Final output: shape={audio.shape}, dtype={audio.dtype}, duration={len(audio)/24000:.2f}s")
    print(f"[Generation] Audio stats: min={audio.min():.4f}, max={audio.max():.4f}, std={audio.std():.4f}")

    # Return with explicit sample rate - Gradio expects (sample_rate, audio_array)
    return (24000, audio)
268
+
269
+
270
def check_watermark(audio: Tuple[int, np.ndarray]) -> str:
    """Report whether the given (sample_rate, samples) audio carries the
    KugelAudio watermark, as a markdown-formatted string."""
    if audio is None:
        return "No audio provided."

    from kugelaudio_open.watermark import AudioWatermark

    sr, samples = audio

    # Integer PCM -> float32 in [-1, 1]; other dtypes pass through untouched.
    int_scales = {np.dtype(np.int16): 32768.0, np.dtype(np.int32): 2147483648.0}
    scale = int_scales.get(samples.dtype)
    if scale is not None:
        samples = samples.astype(np.float32) / scale

    result = AudioWatermark().detect(samples, sample_rate=sr)

    if result.detected:
        return f"βœ… **Watermark Detected**\n\nConfidence: {result.confidence:.1%}\n\nThis audio was generated by KugelAudio."
    return f"❌ **No Watermark Detected**\n\nConfidence: {result.confidence:.1%}\n\nThis audio does not appear to be generated by KugelAudio."
292
+
293
+
294
def create_app() -> "gr.Blocks":
    """Create the Gradio application.

    Builds three tabs (generation, watermark verification, about) plus a
    branded header and footer, and wires the buttons to `generate_speech`
    and `check_watermark`. Raises ImportError when gradio is missing.
    """
    if not GRADIO_AVAILABLE:
        raise ImportError("Gradio not installed. Install with: pip install gradio")

    # Logo URLs
    kugelaudio_logo = "https://www.kugelaudio.com/logos/Logo%20Short.svg"
    kisz_logo = "https://docs.sc.hpi.de/attachments/aisc/aisc-logo.png"
    bmftr_logo = (
        "https://hpi.de/fileadmin/_processed_/a/3/csm_BMFTR_de_Web_RGB_gef_durch_cd1f5345bd.jpg"
    )

    with gr.Blocks(title="KugelAudio - Text to Speech") as app:
        # Header: project name plus partner/sponsor logos.
        gr.HTML(
            f"""
            <div style="text-align: center; margin-bottom: 1.5rem;">
                <h1 style="margin-bottom: 0.5rem;">πŸŽ™οΈ KugelAudio</h1>
                <p style="color: #666; margin-bottom: 1rem;">Open-source text-to-speech with voice cloning capabilities</p>
                <div style="display: flex; justify-content: center; align-items: center; gap: 2rem; flex-wrap: wrap;">
                    <a href="https://kugelaudio.com" target="_blank">
                        <img src="{kugelaudio_logo}" alt="KugelAudio" style="height: 50px; width: auto;">
                    </a>
                    <a href="https://hpi.de/ki-servicezentrum/" target="_blank">
                        <img src="{kisz_logo}" alt="KI-Servicezentrum Berlin-Brandenburg" style="height: 50px; width: auto;">
                    </a>
                    <a href="https://www.bmftr.bund.de" target="_blank">
                        <img src="{bmftr_logo}" alt="GefΓΆrdert durch BMFTR" style="height: 70px; width: auto;">
                    </a>
                </div>
            </div>
            """
        )

        with gr.Tabs():
            # Tab 1: Text to Speech
            with gr.TabItem("πŸ—£οΈ Generate Speech"):
                with gr.Row():
                    with gr.Column(scale=1):
                        text_input = gr.Textbox(
                            label="Text to Synthesize",
                            placeholder="Enter the text you want to convert to speech...",
                            lines=5,
                            max_lines=20,
                        )

                        reference_audio = gr.Audio(
                            label="Reference Audio (Optional)",
                            type="numpy",
                            sources=["upload", "microphone"],
                        )
                        gr.Markdown("*Upload a voice sample to clone the speaker's voice*")

                        with gr.Accordion("Advanced Settings", open=False):
                            model_choice = gr.Dropdown(
                                choices=["kugelaudio-0-open"],
                                value="kugelaudio-0-open",
                                label="Model",
                            )
                            cfg_scale = gr.Slider(
                                minimum=1.0,
                                maximum=10.0,
                                value=3.0,
                                step=0.5,
                                label="Guidance Scale",
                                info="Higher values = more adherence to text",
                            )
                            max_tokens = gr.Slider(
                                minimum=512,
                                maximum=8192,
                                value=2048,
                                step=256,
                                label="Max Tokens",
                                info="Maximum generation length",
                            )

                        generate_btn = gr.Button("🎀 Generate Speech", variant="primary", size="lg")

                    with gr.Column(scale=1):
                        output_audio = gr.Audio(
                            label="Generated Speech",
                            type="numpy",
                            interactive=False,
                        )

                        gr.Markdown(
                            """
                            ### Tips
                            - For best results, use clear and well-punctuated text
                            - Reference audio should be 5-30 seconds of clear speech
                            - The 7B model produces higher quality but is slower
                            """
                        )

                generate_btn.click(
                    fn=generate_speech,
                    inputs=[text_input, reference_audio, model_choice, cfg_scale, max_tokens],
                    outputs=[output_audio],
                )

            # Tab 2: Watermark Detection
            with gr.TabItem("πŸ” Verify Watermark"):
                gr.Markdown(
                    """
                    ### Watermark Verification
                    Check if an audio file was generated by KugelAudio. All audio generated
                    by KugelAudio contains an imperceptible watermark for identification.
                    """
                )

                with gr.Row():
                    with gr.Column():
                        verify_audio = gr.Audio(
                            label="Audio to Verify",
                            type="numpy",
                            sources=["upload"],
                        )
                        verify_btn = gr.Button("πŸ” Check Watermark", variant="secondary")

                    with gr.Column():
                        verify_result = gr.Markdown(
                            label="Result",
                            value="Upload an audio file to check for watermark.",
                        )

                verify_btn.click(
                    fn=check_watermark,
                    inputs=[verify_audio],
                    outputs=[verify_result],
                )

            # Tab 3: About
            with gr.TabItem("ℹ️ About"):
                gr.Markdown(
                    """
                    ## About KugelAudio

                    KugelAudio is an open-source text-to-speech system that combines:

                    - **AR + Diffusion Architecture**: Uses autoregressive language modeling
                      with diffusion-based speech synthesis for high-quality output
                    - **Voice Cloning**: Clone any voice with just a few seconds of reference audio
                    - **Audio Watermarking**: All generated audio contains an imperceptible watermark
                      using [Facebook's AudioSeal](https://huggingface.co/facebook/audioseal) technology

                    ### Models

                    | Model | Parameters | Quality | Speed |
                    |-------|------------|---------|-------|
                    | kugelaudio-0-open | 7B | Best | Standard |

                    ### Responsible Use

                    This technology is intended for legitimate purposes such as:
                    - Accessibility (text-to-speech for visually impaired)
                    - Content creation (podcasts, videos, audiobooks)
                    - Voice assistants and chatbots

                    **Please do not use this technology for:**
                    - Creating deepfakes or misleading content
                    - Impersonating individuals without consent
                    - Any illegal or harmful purposes

                    All generated audio is watermarked to enable detection.
                    """
                )

        # Footer: attribution and project links.
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 2rem; padding: 1rem; border-top: 1px solid #eee;">
                <p style="color: #888; margin-bottom: 0.5rem;">
                    <strong>KugelAudio</strong> β€’ Open Source TTS with Voice Cloning
                </p>
                <p style="color: #aaa; font-size: 0.9rem;">
                    Created by <a href="mailto:kajo@kugelaudio.com" style="color: #667eea;">Kajo Kratzenstein</a> β€’
                    <a href="https://kugelaudio.com" style="color: #667eea;">kugelaudio.com</a> β€’
                    <a href="https://github.com/kugelaudio/kugelaudio" style="color: #667eea;">GitHub</a>
                </p>
            </div>
            """
        )

    return app
476
+
477
+
478
def launch_app(
    share: bool = False,
    server_name: str = "127.0.0.1",
    server_port: int = 7860,
    **kwargs,
):
    """Launch the Gradio web interface.

    Args:
        share: Create a public share link
        server_name: Server hostname (use "0.0.0.0" for network access)
        server_port: Server port
        **kwargs: Additional arguments passed to gr.Blocks.launch()
    """
    app = create_app()
    # Bug fix: `theme=` was previously passed to `launch()`, but `theme` is a
    # `gr.Blocks(...)` constructor argument, not a `Blocks.launch()` parameter —
    # current Gradio releases raise TypeError on the unexpected keyword.
    # Theming belongs in `create_app()`'s `gr.Blocks(...)` call.
    app.launch(
        share=share,
        server_name=server_name,
        server_port=server_port,
        **kwargs,
    )
503
+
504
+
505
+ if __name__ == "__main__":
506
+ launch_app()
kugelaudio_open/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Utility functions for KugelAudio."""
2
+
3
+ from kugelaudio_open.utils.generation import generate_speech, load_model_and_processor
4
+
5
+ __all__ = ["generate_speech", "load_model_and_processor"]
kugelaudio_open/utils/generation.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """High-level generation utilities for KugelAudio."""
2
+
3
+ from typing import Optional, Union, Tuple
4
+ import torch
5
+
6
+
7
+ def load_model_and_processor(
8
+ model_name_or_path: str = "kugelaudio/kugelaudio-0-open",
9
+ device: Optional[Union[str, torch.device]] = None,
10
+ torch_dtype: Optional[torch.dtype] = None,
11
+ use_flash_attention: bool = True,
12
+ ):
13
+ """Load KugelAudio model and processor.
14
+
15
+ Args:
16
+ model_name_or_path: HuggingFace model ID or local path
17
+ device: Device to load model on (auto-detected if None)
18
+ torch_dtype: Data type for model weights
19
+ use_flash_attention: Whether to use flash attention if available
20
+
21
+ Returns:
22
+ Tuple of (model, processor)
23
+
24
+ Example:
25
+ >>> model, processor = load_model_and_processor("kugelaudio/kugelaudio-0-open")
26
+ """
27
+ from kugelaudio_open.models import KugelAudioForConditionalGenerationInference
28
+ from kugelaudio_open.processors import KugelAudioProcessor
29
+
30
+ # Auto-detect device
31
+ if device is None:
32
+ device = "cuda" if torch.cuda.is_available() else "cpu"
33
+
34
+ # Auto-detect dtype
35
+ if torch_dtype is None:
36
+ torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
37
+
38
+ # Load model
39
+ attn_impl = "flash_attention_2" if use_flash_attention else "sdpa"
40
+ try:
41
+ model = KugelAudioForConditionalGenerationInference.from_pretrained(
42
+ model_name_or_path,
43
+ torch_dtype=torch_dtype,
44
+ attn_implementation=attn_impl,
45
+ ).to(device)
46
+ except Exception:
47
+ # Fallback without flash attention
48
+ model = KugelAudioForConditionalGenerationInference.from_pretrained(
49
+ model_name_or_path,
50
+ torch_dtype=torch_dtype,
51
+ ).to(device)
52
+
53
+ model.eval()
54
+
55
+ # Load processor
56
+ processor = KugelAudioProcessor.from_pretrained(model_name_or_path)
57
+
58
+ return model, processor
59
+
60
+
61
+ def generate_speech(
62
+ model,
63
+ processor,
64
+ text: str,
65
+ voice_prompt: Optional[torch.Tensor] = None,
66
+ voice_prompt_path: Optional[str] = None,
67
+ cfg_scale: float = 3.0,
68
+ max_new_tokens: int = 4096,
69
+ device: Optional[Union[str, torch.device]] = None,
70
+ ) -> torch.Tensor:
71
+ """Generate speech from text.
72
+
73
+ All generated audio is automatically watermarked for identification.
74
+
75
+ Args:
76
+ model: KugelAudio model
77
+ processor: KugelAudio processor
78
+ text: Text to synthesize
79
+ voice_prompt: Voice prompt tensor for speaker identity
80
+ voice_prompt_path: Path to voice prompt audio file
81
+ cfg_scale: Classifier-free guidance scale
82
+ max_new_tokens: Maximum number of tokens to generate
83
+ device: Device for generation
84
+
85
+ Returns:
86
+ Generated audio tensor (watermarked)
87
+
88
+ Example:
89
+ >>> audio = generate_speech(model, processor, "Hello world!")
90
+ >>> processor.save_audio(audio, "output.wav")
91
+ """
92
+ if device is None:
93
+ device = next(model.parameters()).device
94
+
95
+ # Load voice prompt if path provided
96
+ if voice_prompt is None and voice_prompt_path is not None:
97
+ voice_data = processor.audio_processor(voice_prompt_path, return_tensors="pt")
98
+ voice_prompt = voice_data["audio"].to(device)
99
+
100
+ # Process inputs
101
+ inputs = processor(text=text, return_tensors="pt")
102
+ inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
103
+
104
+ # Generate (watermark is automatically applied by the model)
105
+ with torch.no_grad():
106
+ outputs = model.generate(
107
+ **inputs,
108
+ voice_prompt=voice_prompt,
109
+ cfg_scale=cfg_scale,
110
+ max_new_tokens=max_new_tokens,
111
+ )
112
+
113
+ audio = outputs.speech_outputs[0] if outputs.speech_outputs else None
114
+
115
+ if audio is None:
116
+ raise RuntimeError("Generation failed - no audio output")
117
+
118
+ return audio
kugelaudio_open/watermark/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Audio watermarking for KugelAudio generated speech."""
2
+
3
+ from kugelaudio_open.watermark.watermark import AudioWatermark
4
+
5
+ __all__ = ["AudioWatermark"]
kugelaudio_open/watermark/watermark.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio watermarking for KugelAudio using Facebook's AudioSeal.
2
+
3
+ AudioSeal provides state-of-the-art speech localized watermarking with:
4
+ - High robustness to audio editing and compression
5
+ - Fast single-pass detection (real-time capable)
6
+ - Sample-level detection (1/16k second resolution)
7
+ - Optional 16-bit message embedding
8
+
9
+ Reference: https://huggingface.co/facebook/audioseal
10
+ """
11
+
12
+ from typing import Optional, Union, Tuple
13
+ from dataclasses import dataclass
14
+ import warnings
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ # Try to import AudioSeal
20
+ try:
21
+ from audioseal import AudioSeal
22
+ AUDIOSEAL_AVAILABLE = True
23
+ except ImportError:
24
+ AUDIOSEAL_AVAILABLE = False
25
+ warnings.warn(
26
+ "AudioSeal not installed. Install with: pip install audioseal\n"
27
+ "Watermarking will use fallback implementation."
28
+ )
29
+
30
+
31
@dataclass
class WatermarkResult:
    """Result of watermark detection."""
    # True when the mean frame probability exceeded the detection threshold.
    detected: bool
    # Mean per-frame watermark probability, in [0, 1].
    confidence: float
    # Decoded 16-bit message tensor from the detector, if any.
    message: Optional[torch.Tensor] = None
    # Per-frame watermark probabilities, shape (batch, frames).
    frame_probabilities: Optional[torch.Tensor] = None
38
+
39
+
40
class AudioWatermark:
    """Professional audio watermarking using Facebook's AudioSeal.

    AudioSeal is a state-of-the-art watermarking system that embeds
    imperceptible watermarks in audio that are robust to various
    audio transformations.

    Features:
        - Imperceptible watermarks with minimal quality degradation
        - Robust to compression, resampling, and editing
        - Fast detection suitable for real-time applications
        - Optional 16-bit message embedding for tracking

    Example:
        >>> watermark = AudioWatermark()
        >>> watermarked_audio = watermark.embed(audio)
        >>> result = watermark.detect(watermarked_audio)
        >>> print(f"Detected: {result.detected}, Confidence: {result.confidence:.2%}")

    Args:
        model_name: AudioSeal generator variant ("audioseal_wm_16bits")
        detector_name: AudioSeal detector variant ("audioseal_detector_16bits")
        device: Device for inference ("cuda" or "cpu"); auto-detected if None
        message: Optional 16-bit message to embed (for tracking)
    """

    # Default 16-bit message identifying KugelAudio-generated content
    # (alternating bit pattern).
    KUGELAUDIO_MESSAGE = torch.tensor([[1, 0, 1, 0, 1, 0, 1, 0,
                                        0, 1, 0, 1, 0, 1, 0, 1]])

    # AudioSeal models operate on 16 kHz audio.
    AUDIOSEAL_SAMPLE_RATE = 16000

    def __init__(
        self,
        model_name: str = "audioseal_wm_16bits",
        detector_name: str = "audioseal_detector_16bits",
        device: Optional[Union[str, torch.device]] = None,
        message: Optional[torch.Tensor] = None,
    ):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)

        # Generator/detector are loaded lazily on first use (see the
        # `generator` / `detector` properties) to keep construction cheap.
        self._generator = None
        self._detector = None
        self._model_name = model_name
        self._detector_name = detector_name

        # Use the KugelAudio identifier message by default.
        self.message = message if message is not None else self.KUGELAUDIO_MESSAGE.clone()

        if not AUDIOSEAL_AVAILABLE:
            warnings.warn(
                "AudioSeal not available. Watermarking disabled. "
                "Install with: pip install audioseal"
            )

    @property
    def generator(self):
        """Lazily load and cache the watermark generator model."""
        if self._generator is None and AUDIOSEAL_AVAILABLE:
            self._generator = AudioSeal.load_generator(self._model_name)
            self._generator = self._generator.to(self.device)
            self._generator.eval()
        return self._generator

    @property
    def detector(self):
        """Lazily load and cache the watermark detector model."""
        if self._detector is None and AUDIOSEAL_AVAILABLE:
            self._detector = AudioSeal.load_detector(self._detector_name)
            self._detector = self._detector.to(self.device)
            self._detector.eval()
        return self._detector

    def _resample(
        self,
        audio: torch.Tensor,
        orig_sr: int,
        target_sr: int
    ) -> torch.Tensor:
        """Resample ``audio`` from ``orig_sr`` to ``target_sr``.

        Prefers torchaudio; falls back to scipy when torchaudio is not
        installed.  The scipy path flattens and reshapes, which assumes
        the 3-D (batch, channels, samples) layout used by all internal
        callers.
        """
        if orig_sr == target_sr:
            return audio

        try:
            import torchaudio.functional as F
            return F.resample(audio, orig_sr, target_sr)
        except ImportError:
            # Fallback using scipy.
            from scipy import signal
            audio_np = audio.cpu().numpy()
            num_samples = int(len(audio_np.flatten()) * target_sr / orig_sr)
            resampled = signal.resample(audio_np.flatten(), num_samples)
            return torch.from_numpy(resampled).reshape(audio.shape[0], audio.shape[1], -1).to(audio.device)

    def embed(
        self,
        audio: Union[np.ndarray, torch.Tensor],
        sample_rate: int = 24000,
        message: Optional[torch.Tensor] = None,
    ) -> Union[np.ndarray, torch.Tensor]:
        """Embed watermark into audio.

        The watermark is imperceptible and robust to various audio
        transformations including compression and resampling.

        Args:
            audio: Input audio of shape (samples,), (channels, samples),
                or (batch, channels, samples)
            sample_rate: Sample rate of input audio (default: 24000 for KugelAudio)
            message: Optional 16-bit message to embed

        Returns:
            Watermarked audio with same shape and type as input
        """
        if not AUDIOSEAL_AVAILABLE:
            # Best-effort: return audio unchanged if AudioSeal is missing.
            return audio

        # Track input type so the return matches it.
        is_numpy = isinstance(audio, np.ndarray)
        if is_numpy:
            audio = torch.from_numpy(audio)

        original_device = audio.device
        original_dtype = audio.dtype

        # Work in float32.
        audio = audio.float()

        # Normalize to (batch, channels, samples).
        original_shape = audio.shape
        if audio.ndim == 1:
            audio = audio.unsqueeze(0).unsqueeze(0)
        elif audio.ndim == 2:
            audio = audio.unsqueeze(0)

        audio = audio.to(self.device)

        # AudioSeal operates at 16 kHz — resample a working copy for it;
        # the original-rate `audio` tensor is kept untouched.
        if sample_rate != self.AUDIOSEAL_SAMPLE_RATE:
            audio_16k = self._resample(audio, sample_rate, self.AUDIOSEAL_SAMPLE_RATE)
        else:
            audio_16k = audio

        # Prepare the (optionally batched) message.
        msg = message if message is not None else self.message
        msg = msg.to(self.device)
        if msg.shape[0] != audio_16k.shape[0]:
            msg = msg.expand(audio_16k.shape[0], -1)

        # Generate the additive watermark signal at 16 kHz.
        with torch.no_grad():
            watermark_16k = self.generator.get_watermark(audio_16k, self.AUDIOSEAL_SAMPLE_RATE, message=msg)

        # Bring the watermark back to the caller's sample rate and
        # length-match it to the ORIGINAL audio.
        # Bug fix: the previous implementation also replaced `audio` with
        # a 16 kHz round-trip of itself (resampling audio_16k back up),
        # which silently discarded all spectral content above 8 kHz from
        # the returned waveform; only the watermark needs resampling.
        if sample_rate != self.AUDIOSEAL_SAMPLE_RATE:
            watermark = self._resample(watermark_16k, self.AUDIOSEAL_SAMPLE_RATE, sample_rate)
            if watermark.shape[-1] > audio.shape[-1]:
                watermark = watermark[..., :audio.shape[-1]]
            elif watermark.shape[-1] < audio.shape[-1]:
                watermark = torch.nn.functional.pad(
                    watermark, (0, audio.shape[-1] - watermark.shape[-1])
                )
        else:
            watermark = watermark_16k

        # The watermark is additive.
        watermarked = audio + watermark

        # Renormalize only when the sum would clip.
        max_val = watermarked.abs().max()
        if max_val > 1.0:
            watermarked = watermarked / max_val

        # Restore the caller's original rank.
        if len(original_shape) == 1:
            watermarked = watermarked.squeeze(0).squeeze(0)
        elif len(original_shape) == 2:
            watermarked = watermarked.squeeze(0)

        # Restore device and dtype.  NOTE(review): casting back to an
        # integer dtype would truncate the float waveform to zeros —
        # callers are assumed to pass float audio; confirm if integer
        # PCM input is ever used.
        watermarked = watermarked.to(device=original_device, dtype=original_dtype)

        # Convert back to numpy if the input was numpy.
        if is_numpy:
            watermarked = watermarked.numpy()

        return watermarked

    def detect(
        self,
        audio: Union[np.ndarray, torch.Tensor],
        sample_rate: int = 24000,
        threshold: float = 0.5,
    ) -> WatermarkResult:
        """Detect watermark in audio.

        Args:
            audio: Input audio to check for watermark
            sample_rate: Sample rate of input audio
            threshold: Detection threshold (0.0-1.0)

        Returns:
            WatermarkResult with detection status, confidence, and decoded message
        """
        if not AUDIOSEAL_AVAILABLE:
            # Without AudioSeal we cannot detect anything.
            return WatermarkResult(
                detected=False,
                confidence=0.0,
                message=None,
                frame_probabilities=None,
            )

        # Convert to tensor if needed.
        if isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio)

        audio = audio.float()

        # Normalize to (batch, channels, samples).
        if audio.ndim == 1:
            audio = audio.unsqueeze(0).unsqueeze(0)
        elif audio.ndim == 2:
            audio = audio.unsqueeze(0)

        audio = audio.to(self.device)

        # Resample to 16 kHz for the detector.
        if sample_rate != self.AUDIOSEAL_SAMPLE_RATE:
            audio = self._resample(audio, sample_rate, self.AUDIOSEAL_SAMPLE_RATE)

        # Run detection.
        with torch.no_grad():
            result, message = self.detector(audio, self.AUDIOSEAL_SAMPLE_RATE)

        # result shape: (batch, 2, frames) — probabilities for
        # [no_watermark, watermark]; take the positive class.
        watermark_probs = result[:, 1, :]  # (batch, frames)

        # Overall confidence = mean of the per-frame probabilities.
        confidence = watermark_probs.mean().item()

        # Threshold the mean probability.
        detected = confidence > threshold

        return WatermarkResult(
            detected=detected,
            confidence=confidence,
            message=message.cpu() if message is not None else None,
            frame_probabilities=watermark_probs.cpu(),
        )

    def verify(self, audio: Union[np.ndarray, torch.Tensor], sample_rate: int = 24000) -> bool:
        """Quick verification that audio contains KugelAudio watermark.

        Args:
            audio: Audio to verify
            sample_rate: Sample rate of audio

        Returns:
            True if watermark detected with high confidence (> 0.6)
        """
        result = self.detect(audio, sample_rate)
        return result.detected and result.confidence > 0.6
319
+
320
+
321
class WatermarkPostProcessor:
    """Pipeline hook that transparently watermarks generated audio.

    Drop this into a generation pipeline so every produced waveform is
    watermarked without callers having to think about it.

    Example:
        >>> post_processor = WatermarkPostProcessor()
        >>> # In generation pipeline:
        >>> audio = model.generate(...)
        >>> audio = post_processor(audio)  # Watermark added automatically
    """

    def __init__(
        self,
        enabled: bool = True,
        device: Optional[Union[str, torch.device]] = None,
        sample_rate: int = 24000,
    ):
        self.enabled = enabled
        self.sample_rate = sample_rate
        # The underlying AudioWatermark is created on first use.
        self._device = device
        self._watermark = None

    @property
    def watermark(self) -> AudioWatermark:
        """Create the watermark model on first access and cache it."""
        if self._watermark is None:
            self._watermark = AudioWatermark(device=self._device)
        return self._watermark

    def __call__(
        self,
        audio: Union[np.ndarray, torch.Tensor],
        sample_rate: Optional[int] = None,
    ) -> Union[np.ndarray, torch.Tensor]:
        """Return ``audio`` watermarked, or unchanged when disabled."""
        if not self.enabled:
            return audio
        effective_sr = sample_rate or self.sample_rate
        return self.watermark.embed(audio, sample_rate=effective_sr)

    def disable(self):
        """Turn watermarking off."""
        self.enabled = False

    def enable(self):
        """Turn watermarking on."""
        self.enabled = True
371
+
372
+
373
def is_watermarked(
    audio: Union[np.ndarray, torch.Tensor],
    sample_rate: int = 24000,
    threshold: float = 0.5,
) -> bool:
    """Convenience check for the presence of a watermark.

    Args:
        audio: Audio to check
        sample_rate: Sample rate of audio
        threshold: Detection threshold

    Returns:
        True if watermark detected
    """
    # A throwaway detector is fine here; models load lazily inside it.
    detector = AudioWatermark()
    return detector.detect(audio, sample_rate, threshold).detected