Spaces: Running on Zero

linoy committed · ebfc6b3
initial commit

This view is limited to 50 files because it contains too many changes. See raw diff.
- app.py +301 -0
- packages/ltx-core/README.md +1 -0
- packages/ltx-core/pyproject.toml +38 -0
- packages/ltx-core/src/ltx_core/__init__.py +0 -0
- packages/ltx-core/src/ltx_core/__pycache__/__init__.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/__pycache__/tiling.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/__pycache__/utils.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/guidance/__init__.py +0 -0
- packages/ltx-core/src/ltx_core/guidance/__pycache__/__init__.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/guidance/__pycache__/perturbations.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/guidance/perturbations.py +74 -0
- packages/ltx-core/src/ltx_core/legacy_tiling.py +258 -0
- packages/ltx-core/src/ltx_core/loader/.ipynb_checkpoints/sd_ops-checkpoint.py +107 -0
- packages/ltx-core/src/ltx_core/loader/__init__.py +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/__init__.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/fuse_loras.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/kernels.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/module_ops.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/primitives.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/registry.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/sd_ops.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/sft_loader.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/__pycache__/single_gpu_model_builder.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/loader/fuse_loras.py +102 -0
- packages/ltx-core/src/ltx_core/loader/kernels.py +74 -0
- packages/ltx-core/src/ltx_core/loader/module_ops.py +11 -0
- packages/ltx-core/src/ltx_core/loader/primitives.py +63 -0
- packages/ltx-core/src/ltx_core/loader/registry.py +68 -0
- packages/ltx-core/src/ltx_core/loader/sd_ops.py +107 -0
- packages/ltx-core/src/ltx_core/loader/sft_loader.py +53 -0
- packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py +99 -0
- packages/ltx-core/src/ltx_core/model/.ipynb_checkpoints/model_ledger-checkpoint.py +253 -0
- packages/ltx-core/src/ltx_core/model/__init__.py +0 -0
- packages/ltx-core/src/ltx_core/model/__pycache__/__init__.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/__pycache__/model_ledger.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/__pycache__/model_protocol.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__init__.py +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/__init__.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/attention.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/audio_vae.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/causal_conv_2d.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/causality_axis.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/downsample.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/model_configurator.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/ops.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/resnet.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/upsample.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/vocoder.cpython-310.pyc +0 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/attention.py +71 -0
- packages/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py +483 -0
app.py ADDED
@@ -0,0 +1,301 @@
```python
"""
Simple Gradio app for LTX-2 inference based on ltx2_two_stage.py example
"""

import sys
from pathlib import Path

# Add packages to Python path
current_dir = Path(__file__).parent
sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
sys.path.insert(0, str(current_dir / "packages" / "ltx-core" / "src"))

import gradio as gr
from typing import Optional
from huggingface_hub import hf_hub_download
from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline
from ltx_core.tiling import TilingConfig
from ltx_pipelines.constants import (
    DEFAULT_SEED,
    DEFAULT_HEIGHT,
    DEFAULT_WIDTH,
    DEFAULT_NUM_FRAMES,
    DEFAULT_FRAME_RATE,
    DEFAULT_NUM_INFERENCE_STEPS,
    DEFAULT_CFG_GUIDANCE_SCALE,
    DEFAULT_LORA_STRENGTH,
)

# Custom negative prompt
DEFAULT_NEGATIVE_PROMPT = "shaky, glitchy, low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly, transition, static"

# Default prompt from docstring example
DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot."

# HuggingFace Hub defaults
DEFAULT_REPO_ID = "LTX-Colab/LTX-Video-Preview"
DEFAULT_GEMMA_REPO_ID = "google/gemma-3-12b-it-qat-q4_0-unquantized"
DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev-rc1.safetensors"
DEFAULT_DISTILLED_LORA_FILENAME = "ltx-2-19b-distilled-lora-384-rc1.safetensors"
DEFAULT_SPATIAL_UPSAMPLER_FILENAME = "ltx-2-spatial-upscaler-x2-1.0-rc1.safetensors"


def get_hub_or_local_checkpoint(repo_id: Optional[str] = None, filename: Optional[str] = None):
    """Download from HuggingFace Hub or use local checkpoint."""
    if repo_id is None and filename is None:
        raise ValueError("Please supply at least one of `repo_id` or `filename`")

    if repo_id is not None:
        if filename is None:
            raise ValueError("If repo_id is specified, filename must also be specified.")
        print(f"Downloading {filename} from {repo_id}...")
        ckpt_path = hf_hub_download(repo_id=repo_id, filename=filename)
        print(f"Downloaded to {ckpt_path}")
    else:
        ckpt_path = filename

    return ckpt_path


# Initialize pipeline at startup
print("=" * 80)
print("Loading LTX-2 2-stage pipeline...")
print("=" * 80)

checkpoint_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_CHECKPOINT_FILENAME)
distilled_lora_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_DISTILLED_LORA_FILENAME)
spatial_upsampler_path = get_hub_or_local_checkpoint(DEFAULT_REPO_ID, DEFAULT_SPATIAL_UPSAMPLER_FILENAME)

print("Initializing pipeline with:")
print(f"  checkpoint_path={checkpoint_path}")
print(f"  distilled_lora_path={distilled_lora_path}")
print(f"  spatial_upsampler_path={spatial_upsampler_path}")
print(f"  gemma_root={DEFAULT_GEMMA_REPO_ID}")

pipeline = TI2VidTwoStagesPipeline(
    checkpoint_path=checkpoint_path,
    distilled_lora_path=distilled_lora_path,
    distilled_lora_strength=DEFAULT_LORA_STRENGTH,
    spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=DEFAULT_GEMMA_REPO_ID,
    loras=[],
    fp8transformer=False,
    local_files_only=False
)

print("=" * 80)
print("Warming up pipeline (loading Gemma text encoder)...")
print("=" * 80)

# Do a dummy warmup to load all models including Gemma
import tempfile
import os

warmup_output = tempfile.mktemp(suffix=".mp4")
try:
    pipeline(
        prompt="warmup",
        negative_prompt="",
        output_path=warmup_output,
        seed=42,
        height=256,
        width=256,
        num_frames=9,
        frame_rate=8,
        num_inference_steps=1,
        cfg_guidance_scale=1.0,
        images=[],
        tiling_config=TilingConfig.default(),
    )
    # Clean up warmup output
    if os.path.exists(warmup_output):
        os.remove(warmup_output)
except Exception as e:
    print(f"Warmup completed with note: {e}")

print("=" * 80)
print("Pipeline fully loaded and ready!")
print("=" * 80)


def generate_video(
    input_image,
    prompt: str,
    duration: float,
    negative_prompt: str,
    seed: int,
    randomize_seed: bool,
    num_inference_steps: int,
    cfg_guidance_scale: float,
    height: int,
    width: int,
    progress=gr.Progress()
):
    """Generate a video based on the given parameters."""
    try:
        # Randomize seed if checkbox is enabled
        if randomize_seed:
            import random
            seed = random.randint(0, 1000000)

        # Calculate num_frames from duration (using fixed 24 fps)
        frame_rate = 24.0
        num_frames = int(duration * frame_rate) + 1  # +1 to ensure we meet the duration

        # Create output directory if it doesn't exist
        output_dir = Path("outputs")
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / f"video_{seed}.mp4"

        # Handle image input
        images = []
        if input_image is not None:
            # Save uploaded image temporarily
            temp_image_path = output_dir / f"temp_input_{seed}.jpg"
            if hasattr(input_image, 'save'):
                input_image.save(temp_image_path)
            else:
                # If it's a file path already
                temp_image_path = input_image
            # Format: (image_path, frame_idx, strength)
            images = [(str(temp_image_path), 0, 1.0)]

        # Run inference
        progress(0, desc="Generating video (2-stage)...")
        pipeline(
            prompt=prompt,
            negative_prompt=negative_prompt,
            output_path=str(output_path),
            seed=seed,
            height=height,
            width=width,
            num_frames=num_frames,
            frame_rate=frame_rate,
            num_inference_steps=num_inference_steps,
            cfg_guidance_scale=cfg_guidance_scale,
            images=images,
            tiling_config=TilingConfig.default(),
        )

        progress(1.0, desc="Done!")
        return str(output_path)

    except Exception as e:
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None


# Create Gradio interface
with gr.Blocks(title="LTX-2 Image-to-Video") as demo:
    gr.Markdown("# LTX-2 Image-to-Video Generation")
    gr.Markdown("Transform images into videos using the LTX-2 2-stage pipeline")

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(
                label="Input Image",
                type="pil",
                sources=["upload"]
            )

            prompt = gr.Textbox(
                label="Prompt",
                value="Make this image come alive with cinematic motion, smooth animation",
                lines=3,
                placeholder="Describe the motion and animation you want..."
            )

            duration = gr.Slider(
                label="Duration (seconds)",
                minimum=1.0,
                maximum=10.0,
                value=5.0,
                step=0.1
            )

            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value=DEFAULT_NEGATIVE_PROMPT,
                    lines=2
                )

                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=1000000,
                    value=DEFAULT_SEED,
                    step=1
                )

                randomize_seed = gr.Checkbox(
                    label="Randomize Seed",
                    value=True
                )

                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=1,
                    maximum=100,
                    value=DEFAULT_NUM_INFERENCE_STEPS,
                    step=1
                )

                cfg_guidance_scale = gr.Slider(
                    label="CFG Guidance Scale",
                    minimum=1.0,
                    maximum=10.0,
                    value=DEFAULT_CFG_GUIDANCE_SCALE,
                    step=0.1
                )

                with gr.Row():
                    width = gr.Number(
                        label="Width",
                        value=DEFAULT_WIDTH,
                        precision=0
                    )
                    height = gr.Number(
                        label="Height",
                        value=DEFAULT_HEIGHT,
                        precision=0
                    )

        with gr.Column():
            output_video = gr.Video(label="Generated Video", autoplay=True)

    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_image,
            prompt,
            duration,
            negative_prompt,
            seed,
            randomize_seed,
            num_inference_steps,
            cfg_guidance_scale,
            height,
            width,
        ],
        outputs=output_video
    )

    # Add example
    gr.Examples(
        examples=[
            [
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg",
                "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
                5.0,
            ]
        ],
        inputs=[input_image, prompt, duration],
        label="Example"
    )


if __name__ == "__main__":
    demo.launch(share=True)
```
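One detail worth noting in `generate_video`: the frame count is derived from the requested duration at a hard-coded 24 fps, independent of the `DEFAULT_FRAME_RATE` import. A minimal sketch of that arithmetic (mine, for illustration):

```python
# Mirrors the frame-count arithmetic in generate_video: a 5.0 s request at the
# hard-coded 24 fps yields int(5.0 * 24.0) + 1 = 121 frames, so the rendered
# clip always covers at least the requested duration.
duration, frame_rate = 5.0, 24.0
num_frames = int(duration * frame_rate) + 1
assert num_frames == 121
```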
packages/ltx-core/README.md ADDED
@@ -0,0 +1 @@
```markdown
# LTX-2 Core
```
packages/ltx-core/pyproject.toml ADDED
@@ -0,0 +1,38 @@
```toml
[project]
name = "ltx-core"
version = "0.1.0"
description = "Core implementation of Lightricks' LTX-2 model"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "torch~=2.7",
    "torchaudio",
    "einops",
    "numpy",
    "transformers",
    "safetensors",
    "accelerate",
    "scipy>=1.14",
]

[project.optional-dependencies]
flashpack = ["flashpack==0.1.2"]
xformers = ["xformers"]


[tool.uv.sources]
xformers = { index = "pytorch" }

[[tool.uv.index]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cu129"
explicit = true

[build-system]
requires = ["uv_build>=0.9.8,<0.10.0"]
build-backend = "uv_build"

[dependency-groups]
dev = [
    "scikit-image>=0.25.2",
]
```
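The `torch~=2.7` pin is a compatible-release specifier: it allows any torch 2.x at or above 2.7 but excludes 3.0. A quick check with the `packaging` library (an assumption on my part; `packaging` is not declared in this file):

```python
from packaging.specifiers import SpecifierSet

# "~=2.7" is equivalent to ">=2.7, ==2.*", as pinned in pyproject.toml above.
spec = SpecifierSet("~=2.7")
assert "2.7.1" in spec and "2.9.0" in spec
assert "3.0.0" not in spec
```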
packages/ltx-core/src/ltx_core/__init__.py ADDED
File without changes

packages/ltx-core/src/ltx_core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (176 Bytes).

packages/ltx-core/src/ltx_core/__pycache__/tiling.cpython-310.pyc ADDED
Binary file (10.7 kB).

packages/ltx-core/src/ltx_core/__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.85 kB).

packages/ltx-core/src/ltx_core/guidance/__init__.py ADDED
File without changes

packages/ltx-core/src/ltx_core/guidance/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (185 Bytes).

packages/ltx-core/src/ltx_core/guidance/__pycache__/perturbations.cpython-310.pyc ADDED
Binary file (3.82 kB).
packages/ltx-core/src/ltx_core/guidance/perturbations.py ADDED
@@ -0,0 +1,74 @@
```python
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Andrew Kvochko

from dataclasses import dataclass
from enum import Enum

import torch
from torch._prims_common import DeviceLikeType


class PerturbationType(Enum):
    SKIP_A2V_CROSS_ATTN = "skip_a2v_cross_attn"
    SKIP_V2A_CROSS_ATTN = "skip_v2a_cross_attn"
    SKIP_VIDEO_SELF_ATTN = "skip_video_self_attn"
    SKIP_AUDIO_SELF_ATTN = "skip_audio_self_attn"


@dataclass(frozen=True)
class Perturbation:
    type: PerturbationType
    blocks: list[int] | None  # None means all blocks

    def is_perturbed(self, perturbation_type: PerturbationType, block: int) -> bool:
        if self.type != perturbation_type:
            return False

        if self.blocks is None:
            return True

        return block in self.blocks


@dataclass(frozen=True)
class PerturbationConfig:
    perturbations: list[Perturbation] | None

    def is_perturbed(self, perturbation_type: PerturbationType, block: int) -> bool:
        if self.perturbations is None:
            return False

        return any(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)

    @staticmethod
    def empty() -> "PerturbationConfig":
        return PerturbationConfig([])


@dataclass(frozen=True)
class BatchedPerturbationConfig:
    perturbations: list[PerturbationConfig]

    def mask(
        self, perturbation_type: PerturbationType, block: int, device: DeviceLikeType, dtype: torch.dtype
    ) -> torch.Tensor:
        mask = torch.ones((len(self.perturbations),), device=device, dtype=dtype)
        for batch_idx, perturbation in enumerate(self.perturbations):
            if perturbation.is_perturbed(perturbation_type, block):
                mask[batch_idx] = 0

        return mask

    def mask_like(self, perturbation_type: PerturbationType, block: int, values: torch.Tensor) -> torch.Tensor:
        mask = self.mask(perturbation_type, block, values.device, values.dtype)
        return mask.view(mask.numel(), *([1] * len(values.shape[1:])))

    def any_in_batch(self, perturbation_type: PerturbationType, block: int) -> bool:
        return any(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)

    def all_in_batch(self, perturbation_type: PerturbationType, block: int) -> bool:
        return all(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)

    @staticmethod
    def empty(batch_size: int) -> "BatchedPerturbationConfig":
        return BatchedPerturbationConfig([PerturbationConfig.empty() for _ in range(batch_size)])
```
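A minimal sketch (my own, not part of the commit) of how these configs compose: zeroing out block 3's audio-to-video cross-attention contribution for the second batch element only, via the broadcastable mask that `mask_like` returns.

```python
import torch

from ltx_core.guidance.perturbations import (
    BatchedPerturbationConfig, Perturbation, PerturbationConfig, PerturbationType,
)

cfg = BatchedPerturbationConfig([
    PerturbationConfig.empty(),  # batch element 0: untouched
    PerturbationConfig([Perturbation(PerturbationType.SKIP_A2V_CROSS_ATTN, blocks=[3])]),
])

attn_out = torch.randn(2, 16, 64)  # [batch, tokens, dim]
mask = cfg.mask_like(PerturbationType.SKIP_A2V_CROSS_ATTN, block=3, values=attn_out)
assert mask.shape == (2, 1, 1)     # broadcastable over the non-batch dims

gated = attn_out * mask            # element 1's contribution is skipped
assert torch.all(gated[1] == 0) and torch.equal(gated[0], attn_out[0])
```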
packages/ltx-core/src/ltx_core/legacy_tiling.py ADDED
@@ -0,0 +1,258 @@
```python
import logging
from collections.abc import Generator

import torch

from ltx_core.model.video_vae.video_vae import Decoder


def compute_chunk_boundaries(
    chunk_start: int,
    temporal_tile_length: int,
    temporal_overlap: int,
    total_latent_frames: int,
) -> tuple[int, int]:
    """Compute chunk boundaries for temporal tiling.

    Args:
        chunk_start: Starting frame index for the current chunk
        temporal_tile_length: Length of each temporal tile
        temporal_overlap: Number of frames to overlap between chunks
        total_latent_frames: Total number of latent frames

    Returns:
        Tuple of (overlap_start, chunk_end)
    """
    if chunk_start == 0:
        # First chunk: no overlap needed
        chunk_end = min(chunk_start + temporal_tile_length, total_latent_frames)
        overlap_start = chunk_start
    else:
        # Subsequent chunks: include overlap from previous chunk
        # -1 because we need one extra frame to overlap, which is decoded to a single frame
        # never overlap with the first latent frame
        overlap_start = max(1, chunk_start - temporal_overlap - 1)
        extra_frames = chunk_start - overlap_start
        chunk_end = min(
            chunk_start + temporal_tile_length - extra_frames,
            total_latent_frames,
        )

    return overlap_start, chunk_end


def spatial_decode(  # noqa
    decoder: Decoder,
    samples: torch.Tensor,
    horizontal_tiles: int,
    vertical_tiles: int,
    overlap: int,
    last_frame_fix: bool,
    scale_factors: tuple[float, float, float],
    timestep: float,
    generator: torch.Generator,
) -> torch.Tensor:
    if last_frame_fix:
        # Repeat the last frame along dimension 2 (frames)
        # samples shape - [batch, channels, frames, height, width]
        last_frame = samples[:, :, -1:, :, :]
        samples = torch.cat([samples, last_frame], dim=2)

    batch, _, frames, height, width = samples.shape
    time_scale_factor, width_scale_factor, height_scale_factor = scale_factors
    image_frames = 1 + (frames - 1) * time_scale_factor

    # Calculate output image dimensions
    output_height = height * height_scale_factor
    output_width = width * width_scale_factor

    # Calculate tile sizes with overlap
    base_tile_height = (height + (vertical_tiles - 1) * overlap) // vertical_tiles
    base_tile_width = (width + (horizontal_tiles - 1) * overlap) // horizontal_tiles

    # Initialize output tensor and weight tensor
    # VAE decode returns images in format [batch, height, width, channels]
    output = None
    weights = None

    target_device = samples.device
    target_dtype = samples.dtype

    output = torch.zeros(
        (
            batch,
            3,
            image_frames,
            output_height,
            output_width,
        ),
        device=target_device,
        dtype=target_dtype,
    )
    weights = torch.zeros(
        (batch, 1, image_frames, output_height, output_width),
        device=target_device,
        dtype=target_dtype,
    )

    # Process each tile
    for v in range(vertical_tiles):
        for h in range(horizontal_tiles):
            # Calculate tile boundaries
            h_start = h * (base_tile_width - overlap)
            v_start = v * (base_tile_height - overlap)

            # Adjust end positions for edge tiles
            h_end = min(h_start + base_tile_width, width) if h < horizontal_tiles - 1 else width
            v_end = min(v_start + base_tile_height, height) if v < vertical_tiles - 1 else height

            # Calculate actual tile dimensions
            tile_height = v_end - v_start
            tile_width = h_end - h_start

            logging.info(f"Processing VAE decode tile at row {v}, col {h}:")
            logging.info(f"  Position: ({v_start}:{v_end}, {h_start}:{h_end})")
            logging.info(f"  Size: {tile_height}x{tile_width}")

            # Extract tile
            tile = samples[:, :, :, v_start:v_end, h_start:h_end]

            # Decode the tile
            decoded_tile = decoder.decode(tile, timestep, generator)

            # Calculate output tile boundaries
            out_h_start = v_start * height_scale_factor
            out_h_end = v_end * height_scale_factor
            out_w_start = h_start * width_scale_factor
            out_w_end = h_end * width_scale_factor

            # Create weight mask for this tile
            tile_out_height = out_h_end - out_h_start
            tile_out_width = out_w_end - out_w_start
            tile_weights = torch.ones(
                (batch, 1, image_frames, tile_out_height, tile_out_width),
                device=decoded_tile.device,
                dtype=decoded_tile.dtype,
            )

            # Calculate overlap regions in output space
            overlap_out_h = overlap * height_scale_factor
            overlap_out_w = overlap * width_scale_factor

            # Apply horizontal blending weights
            if h > 0:  # Left overlap
                h_blend = torch.linspace(0, 1, overlap_out_w, device=decoded_tile.device)
                tile_weights[:, :, :, :, :overlap_out_w] *= h_blend
            if h < horizontal_tiles - 1:  # Right overlap
                h_blend = torch.linspace(1, 0, overlap_out_w, device=decoded_tile.device)
                tile_weights[:, :, :, :, -overlap_out_w:] *= h_blend

            # Apply vertical blending weights
            if v > 0:  # Top overlap
                v_blend = torch.linspace(0, 1, overlap_out_h, device=decoded_tile.device)
                tile_weights[:, :, :, :overlap_out_h, :] *= v_blend.view(1, 1, 1, -1, 1)
            if v < vertical_tiles - 1:  # Bottom overlap
                v_blend = torch.linspace(1, 0, overlap_out_h, device=decoded_tile.device)
                tile_weights[:, :, :, -overlap_out_h:, :] *= v_blend.view(1, 1, 1, -1, 1)

            # Add weighted tile to output
            output[:, :, :, out_h_start:out_h_end, out_w_start:out_w_end] += (decoded_tile * tile_weights).to(
                target_device, target_dtype
            )

            # Add weights to weight tensor
            weights[:, :, :, out_h_start:out_h_end, out_w_start:out_w_end] += tile_weights.to(
                target_device, target_dtype
            )

    # Normalize by weights
    output /= weights + 1e-8
    # LT_INTERNAL: changed from output[:-time_scale_factor, :, :]!
    if last_frame_fix:
        output = output[:, :, :-time_scale_factor, :, :]

    return output


def decode_spatial_temporal(
    decoder: Decoder,
    samples: torch.Tensor,
    timestep: float,
    generator: torch.Generator,
    scale_factors: tuple[float, float, float],
    spatial_tiles: int = 4,
    spatial_overlap: int = 1,
    temporal_tile_length: int = 16,
    temporal_overlap: int = 1,
    last_frame_fix: bool = False,
) -> Generator[torch.Tensor, None, None]:
    if temporal_tile_length < temporal_overlap + 1:
        raise ValueError("Temporal tile length must be greater than temporal overlap + 1")

    _, _, frames, _, _ = samples.shape
    time_scale_factor, _, _ = scale_factors

    # Process temporal chunks similar to reference function
    total_latent_frames = frames
    chunk_start = 0

    previous_tile = None
    while chunk_start < total_latent_frames:
        # Calculate chunk boundaries
        overlap_start, chunk_end = compute_chunk_boundaries(
            chunk_start, temporal_tile_length, temporal_overlap, total_latent_frames
        )

        # units are latent frames
        chunk_frames = chunk_end - overlap_start
        logging.info(f"Processing temporal chunk: {overlap_start}:{chunk_end} ({chunk_frames} latent frames)")

        # Extract tile
        tile = samples[:, :, overlap_start:chunk_end]

        # Decode the tile
        decoded_tile = spatial_decode(
            decoder,
            tile,
            spatial_tiles,
            spatial_tiles,
            spatial_overlap,
            last_frame_fix,
            scale_factors,
            timestep,
            generator,
        )

        if previous_tile is None:
            previous_tile = decoded_tile
        else:
            # Drop first frame if needed (overlap)
            if decoded_tile.shape[2] == 1:
                raise ValueError("Dropping first frame but tile has only 1 frame")
            decoded_tile = decoded_tile[:, :, 1:]  # Drop first frame

            # Create weight mask for this tile
            # -1 is for dropped frame above
            overlap_frames = temporal_overlap * time_scale_factor
            frame_weights = torch.linspace(
                0,
                1,
                overlap_frames + 2,
                device=decoded_tile.device,
                dtype=decoded_tile.dtype,
            )[1:-1]
            tile_weights = frame_weights.view(1, 1, -1, 1, 1)

            previous_tile[:, :, -overlap_frames:] = (
                previous_tile[:, :, -overlap_frames:] * (1 - tile_weights)
                + decoded_tile[:, :, :overlap_frames] * tile_weights
            )
            resulting_tile = previous_tile[:, :, :-overlap_frames]
            decoded_tile[:, :, :overlap_frames] = previous_tile[:, :, -overlap_frames:]
            yield resulting_tile
            previous_tile = decoded_tile

        # Move to next chunk
        chunk_start = chunk_end

    yield decoded_tile
```
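To make the overlap bookkeeping in `compute_chunk_boundaries` concrete, here is a small trace (my own sketch, assuming the module imports cleanly): for 40 latent frames with tile length 16 and overlap 1, each chunk after the first starts two frames early (the overlap frame plus the extra frame noted in the comments) and re-decodes them for blending.

```python
from ltx_core.legacy_tiling import compute_chunk_boundaries

total, tile_len, overlap = 40, 16, 1
chunk_start = 0
while chunk_start < total:
    overlap_start, chunk_end = compute_chunk_boundaries(chunk_start, tile_len, overlap, total)
    print(overlap_start, chunk_end)  # prints (0, 16), then (14, 30), then (28, 40)
    chunk_start = chunk_end          # same advance rule as decode_spatial_temporal
```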
packages/ltx-core/src/ltx_core/loader/.ipynb_checkpoints/sd_ops-checkpoint.py ADDED
@@ -0,0 +1,107 @@
```python
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov

from dataclasses import dataclass, replace
#from typing import NamedTuple, Protocol, Self
from typing import NamedTuple, Protocol
from typing_extensions import Self

import torch


@dataclass(frozen=True, slots=True)
class ContentReplacement:
    content: str
    replacement: str


@dataclass(frozen=True, slots=True)
class ContentMatching:
    prefix: str = ""
    suffix: str = ""


class KeyValueOperationResult(NamedTuple):
    new_key: str
    new_value: torch.Tensor


class KeyValueOperation(Protocol):
    def __call__(self, tensor_key: str, tensor_value: torch.Tensor) -> list[KeyValueOperationResult]: ...


@dataclass(frozen=True, slots=True)
class SDKeyValueOperation:
    key_matcher: ContentMatching
    kv_operation: KeyValueOperation


@dataclass(frozen=True, slots=True)
class SDOps:
    """Immutable class representing state dict key operations."""

    name: str
    mapping: tuple[
        ContentReplacement | ContentMatching | SDKeyValueOperation, ...
    ] = ()  # Immutable tuple of (key, value) pairs

    def with_replacement(self, content: str, replacement: str) -> Self:
        """Create a new SDOps instance with the specified replacement added to the mapping."""

        new_mapping = (*self.mapping, ContentReplacement(content, replacement))
        return replace(self, mapping=new_mapping)

    def with_matching(self, prefix: str = "", suffix: str = "") -> Self:
        """Create a new SDOps instance with the specified prefix and suffix matching added to the mapping."""

        new_mapping = (*self.mapping, ContentMatching(prefix, suffix))
        return replace(self, mapping=new_mapping)

    def with_kv_operation(
        self,
        operation: KeyValueOperation,
        key_prefix: str = "",
        key_suffix: str = "",
    ) -> Self:
        """Create a new SDOps instance with the specified value operation added to the mapping."""
        key_matcher = ContentMatching(key_prefix, key_suffix)
        sd_kv_operation = SDKeyValueOperation(key_matcher, operation)
        new_mapping = (*self.mapping, sd_kv_operation)
        return replace(self, mapping=new_mapping)

    def apply_to_key(self, key: str) -> str | None:
        """Apply the mapping to the given name."""
        matchers = [content for content in self.mapping if isinstance(content, ContentMatching)]
        valid = any(key.startswith(f.prefix) and key.endswith(f.suffix) for f in matchers)
        if not valid:
            return None

        for replacement in self.mapping:
            if not isinstance(replacement, ContentReplacement):
                continue
            if replacement.content in key:
                key = key.replace(replacement.content, replacement.replacement)
        return key

    def apply_to_key_value(self, key: str, value: torch.Tensor) -> list[KeyValueOperationResult]:
        """Apply the value operation to the given name and associated value."""
        for operation in self.mapping:
            if not isinstance(operation, SDKeyValueOperation):
                continue
            if key.startswith(operation.key_matcher.prefix) and key.endswith(operation.key_matcher.suffix):
                return operation.kv_operation(key, value)
        return [KeyValueOperationResult(key, value)]


# Predefined SDOps instances
LTXV_LORA_COMFY_RENAMING_MAP = (
    SDOps("LTXV_LORA_COMFY_PREFIX_MAP").with_matching().with_replacement("diffusion_model.", "")
)

LTXV_LORA_COMFY_TARGET_MAP = (
    SDOps("LTXV_LORA_COMFY_TARGET_MAP")
    .with_matching()
    .with_replacement("diffusion_model.", "")
    .with_replacement(".lora_A.weight", ".weight")
    .with_replacement(".lora_B.weight", ".weight")
)
```

packages/ltx-core/src/ltx_core/loader/__init__.py ADDED
File without changes

packages/ltx-core/src/ltx_core/loader/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (183 Bytes).

packages/ltx-core/src/ltx_core/loader/__pycache__/fuse_loras.cpython-310.pyc ADDED
Binary file (2.75 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/kernels.cpython-310.pyc ADDED
Binary file (1.58 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/module_ops.cpython-310.pyc ADDED
Binary file (558 Bytes).

packages/ltx-core/src/ltx_core/loader/__pycache__/primitives.cpython-310.pyc ADDED
Binary file (3.09 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/registry.cpython-310.pyc ADDED
Binary file (3.53 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/sd_ops.cpython-310.pyc ADDED
Binary file (4.36 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/sft_loader.cpython-310.pyc ADDED
Binary file (2.65 kB).

packages/ltx-core/src/ltx_core/loader/__pycache__/single_gpu_model_builder.cpython-310.pyc ADDED
Binary file (5.04 kB).
packages/ltx-core/src/ltx_core/loader/fuse_loras.py ADDED
@@ -0,0 +1,102 @@
```python
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
import torch
import triton

from ltx_core.loader.kernels import fused_add_round_kernel
from ltx_core.loader.primitives import LoraStateDictWithStrength, StateDict

BLOCK_SIZE = 1024


def fused_add_round_launch(target_weight: torch.Tensor, original_weight: torch.Tensor, seed: int) -> torch.Tensor:
    if original_weight.dtype == torch.float8_e4m3fn:
        exponent_bits, mantissa_bits, exponent_bias = 4, 3, 7
    elif original_weight.dtype == torch.float8_e5m2:
        exponent_bits, mantissa_bits, exponent_bias = 5, 2, 15  # noqa: F841
    else:
        raise ValueError("Unsupported dtype")

    if target_weight.dtype != torch.bfloat16:
        raise ValueError("target_weight dtype must be bfloat16")

    # Calculate grid and block sizes
    n_elements = original_weight.numel()
    grid = (triton.cdiv(n_elements, BLOCK_SIZE),)

    # Launch kernel
    fused_add_round_kernel[grid](
        original_weight,
        target_weight,
        seed,
        n_elements,
        exponent_bias,
        mantissa_bits,
        BLOCK_SIZE,
    )
    return target_weight


def calculate_weight_float8_(target_weights: torch.Tensor, original_weights: torch.Tensor) -> torch.Tensor:
    result = fused_add_round_launch(target_weights, original_weights, seed=0).to(target_weights.dtype)
    target_weights.copy_(result, non_blocking=True)
    return target_weights


def _prepare_deltas(
    lora_sd_and_strengths: list[LoraStateDictWithStrength], key: str, dtype: torch.dtype, device: torch.device
) -> torch.Tensor | None:
    deltas = []
    prefix = key[: -len(".weight")]
    key_a = f"{prefix}.lora_A.weight"
    key_b = f"{prefix}.lora_B.weight"
    for lsd, coef in lora_sd_and_strengths:
        if key_a not in lsd.sd or key_b not in lsd.sd:
            continue
        product = torch.matmul(lsd.sd[key_b] * coef, lsd.sd[key_a])
        deltas.append(product.to(dtype=dtype, device=device))
    if len(deltas) == 0:
        return None
    elif len(deltas) == 1:
        return deltas[0]
    return torch.sum(torch.stack(deltas, dim=0), dim=0)


def apply_loras(
    model_sd: StateDict,
    lora_sd_and_strengths: list[LoraStateDictWithStrength],
    dtype: torch.dtype,
    destination_sd: StateDict | None = None,
) -> StateDict:
    sd = {}
    if destination_sd is not None:
        sd = destination_sd.sd
    size = 0
    device = torch.device("meta")
    inner_dtypes = set()
    for key, weight in model_sd.sd.items():
        if weight is None:
            continue
        device = weight.device
        target_dtype = dtype if dtype is not None else weight.dtype
        deltas_dtype = target_dtype if target_dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else torch.bfloat16
        deltas = _prepare_deltas(lora_sd_and_strengths, key, deltas_dtype, device)
        if deltas is None:
            if key in sd:
                continue
            deltas = weight.clone().to(dtype=target_dtype, device=device)
        elif weight.dtype == torch.float8_e4m3fn:
            if str(device).startswith("cuda"):
                deltas = calculate_weight_float8_(deltas, weight)
            else:
                deltas.add_(weight.to(dtype=deltas.dtype, device=device))
        elif weight.dtype == torch.bfloat16:
            deltas.add_(weight)
        else:
            raise ValueError(f"Unsupported dtype: {weight.dtype}")
        sd[key] = deltas.to(dtype=target_dtype)
        inner_dtypes.add(target_dtype)
        size += deltas.nbytes
    if destination_sd is not None:
        return destination_sd
    return StateDict(sd, device, size, inner_dtypes)
```
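The contraction in `_prepare_deltas` is the standard LoRA fusion identity W' = W + s · (B @ A). A standalone sketch of the same arithmetic (my own, with hypothetical shapes):

```python
import torch

# Hypothetical shapes: a 64x64 layer with rank-8 LoRA factors.
W = torch.randn(64, 64)
A = torch.randn(8, 64)   # lora_A: [rank, in_features]
B = torch.randn(64, 8)   # lora_B: [out_features, rank]
strength = 0.75

# Same contraction as _prepare_deltas: matmul(B * strength, A) has W's shape,
# so the delta can be added straight into the base weight.
delta = torch.matmul(B * strength, A)
W_fused = W + delta
assert W_fused.shape == W.shape
```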
packages/ltx-core/src/ltx_core/loader/kernels.py ADDED
@@ -0,0 +1,74 @@
```python
# ruff: noqa: ANN001, ANN201, ERA001, N803, N806
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
import triton
import triton.language as tl


@triton.jit
def fused_add_round_kernel(
    x_ptr,
    output_ptr,  # contents will be added to the output
    seed,
    n_elements,
    EXPONENT_BIAS,
    MANTISSA_BITS,
    BLOCK_SIZE: tl.constexpr,
):
    """
    A kernel to upcast 8bit quantized weights to bfloat16 with stochastic rounding
    and add them to bfloat16 output weights. Might be used to upcast original model weights
    and to further add them to precalculated deltas coming from LoRAs.
    """
    # Get program ID and compute offsets
    pid = tl.program_id(axis=0)
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    # Load data
    x = tl.load(x_ptr + offsets, mask=mask)
    rand_vals = tl.rand(seed, offsets) - 0.5

    x = tl.cast(x, tl.float16)
    delta = tl.load(output_ptr + offsets, mask=mask)
    delta = tl.cast(delta, tl.float16)
    x = x + delta

    x_bits = tl.cast(x, tl.int16, bitcast=True)

    # Calculate the exponent. Unbiased fp16 exponent is ((x_bits & 0x7C00) >> 10) - 15 for
    # normal numbers and -14 for subnormals.
    fp16_exponent_bits = (x_bits & 0x7C00) >> 10
    fp16_normals = fp16_exponent_bits > 0
    fp16_exponent = tl.where(fp16_normals, fp16_exponent_bits - 15, -14)

    # Add the target dtype's exponent bias and clamp to the target dtype's exponent range.
    exponent = fp16_exponent + EXPONENT_BIAS
    MAX_EXPONENT = 2 * EXPONENT_BIAS + 1
    exponent = tl.where(exponent > MAX_EXPONENT, MAX_EXPONENT, exponent)
    exponent = tl.where(exponent < 0, 0, exponent)

    # Normal ULP exponent, expressed as an fp16 exponent field:
    # (exponent - EXPONENT_BIAS - MANTISSA_BITS) + 15
    # Simplifies to: fp16_exponent - MANTISSA_BITS + 15
    # See https://en.wikipedia.org/wiki/Unit_in_the_last_place
    eps_exp = tl.maximum(0, tl.minimum(31, exponent - EXPONENT_BIAS - MANTISSA_BITS + 15))

    # Calculate epsilon in the target dtype
    eps_normal = tl.cast(tl.cast(eps_exp << 10, tl.int16), tl.float16, bitcast=True)

    # Subnormal ULP: 2^(1 - EXPONENT_BIAS - MANTISSA_BITS) ->
    # fp16 exponent bits: (1 - EXPONENT_BIAS - MANTISSA_BITS) + 15 =
    # 16 - EXPONENT_BIAS - MANTISSA_BITS
    eps_subnormal = tl.cast(tl.cast((16 - EXPONENT_BIAS - MANTISSA_BITS) << 10, tl.int16), tl.float16, bitcast=True)
    eps = tl.where(exponent > 0, eps_normal, eps_subnormal)

    # Apply zero mask to epsilon
    eps = tl.where(x == 0, 0.0, eps)

    # Apply stochastic rounding
    output = tl.cast(x + rand_vals * eps, tl.bfloat16)

    # Store the result
    tl.store(output_ptr + offsets, output, mask=mask)
```
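For intuition, here is a rough pure-PyTorch analogue of the kernel's rounding step (my own sketch; it handles normals only and skips the fp16 bit manipulation and fused add the kernel performs). The idea is the same: perturb the value by up to half a ULP of the target format before casting, so rounding is unbiased on average.

```python
import torch

def stochastic_round_to_bf16(x: torch.Tensor) -> torch.Tensor:
    """Round to bfloat16 stochastically, like the kernel's `x + rand * ulp` trick
    (normals only; subnormals are approximated)."""
    xf = x.float()
    # frexp: xf = mantissa * 2**exp with mantissa in [0.5, 1); a bfloat16 value
    # with that exponent has ULP 2**(exp - 8) (8 significand bits in total).
    _, exp = torch.frexp(xf)
    ulp = torch.ldexp(torch.ones_like(xf), exp - 8)
    noise = (torch.rand_like(xf) - 0.5) * ulp
    out = (xf + noise).to(torch.bfloat16)
    return torch.where(xf == 0, torch.zeros_like(out), out)

# 1 + 2**-8 lies exactly halfway between two adjacent bfloat16 values (1.0 and
# 1.0078125), so the stochastic rounds should average out to the true value.
x = torch.full((100_000,), 1.0 + 2.0 ** -8)
print(stochastic_round_to_bf16(x).float().mean())  # ~1.0039
```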
packages/ltx-core/src/ltx_core/loader/module_ops.py ADDED
@@ -0,0 +1,11 @@
```python
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
from typing import Callable, NamedTuple

import torch


class ModuleOps(NamedTuple):
    name: str
    matcher: Callable[[torch.nn.Module], bool]
    mutator: Callable[[torch.nn.Module], torch.nn.Module]
```
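`ModuleOps` is just a named (matcher, mutator) pair that a model builder can apply while constructing a module tree. A hypothetical instance (mine, not from this repo) that would upcast every `LayerNorm` to float32:

```python
import torch

from ltx_core.loader.module_ops import ModuleOps

# Hypothetical example: match LayerNorm submodules and upcast them to fp32.
upcast_layernorms = ModuleOps(
    name="upcast_layernorms_fp32",
    matcher=lambda m: isinstance(m, torch.nn.LayerNorm),
    mutator=lambda m: m.to(torch.float32),
)

# A builder honoring ModuleOps would apply `mutator` wherever `matcher` is true:
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.LayerNorm(4))
for name, module in model.named_children():
    if upcast_layernorms.matcher(module):
        setattr(model, name, upcast_layernorms.mutator(module))
```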
packages/ltx-core/src/ltx_core/loader/primitives.py ADDED
@@ -0,0 +1,63 @@
```python
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
from dataclasses import dataclass
from typing import NamedTuple, Protocol

import torch

from ltx_core.loader.module_ops import ModuleOps
from ltx_core.loader.sd_ops import SDOps
from ltx_core.model.model_protocol import ModelType


@dataclass(frozen=True)
class StateDict:
    sd: dict
    device: torch.device
    size: int
    dtype: set[torch.dtype]

    def footprint(self) -> tuple[int, torch.device]:
        return self.size, self.device


class StateDictLoader(Protocol):
    def metadata(self, path: str) -> dict:
        """
        Load metadata from path
        """

    def load(self, path: str | list[str], sd_ops: SDOps | None = None, device: torch.device | None = None) -> StateDict:
        """
        Load state dict from path or paths (for sharded model storage) and apply sd_ops
        """


class ModelBuilderProtocol(Protocol[ModelType]):
    def meta_model(self, config: dict, module_ops: list[ModuleOps] | None = None) -> ModelType: ...

    def build(self, dtype: torch.dtype | None = None) -> ModelType:
        """
        Build the model
        Args:
            dtype: Target dtype for the model, if None, uses the dtype of the model_path model
        Returns:
            Model instance
        """
        ...


class LoRAAdaptableProtocol(Protocol):
    def lora(self, lora_path: str, strength: float) -> "LoRAAdaptableProtocol":
        pass


class LoraPathStrengthAndSDOps(NamedTuple):
    path: str
    strength: float
    sd_ops: SDOps


class LoraStateDictWithStrength(NamedTuple):
    state_dict: StateDict
    strength: float
```
packages/ltx-core/src/ltx_core/loader/registry.py
ADDED
@@ -0,0 +1,68 @@
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
import hashlib
import threading
from dataclasses import dataclass, field
from pathlib import Path
from typing import Protocol

from ltx_core.loader.primitives import StateDict
from ltx_core.loader.sd_ops import SDOps


class Registry(Protocol):
    def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> str | None: ...

    def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None: ...

    def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None: ...

    def clear(self) -> None: ...


class DummyRegistry(Registry):
    def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> str | None:
        pass

    def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
        pass

    def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
        pass

    def clear(self) -> None:
        pass


@dataclass
class StateDictRegistry(Registry):
    _state_dicts: dict[str, StateDict] = field(default_factory=dict)
    _lock: threading.Lock = field(default_factory=threading.Lock)

    def _generate_id(self, paths: list[str], sd_ops: SDOps | None) -> str:
        m = hashlib.sha256()
        parts = [str(Path(p).resolve()) for p in paths]
        if sd_ops is not None:
            parts.append(sd_ops.name)
        m.update("\0".join(parts).encode("utf-8"))
        return m.hexdigest()

    def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> str:
        sd_id = self._generate_id(paths, sd_ops)
        with self._lock:
            if sd_id in self._state_dicts:
                raise ValueError(f"State dict retrieved from {paths} with {sd_ops} already added, check with get first")
            self._state_dicts[sd_id] = state_dict
        return sd_id

    def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
        with self._lock:
            return self._state_dicts.pop(self._generate_id(paths, sd_ops), None)

    def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
        with self._lock:
            return self._state_dicts.get(self._generate_id(paths, sd_ops), None)

    def clear(self) -> None:
        with self._lock:
            self._state_dicts.clear()
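A minimal usage sketch of StateDictRegistry, assuming the StateDict keyword fields used in sft_loader.py below (the checkpoint path is hypothetical):

import torch
from ltx_core.loader.primitives import StateDict
from ltx_core.loader.registry import StateDictRegistry

registry = StateDictRegistry()
paths = ["checkpoints/model.safetensors"]  # hypothetical path

# Cache a loaded state dict under an ID derived from the resolved paths
# (and the SDOps name, when one is supplied).
sd = StateDict(sd={"w": torch.zeros(1)}, device=torch.device("cpu"), size=4, dtype={torch.float32})
registry.add(paths, sd_ops=None, state_dict=sd)

assert registry.get(paths, sd_ops=None) is sd  # cache hit
registry.pop(paths, sd_ops=None)               # remove a single entry
registry.clear()                               # drop everything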
packages/ltx-core/src/ltx_core/loader/sd_ops.py
ADDED
@@ -0,0 +1,107 @@
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov

from dataclasses import dataclass, replace
from typing import NamedTuple, Protocol

# typing.Self requires Python >= 3.11, so it is taken from typing_extensions.
from typing_extensions import Self

import torch


@dataclass(frozen=True, slots=True)
class ContentReplacement:
    content: str
    replacement: str


@dataclass(frozen=True, slots=True)
class ContentMatching:
    prefix: str = ""
    suffix: str = ""


class KeyValueOperationResult(NamedTuple):
    new_key: str
    new_value: torch.Tensor


class KeyValueOperation(Protocol):
    def __call__(self, tensor_key: str, tensor_value: torch.Tensor) -> list[KeyValueOperationResult]: ...


@dataclass(frozen=True, slots=True)
class SDKeyValueOperation:
    key_matcher: ContentMatching
    kv_operation: KeyValueOperation


@dataclass(frozen=True, slots=True)
class SDOps:
    """Immutable class representing state dict key operations."""

    name: str
    mapping: tuple[
        ContentReplacement | ContentMatching | SDKeyValueOperation, ...
    ] = ()  # Immutable tuple of matching, replacement, and key/value operations

    def with_replacement(self, content: str, replacement: str) -> Self:
        """Create a new SDOps instance with the specified replacement added to the mapping."""

        new_mapping = (*self.mapping, ContentReplacement(content, replacement))
        return replace(self, mapping=new_mapping)

    def with_matching(self, prefix: str = "", suffix: str = "") -> Self:
        """Create a new SDOps instance with the specified prefix and suffix matching added to the mapping."""

        new_mapping = (*self.mapping, ContentMatching(prefix, suffix))
        return replace(self, mapping=new_mapping)

    def with_kv_operation(
        self,
        operation: KeyValueOperation,
        key_prefix: str = "",
        key_suffix: str = "",
    ) -> Self:
        """Create a new SDOps instance with the specified value operation added to the mapping."""
        key_matcher = ContentMatching(key_prefix, key_suffix)
        sd_kv_operation = SDKeyValueOperation(key_matcher, operation)
        new_mapping = (*self.mapping, sd_kv_operation)
        return replace(self, mapping=new_mapping)

    def apply_to_key(self, key: str) -> str | None:
        """Apply the mapping to the given key; return None if no matcher accepts it."""
        matchers = [content for content in self.mapping if isinstance(content, ContentMatching)]
        valid = any(key.startswith(f.prefix) and key.endswith(f.suffix) for f in matchers)
        if not valid:
            return None

        for replacement in self.mapping:
            if not isinstance(replacement, ContentReplacement):
                continue
            if replacement.content in key:
                key = key.replace(replacement.content, replacement.replacement)
        return key

    def apply_to_key_value(self, key: str, value: torch.Tensor) -> list[KeyValueOperationResult]:
        """Apply the first matching key/value operation to the given key and its tensor."""
        for operation in self.mapping:
            if not isinstance(operation, SDKeyValueOperation):
                continue
            if key.startswith(operation.key_matcher.prefix) and key.endswith(operation.key_matcher.suffix):
                return operation.kv_operation(key, value)
        return [KeyValueOperationResult(key, value)]


# Predefined SDOps instances
LTXV_LORA_COMFY_RENAMING_MAP = (
    SDOps("LTXV_LORA_COMFY_PREFIX_MAP").with_matching().with_replacement("diffusion_model.", "")
)

LTXV_LORA_COMFY_TARGET_MAP = (
    SDOps("LTXV_LORA_COMFY_TARGET_MAP")
    .with_matching()
    .with_replacement("diffusion_model.", "")
    .with_replacement(".lora_A.weight", ".weight")
    .with_replacement(".lora_B.weight", ".weight")
)
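A short sketch of how SDOps filtering and renaming behave (the "EXAMPLE_MAP" ops are hypothetical; the last line uses the predefined map above):

from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP, SDOps

# Custom ops: accept only keys under "model." and strip that prefix.
ops = SDOps("EXAMPLE_MAP").with_matching(prefix="model.").with_replacement("model.", "")
assert ops.apply_to_key("model.blocks.0.weight") == "blocks.0.weight"
assert ops.apply_to_key("optimizer.state") is None  # no matcher accepts it

# The predefined ComfyUI map accepts every key and drops the "diffusion_model." prefix.
assert LTXV_LORA_COMFY_RENAMING_MAP.apply_to_key("diffusion_model.proj.weight") == "proj.weight"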
packages/ltx-core/src/ltx_core/loader/sft_loader.py
ADDED
@@ -0,0 +1,53 @@
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
import json

import safetensors
import torch

from ltx_core.loader.primitives import StateDict, StateDictLoader
from ltx_core.loader.sd_ops import SDOps


class SafetensorsStateDictLoader(StateDictLoader):
    def metadata(self, path: str) -> dict:
        raise NotImplementedError("Not implemented")

    def load(self, path: str | list[str], sd_ops: SDOps | None, device: torch.device | None = None) -> StateDict:
        """
        Load state dict from path or paths (for sharded model storage) and apply sd_ops.
        """
        sd = {}
        size = 0
        dtype = set()
        device = device or torch.device("cpu")
        model_paths = path if isinstance(path, list) else [path]
        for shard_path in model_paths:
            with safetensors.safe_open(shard_path, framework="pt", device=str(device)) as f:
                safetensor_keys = f.keys()
                for name in safetensor_keys:
                    expected_name = name if sd_ops is None else sd_ops.apply_to_key(name)
                    if expected_name is None:
                        continue
                    value = f.get_tensor(name).to(device=device, non_blocking=True, copy=False)
                    key_value_pairs = ((expected_name, value),)
                    if sd_ops is not None:
                        key_value_pairs = sd_ops.apply_to_key_value(expected_name, value)
                    for key, value in key_value_pairs:
                        size += value.nbytes
                        dtype.add(value.dtype)
                        sd[key] = value

        return StateDict(sd=sd, device=device, size=size, dtype=dtype)


class SafetensorsModelStateDictLoader(StateDictLoader):
    def __init__(self, weight_loader: SafetensorsStateDictLoader | None = None):
        self.weight_loader = weight_loader if weight_loader is not None else SafetensorsStateDictLoader()

    def metadata(self, path: str) -> dict:
        with safetensors.safe_open(path, framework="pt") as f:
            return json.loads(f.metadata()["config"])

    def load(self, path: str | list[str], sd_ops: SDOps | None = None, device: torch.device | None = None) -> StateDict:
        return self.weight_loader.load(path, sd_ops, device)
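A minimal loading sketch ("model.safetensors" is a placeholder; sharded checkpoints are passed as a list of paths):

import torch
from ltx_core.loader.sft_loader import SafetensorsModelStateDictLoader

loader = SafetensorsModelStateDictLoader()
state_dict = loader.load("model.safetensors", device=torch.device("cpu"))
print(state_dict.size, state_dict.dtype)  # total bytes loaded and the set of tensor dtypes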
packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py
ADDED
@@ -0,0 +1,99 @@
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Alexey Kravtsov
import logging
from dataclasses import dataclass, field, replace
from typing import Generic

import torch

from ltx_core.loader.fuse_loras import apply_loras
from ltx_core.loader.module_ops import ModuleOps
from ltx_core.loader.primitives import (
    LoRAAdaptableProtocol,
    LoraPathStrengthAndSDOps,
    LoraStateDictWithStrength,
    ModelBuilderProtocol,
    StateDict,
    StateDictLoader,
)
from ltx_core.loader.registry import DummyRegistry, Registry
from ltx_core.loader.sd_ops import SDOps
from ltx_core.loader.sft_loader import SafetensorsModelStateDictLoader
from ltx_core.model.model_protocol import ModelConfigurator, ModelType

logger: logging.Logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class SingleGPUModelBuilder(Generic[ModelType], ModelBuilderProtocol[ModelType], LoRAAdaptableProtocol):
    model_class_configurator: type[ModelConfigurator[ModelType]]
    model_path: str | tuple[str, ...]
    model_sd_ops: SDOps | None = None
    module_ops: tuple[ModuleOps, ...] = field(default_factory=tuple)
    loras: tuple[LoraPathStrengthAndSDOps, ...] = field(default_factory=tuple)
    model_loader: StateDictLoader = field(default_factory=SafetensorsModelStateDictLoader)
    registry: Registry = field(default_factory=DummyRegistry)

    def lora(self, lora_path: str, strength: float = 1.0, sd_ops: SDOps | None = None) -> "SingleGPUModelBuilder":
        return replace(self, loras=(*self.loras, LoraPathStrengthAndSDOps(lora_path, strength, sd_ops)))

    def model_config(self) -> dict:
        first_shard_path = self.model_path[0] if isinstance(self.model_path, tuple) else self.model_path
        return self.model_loader.metadata(first_shard_path)

    def meta_model(self, config: dict, module_ops: tuple[ModuleOps, ...]) -> ModelType:
        with torch.device("meta"):
            model = self.model_class_configurator.from_config(config)
            for module_op in module_ops:
                if module_op.matcher(model):
                    model = module_op.mutator(model)
        return model

    def load_sd(
        self, paths: list[str], registry: Registry, device: torch.device | None, sd_ops: SDOps | None = None
    ) -> StateDict:
        state_dict = registry.get(paths, sd_ops)
        if state_dict is None:
            state_dict = self.model_loader.load(paths, sd_ops=sd_ops, device=device)
            registry.add(paths, sd_ops=sd_ops, state_dict=state_dict)
        return state_dict

    def _return_model(self, meta_model: ModelType, device: torch.device) -> ModelType:
        uninitialized_params = [name for name, param in meta_model.named_parameters() if str(param.device) == "meta"]
        uninitialized_buffers = [name for name, buffer in meta_model.named_buffers() if str(buffer.device) == "meta"]
        if uninitialized_params or uninitialized_buffers:
            logger.warning(f"Uninitialized parameters or buffers: {uninitialized_params + uninitialized_buffers}")
            return meta_model
        retval = meta_model.to(device)
        return retval

    def build(self, device: torch.device | None = None, dtype: torch.dtype | None = None) -> ModelType:
        device = torch.device("cuda") if device is None else device
        config = self.model_config()
        meta_model = self.meta_model(config, self.module_ops)
        model_paths = list(self.model_path) if isinstance(self.model_path, tuple) else [self.model_path]
        model_state_dict = self.load_sd(model_paths, sd_ops=self.model_sd_ops, registry=self.registry, device=device)

        lora_strengths = [lora.strength for lora in self.loras]
        if not lora_strengths or (min(lora_strengths) == 0 and max(lora_strengths) == 0):
            sd = model_state_dict.sd
            if dtype is not None:
                sd = {key: value.to(dtype=dtype) for key, value in model_state_dict.sd.items()}
            meta_model.load_state_dict(sd, strict=False, assign=True)
            return self._return_model(meta_model, device)

        lora_state_dicts = [
            self.load_sd([lora.path], sd_ops=lora.sd_ops, registry=self.registry, device=device) for lora in self.loras
        ]
        lora_sd_and_strengths = [
            LoraStateDictWithStrength(sd, strength)
            for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True)
        ]
        final_sd = apply_loras(
            model_sd=model_state_dict,
            lora_sd_and_strengths=lora_sd_and_strengths,
            dtype=dtype,
            destination_sd=model_state_dict if isinstance(self.registry, DummyRegistry) else None,
        )
        meta_model.load_state_dict(final_sd.sd, strict=False, assign=True)
        return self._return_model(meta_model, device)
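A sketch of composing the builder with a registry and one LoRA; the checkpoint and LoRA paths are hypothetical, and the configurator/SDOps names are taken from the transformer model_configurator imported by model_ledger below:

import torch
from ltx_core.loader.registry import StateDictRegistry
from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder
from ltx_core.model.transformer.model_configurator import (
    LTXV_MODEL_COMFY_RENAMING_MAP,
    LTXModelConfigurator,
)

builder = SingleGPUModelBuilder(
    model_class_configurator=LTXModelConfigurator,
    model_path="checkpoints/ltxv.safetensors",  # hypothetical path
    model_sd_ops=LTXV_MODEL_COMFY_RENAMING_MAP,
    registry=StateDictRegistry(),  # cache state dicts across builds
)
# .lora() is non-destructive: it returns a new builder with the LoRA appended.
builder = builder.lora("loras/style.safetensors", strength=0.7)  # hypothetical path
model = builder.build(device=torch.device("cuda"), dtype=torch.bfloat16)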
packages/ltx-core/src/ltx_core/model/.ipynb_checkpoints/model_ledger-checkpoint.py
ADDED
@@ -0,0 +1,253 @@
from dataclasses import replace

# typing.Self requires Python >= 3.11, so it is taken from typing_extensions.
from typing_extensions import Self

import torch

from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
from ltx_core.loader.registry import DummyRegistry, Registry
from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
from ltx_core.model.audio_vae.audio_vae import Decoder as AudioDecoder
from ltx_core.model.audio_vae.model_configurator import (
    AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
    VOCODER_COMFY_KEYS_FILTER,
    VocoderConfigurator,
)
from ltx_core.model.audio_vae.model_configurator import VAEDecoderConfigurator as AudioDecoderConfigurator
from ltx_core.model.audio_vae.vocoder import Vocoder
from ltx_core.model.clip.gemma.encoders.av_encoder import (
    AV_GEMMA_TEXT_ENCODER_KEY_OPS,
    AVGemmaTextEncoderModel,
    AVGemmaTextEncoderModelConfigurator,
)
from ltx_core.model.clip.gemma.encoders.base_encoder import module_ops_from_gemma_root
from ltx_core.model.transformer.model import X0Model
from ltx_core.model.transformer.model_configurator import (
    LTXV_MODEL_COMFY_RENAMING_MAP,
    LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
    UPCAST_DURING_INFERENCE,
    LTXModelConfigurator,
)
from ltx_core.model.upsampler.model import LatentUpsampler
from ltx_core.model.upsampler.model_configurator import LatentUpsamplerConfigurator
from ltx_core.model.video_vae.model_configurator import (
    VAE_DECODER_COMFY_KEYS_FILTER,
    VAE_ENCODER_COMFY_KEYS_FILTER,
    VAEDecoderConfigurator,
    VAEEncoderConfigurator,
)
from ltx_core.model.video_vae.video_vae import Decoder as VideoDecoder
from ltx_core.model.video_vae.video_vae import Encoder as VideoEncoder


class ModelLedger:
    """
    Central coordinator for loading, caching, and freeing models used in an LTX pipeline.

    The ledger wires together multiple model builders (transformer, video VAE encoder/decoder,
    audio VAE decoder, vocoder, text encoder, and optional latent upsampler) and exposes
    the resulting models as lazily constructed, cached attributes.

    ### Caching behavior

    Each model attribute (e.g. :attr:`transformer`, :attr:`video_decoder`, :attr:`text_encoder`)
    is implemented as a :func:`functools.cached_property`. The first time one of these
    attributes is accessed, the corresponding builder loads weights from the
    :class:`~ltx_core.loader.registry.StateDictRegistry`, instantiates the model on CPU with
    the configured ``dtype``, moves it to ``self.device``, and stores the result in
    the instance ``__dict__``. Subsequent accesses reuse the same model instance until it is
    explicitly cleared via :meth:`clear_vram`.

    ### Constructor parameters

    dtype:
        Torch dtype used when constructing all models (e.g. ``torch.float16``).
    device:
        Target device to which models are moved after construction (e.g. ``torch.device("cuda")``).
    checkpoint_path:
        Path to a checkpoint directory or file containing the core model weights
        (transformer, video VAE, audio VAE, text encoder, vocoder). If ``None``, the
        corresponding builders are not created and accessing those properties will raise
        a :class:`ValueError`.
    gemma_root_path:
        Base path to Gemma-compatible CLIP/text encoder weights. Required to
        initialize the text encoder builder; if omitted, :attr:`text_encoder` cannot be used.
    spatial_upsampler_path:
        Optional path to a latent upsampler checkpoint. If provided, the
        :attr:`upsampler` property becomes available; otherwise accessing it raises
        a :class:`ValueError`.
    loras:
        Optional collection of LoRA configurations (paths, strengths, and key operations)
        that are applied on top of the base transformer weights when building the model.

    ### Memory management

    ``clear_ram()``
        Clears the underlying :class:`Registry` cache of state dicts and triggers a
        Python garbage collection pass. Use this when you no longer need to construct new
        models from the currently loaded checkpoints and want to free host (CPU) memory.
    ``clear_vram()``
        Drops the cached model instances stored by the ``@cached_property`` attributes from
        this ledger (by removing them from ``self.__dict__``) and calls
        :func:`torch.cuda.empty_cache`. Use this when you want to release GPU memory;
        subsequent access to a model property will rebuild the model from the registry
        while keeping the existing builder configuration.
    """

    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
        checkpoint_path: str | None = None,
        gemma_root_path: str | None = None,
        spatial_upsampler_path: str | None = None,
        loras: tuple[LoraPathStrengthAndSDOps, ...] | None = None,
        registry: Registry | None = None,
        fp8transformer: bool = False,
        local_files_only: bool = True,
    ):
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.gemma_root_path = gemma_root_path
        self.spatial_upsampler_path = spatial_upsampler_path
        self.loras = loras or ()
        self.registry = registry or DummyRegistry()
        self.fp8transformer = fp8transformer
        self.local_files_only = local_files_only
        self.build_model_builders()

    def build_model_builders(self) -> None:
        if self.checkpoint_path is not None:
            self.transformer_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=LTXModelConfigurator,
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_MAP,
                loras=tuple(self.loras),
                registry=self.registry,
            )

            self.vae_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VAEDecoderConfigurator,
                model_sd_ops=VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vae_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VAEEncoderConfigurator,
                model_sd_ops=VAE_ENCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.audio_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AudioDecoderConfigurator,
                model_sd_ops=AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vocoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VocoderConfigurator,
                model_sd_ops=VOCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            if self.gemma_root_path is not None:
                self.text_encoder_builder = Builder(
                    model_path=self.checkpoint_path,
                    model_class_configurator=AVGemmaTextEncoderModelConfigurator,
                    model_sd_ops=AV_GEMMA_TEXT_ENCODER_KEY_OPS,
                    registry=self.registry,
                    module_ops=module_ops_from_gemma_root(self.gemma_root_path, self.local_files_only),
                )

        if self.spatial_upsampler_path is not None:
            self.upsampler_builder = Builder(
                model_path=self.spatial_upsampler_path,
                model_class_configurator=LatentUpsamplerConfigurator,
                registry=self.registry,
            )

    def _target_device(self) -> torch.device:
        if isinstance(self.registry, DummyRegistry) or self.registry is None:
            return self.device
        else:
            return torch.device("cpu")

    def with_loras(self, loras: tuple[LoraPathStrengthAndSDOps, ...]) -> Self:
        return ModelLedger(
            dtype=self.dtype,
            device=self.device,
            checkpoint_path=self.checkpoint_path,
            gemma_root_path=self.gemma_root_path,
            spatial_upsampler_path=self.spatial_upsampler_path,
            loras=(*self.loras, *loras),
            registry=self.registry,
            fp8transformer=self.fp8transformer,
            local_files_only=self.local_files_only,
        )

    def transformer(self) -> X0Model:
        if not hasattr(self, "transformer_builder"):
            raise ValueError(
                "Transformer not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )
        if self.fp8transformer:
            fp8_builder = replace(
                self.transformer_builder,
                module_ops=(UPCAST_DURING_INFERENCE,),
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
            )
            return X0Model(fp8_builder.build(device=self._target_device())).to(self.device)
        else:
            return X0Model(self.transformer_builder.build(device=self._target_device(), dtype=self.dtype)).to(
                self.device
            )

    def video_decoder(self) -> VideoDecoder:
        if not hasattr(self, "vae_decoder_builder"):
            raise ValueError(
                "Video decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)

    def video_encoder(self) -> VideoEncoder:
        if not hasattr(self, "vae_encoder_builder"):
            raise ValueError(
                "Video encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)

    def text_encoder(self) -> AVGemmaTextEncoderModel:
        if not hasattr(self, "text_encoder_builder"):
            raise ValueError(
                "Text encoder not initialized. Please provide a checkpoint path and gemma root path to the "
                "ModelLedger constructor."
            )

        return self.text_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)

    def audio_decoder(self) -> AudioDecoder:
        if not hasattr(self, "audio_decoder_builder"):
            raise ValueError(
                "Audio decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.audio_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)

    def vocoder(self) -> Vocoder:
        if not hasattr(self, "vocoder_builder"):
            raise ValueError(
                "Vocoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vocoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)

    def spatial_upsampler(self) -> LatentUpsampler:
        if not hasattr(self, "upsampler_builder"):
            raise ValueError("Upsampler not initialized. Please provide upsampler path to the ModelLedger constructor.")

        return self.upsampler_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device)
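A usage sketch, assuming the same class is exported from ltx_core.model.model_ledger (this file is only the Jupyter checkpoint copy); note that in this copy the accessors are plain methods, even though the docstring describes cached properties:

import torch
from ltx_core.model.model_ledger import ModelLedger  # module path inferred from this repo's layout

ledger = ModelLedger(
    dtype=torch.bfloat16,
    device=torch.device("cuda"),
    checkpoint_path="checkpoints/ltxv.safetensors",  # hypothetical path
    gemma_root_path="checkpoints/gemma",             # hypothetical path
)
transformer = ledger.transformer()      # built on first use from the shared registry
video_decoder = ledger.video_decoder()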
packages/ltx-core/src/ltx_core/model/__init__.py
ADDED
File without changes
packages/ltx-core/src/ltx_core/model/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (182 Bytes).
packages/ltx-core/src/ltx_core/model/__pycache__/model_ledger.cpython-310.pyc
ADDED
Binary file (9.16 kB).
packages/ltx-core/src/ltx_core/model/__pycache__/model_protocol.cpython-310.pyc
ADDED
Binary file (744 Bytes).
packages/ltx-core/src/ltx_core/model/audio_vae/__init__.py
ADDED
File without changes
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (192 Bytes).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/attention.cpython-310.pyc
ADDED
Binary file (2.37 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/audio_vae.cpython-310.pyc
ADDED
Binary file (13.6 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/causal_conv_2d.cpython-310.pyc
ADDED
Binary file (3.28 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/causality_axis.cpython-310.pyc
ADDED
Binary file (574 Bytes).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/downsample.cpython-310.pyc
ADDED
Binary file (3.18 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/model_configurator.cpython-310.pyc
ADDED
Binary file (4.41 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/ops.cpython-310.pyc
ADDED
Binary file (3 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/resnet.cpython-310.pyc
ADDED
Binary file (4.06 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/upsample.cpython-310.pyc
ADDED
Binary file (2.88 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/__pycache__/vocoder.cpython-310.pyc
ADDED
Binary file (4.89 kB).
packages/ltx-core/src/ltx_core/model/audio_vae/attention.py
ADDED
@@ -0,0 +1,71 @@
from enum import Enum

import torch

from ltx_core.model.common.normalization import NormType, build_normalization_layer


class AttentionType(Enum):
    """Enum for specifying the attention mechanism type."""

    VANILLA = "vanilla"
    LINEAR = "linear"
    NONE = "none"


class AttnBlock(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        norm_type: NormType = NormType.GROUP,
    ) -> None:
        super().__init__()
        self.in_channels = in_channels

        self.norm = build_normalization_layer(in_channels, normtype=norm_type)
        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h * w).contiguous()
        q = q.permute(0, 2, 1).contiguous()  # b, hw, c
        k = k.reshape(b, c, h * w).contiguous()  # b, c, hw
        w_ = torch.bmm(q, k).contiguous()  # b, hw, hw; w_[b, i, j] = sum_c q[b, i, c] k[b, c, j]
        w_ = w_ * (int(c) ** (-0.5))
        w_ = torch.nn.functional.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h * w).contiguous()
        w_ = w_.permute(0, 2, 1).contiguous()  # b, hw, hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_).contiguous()  # b, c, hw (hw of q); h_[b, c, j] = sum_i v[b, c, i] w_[b, i, j]
        h_ = h_.reshape(b, c, h, w).contiguous()

        h_ = self.proj_out(h_)

        return x + h_


def make_attn(
    in_channels: int,
    attn_type: AttentionType = AttentionType.VANILLA,
    norm_type: NormType = NormType.GROUP,
) -> torch.nn.Module:
    match attn_type:
        case AttentionType.VANILLA:
            return AttnBlock(in_channels, norm_type=norm_type)
        case AttentionType.NONE:
            return torch.nn.Identity()
        case AttentionType.LINEAR:
            raise NotImplementedError(f"Attention type {attn_type.value} is not supported yet.")
        case _:
            raise ValueError(f"Unknown attention type: {attn_type}")
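A quick shape check for the vanilla attention block, assuming the default group normalization accepts 64 channels:

import torch
from ltx_core.model.audio_vae.attention import AttentionType, make_attn

attn = make_attn(in_channels=64, attn_type=AttentionType.VANILLA)
x = torch.randn(2, 64, 16, 16)
# AttnBlock attends over the flattened 16*16 positions and adds a residual,
# so the output shape always matches the input shape.
assert attn(x).shape == x.shape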
packages/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py
ADDED
@@ -0,0 +1,483 @@
# Copyright (c) 2025 Lightricks. All rights reserved.
# Created by Ivan Zorin


from typing import Set, Tuple

import torch
import torch.nn.functional as F

from ltx_core.model.audio_vae.attention import AttentionType, make_attn
from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
from ltx_core.model.audio_vae.causality_axis import CausalityAxis
from ltx_core.model.audio_vae.downsample import build_downsampling_path
from ltx_core.model.audio_vae.ops import PerChannelStatistics
from ltx_core.model.audio_vae.resnet import ResnetBlock
from ltx_core.model.audio_vae.upsample import build_upsampling_path
from ltx_core.model.common.normalization import NormType, build_normalization_layer
from ltx_core.pipeline.components.patchifiers import AudioPatchifier
from ltx_core.pipeline.components.protocols import AudioLatentShape

LATENT_DOWNSAMPLE_FACTOR = 4


def build_mid_block(
    channels: int,
    temb_channels: int,
    dropout: float,
    norm_type: NormType,
    causality_axis: CausalityAxis,
    attn_type: AttentionType,
    add_attention: bool,
) -> torch.nn.Module:
    """Build the middle block with two ResNet blocks and optional attention."""
    mid = torch.nn.Module()
    mid.block_1 = ResnetBlock(
        in_channels=channels,
        out_channels=channels,
        temb_channels=temb_channels,
        dropout=dropout,
        norm_type=norm_type,
        causality_axis=causality_axis,
    )
    mid.attn_1 = make_attn(channels, attn_type=attn_type, norm_type=norm_type) if add_attention else torch.nn.Identity()
    mid.block_2 = ResnetBlock(
        in_channels=channels,
        out_channels=channels,
        temb_channels=temb_channels,
        dropout=dropout,
        norm_type=norm_type,
        causality_axis=causality_axis,
    )
    return mid


def run_mid_block(mid: torch.nn.Module, features: torch.Tensor) -> torch.Tensor:
    """Run features through the middle block."""
    features = mid.block_1(features, temb=None)
    features = mid.attn_1(features)
    return mid.block_2(features, temb=None)


class Encoder(torch.nn.Module):
    """
    Encoder that compresses audio spectrograms into latent representations.

    The encoder uses a series of downsampling blocks with residual connections,
    attention mechanisms, and configurable causal convolutions.
    """

    def __init__(  # noqa: PLR0913
        self,
        *,
        ch: int,
        ch_mult: Tuple[int, ...] = (1, 2, 4, 8),
        num_res_blocks: int,
        attn_resolutions: Set[int],
        dropout: float = 0.0,
        resamp_with_conv: bool = True,
        in_channels: int,
        resolution: int,
        z_channels: int,
        double_z: bool = True,
        attn_type: AttentionType = AttentionType.VANILLA,
        mid_block_add_attention: bool = True,
        norm_type: NormType = NormType.GROUP,
        causality_axis: CausalityAxis = CausalityAxis.WIDTH,
        sample_rate: int = 16000,
        mel_hop_length: int = 160,
        n_fft: int = 1024,
        is_causal: bool = True,
        mel_bins: int = 64,
        **_ignore_kwargs,
    ) -> None:
        """
        Initialize the Encoder.

        Args:
            Arguments are configuration parameters, loaded from the audio VAE checkpoint config
            (audio_vae.model.params.ddconfig):

            ch: Base number of feature channels used in the first convolution layer.
            ch_mult: Multiplicative factors for the number of channels at each resolution level.
            num_res_blocks: Number of residual blocks to use at each resolution level.
            attn_resolutions: Spatial resolutions (e.g., in time/frequency) at which to apply attention.
            resolution: Input spatial resolution of the spectrogram (height, width).
            z_channels: Number of channels in the latent representation.
            norm_type: Normalization layer type to use within the network (e.g., group, batch).
            causality_axis: Axis along which convolutions should be causal (e.g., time axis).
            sample_rate: Audio sample rate in Hz for the input signals.
            mel_hop_length: Hop length used when computing the mel spectrogram.
            n_fft: FFT size used to compute the spectrogram.
            mel_bins: Number of mel-frequency bins in the input spectrogram.
            in_channels: Number of channels in the input spectrogram tensor.
            double_z: If True, predict both mean and log-variance (doubling latent channels).
            is_causal: If True, use causal convolutions suitable for streaming setups.
            dropout: Dropout probability used in residual and mid blocks.
            attn_type: Type of attention mechanism to use in attention blocks.
            resamp_with_conv: If True, perform resolution changes using strided convolutions.
            mid_block_add_attention: If True, add an attention block in the mid-level of the encoder.
        """
        super().__init__()

        self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
        self.sample_rate = sample_rate
        self.mel_hop_length = mel_hop_length
        self.n_fft = n_fft
        self.is_causal = is_causal
        self.mel_bins = mel_bins

        self.patchifier = AudioPatchifier(
            patch_size=1,
            audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
            sample_rate=sample_rate,
            hop_length=mel_hop_length,
            is_causal=is_causal,
        )

        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.z_channels = z_channels
        self.double_z = double_z
        self.norm_type = norm_type
        self.causality_axis = causality_axis
        self.attn_type = attn_type

        # downsampling
        self.conv_in = make_conv2d(
            in_channels,
            self.ch,
            kernel_size=3,
            stride=1,
            causality_axis=self.causality_axis,
        )

        self.non_linearity = torch.nn.SiLU()

        self.down, block_in = build_downsampling_path(
            ch=ch,
            ch_mult=ch_mult,
            num_resolutions=self.num_resolutions,
            num_res_blocks=num_res_blocks,
            resolution=resolution,
            temb_channels=self.temb_ch,
            dropout=dropout,
            norm_type=self.norm_type,
            causality_axis=self.causality_axis,
            attn_type=self.attn_type,
            attn_resolutions=attn_resolutions,
            resamp_with_conv=resamp_with_conv,
        )

        self.mid = build_mid_block(
            channels=block_in,
            temb_channels=self.temb_ch,
            dropout=dropout,
            norm_type=self.norm_type,
            causality_axis=self.causality_axis,
            attn_type=self.attn_type,
            add_attention=mid_block_add_attention,
        )

        self.norm_out = build_normalization_layer(block_in, normtype=self.norm_type)
        self.conv_out = make_conv2d(
            block_in,
            2 * z_channels if double_z else z_channels,
            kernel_size=3,
            stride=1,
            causality_axis=self.causality_axis,
        )

    def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
        """
        Encode audio spectrogram into latent representations.

        Args:
            spectrogram: Input spectrogram of shape (batch, channels, time, frequency)

        Returns:
            Encoded latent representation of shape (batch, channels, frames, mel_bins)
        """
        h = self.conv_in(spectrogram)
        h = self._run_downsampling_path(h)
        h = run_mid_block(self.mid, h)
        h = self._finalize_output(h)

        return self._normalize_latents(h)

    def _run_downsampling_path(self, h: torch.Tensor) -> torch.Tensor:
        for level in range(self.num_resolutions):
            stage = self.down[level]
            for block_idx in range(self.num_res_blocks):
                h = stage.block[block_idx](h, temb=None)
                if stage.attn:
                    h = stage.attn[block_idx](h)

            if level != self.num_resolutions - 1:
                h = stage.downsample(h)

        return h

    def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
        h = self.norm_out(h)
        h = self.non_linearity(h)
        return self.conv_out(h)

    def _normalize_latents(self, latent_output: torch.Tensor) -> torch.Tensor:
        """
        Normalize encoder latents using per-channel statistics.

        When the encoder is configured with ``double_z=True``, the final
        convolution produces twice the number of latent channels, typically
        interpreted as two concatenated tensors along the channel dimension
        (e.g., mean and variance or other auxiliary parameters).

        This method intentionally uses only the first half of the channels
        (the "mean" component) as input to the patchifier and normalization
        logic. The remaining channels are left unchanged by this method and
        are expected to be consumed elsewhere in the VAE pipeline.

        If ``double_z=False``, the encoder output already contains only the
        mean latents and the chunking operation simply returns that tensor.
        """
        means = torch.chunk(latent_output, 2, dim=1)[0]
        latent_shape = AudioLatentShape(
            batch=means.shape[0],
            channels=means.shape[1],
            frames=means.shape[2],
            mel_bins=means.shape[3],
        )
        latent_patched = self.patchifier.patchify(means)
        latent_normalized = self.per_channel_statistics.normalize(latent_patched)
        return self.patchifier.unpatchify(latent_normalized, latent_shape)


class Decoder(torch.nn.Module):
    """
    Symmetric decoder that reconstructs audio spectrograms from latent features.

    The decoder mirrors the encoder structure with configurable channel multipliers,
    attention resolutions, and causal convolutions.
    """

    def __init__(  # noqa: PLR0913
        self,
        *,
        ch: int,
        out_ch: int,
        ch_mult: Tuple[int, ...] = (1, 2, 4, 8),
        num_res_blocks: int,
        attn_resolutions: Set[int],
        resolution: int,
        z_channels: int,
        norm_type: NormType = NormType.GROUP,
        causality_axis: CausalityAxis = CausalityAxis.WIDTH,
        dropout: float = 0.0,
        mid_block_add_attention: bool = True,
        sample_rate: int = 16000,
        mel_hop_length: int = 160,
        is_causal: bool = True,
        mel_bins: int | None = None,
    ) -> None:
        """
        Initialize the Decoder.

        Args:
            Arguments are configuration parameters, loaded from the audio VAE checkpoint config
            (audio_vae.model.params.ddconfig):
            - ch, out_ch, ch_mult, num_res_blocks, attn_resolutions
            - resolution, z_channels
            - norm_type, causality_axis
        """
        super().__init__()

        # Internal behavioural defaults that are not driven by the checkpoint.
        resamp_with_conv = True
        attn_type = AttentionType.VANILLA

        # Per-channel statistics for denormalizing latents
        self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
        self.sample_rate = sample_rate
        self.mel_hop_length = mel_hop_length
        self.is_causal = is_causal
        self.mel_bins = mel_bins
        self.patchifier = AudioPatchifier(
            patch_size=1,
            audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
            sample_rate=sample_rate,
            hop_length=mel_hop_length,
            is_causal=is_causal,
        )

        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.out_ch = out_ch
        self.give_pre_end = False
        self.tanh_out = False
        self.norm_type = norm_type
        self.z_channels = z_channels
        self.channel_multipliers = ch_mult
        self.attn_resolutions = attn_resolutions
        self.causality_axis = causality_axis
        self.attn_type = attn_type

        base_block_channels = ch * self.channel_multipliers[-1]
        base_resolution = resolution // (2 ** (self.num_resolutions - 1))
        self.z_shape = (1, z_channels, base_resolution, base_resolution)

        self.conv_in = make_conv2d(
            z_channels, base_block_channels, kernel_size=3, stride=1, causality_axis=self.causality_axis
        )
        self.non_linearity = torch.nn.SiLU()
        self.mid = build_mid_block(
            channels=base_block_channels,
            temb_channels=self.temb_ch,
            dropout=dropout,
            norm_type=self.norm_type,
            causality_axis=self.causality_axis,
            attn_type=self.attn_type,
            add_attention=mid_block_add_attention,
        )
        self.up, final_block_channels = build_upsampling_path(
            ch=ch,
            ch_mult=ch_mult,
            num_resolutions=self.num_resolutions,
            num_res_blocks=num_res_blocks,
            resolution=resolution,
            temb_channels=self.temb_ch,
            dropout=dropout,
            norm_type=self.norm_type,
            causality_axis=self.causality_axis,
            attn_type=self.attn_type,
            attn_resolutions=attn_resolutions,
            resamp_with_conv=resamp_with_conv,
            initial_block_channels=base_block_channels,
        )

        self.norm_out = build_normalization_layer(final_block_channels, normtype=self.norm_type)
        self.conv_out = make_conv2d(
            final_block_channels, out_ch, kernel_size=3, stride=1, causality_axis=self.causality_axis
        )

    def forward(self, sample: torch.Tensor) -> torch.Tensor:
        """
        Decode latent features back to audio spectrograms.

        Args:
            sample: Encoded latent representation of shape (batch, channels, frames, mel_bins)

        Returns:
            Reconstructed audio spectrogram of shape (batch, channels, time, frequency)
        """
        sample, target_shape = self._denormalize_latents(sample)

        h = self.conv_in(sample)
        h = run_mid_block(self.mid, h)
        h = self._run_upsampling_path(h)
        h = self._finalize_output(h)

        return self._adjust_output_shape(h, target_shape)

    def _denormalize_latents(self, sample: torch.Tensor) -> tuple[torch.Tensor, AudioLatentShape]:
        latent_shape = AudioLatentShape(
            batch=sample.shape[0],
            channels=sample.shape[1],
            frames=sample.shape[2],
            mel_bins=sample.shape[3],
        )

        sample_patched = self.patchifier.patchify(sample)
        sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
        sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)

        target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
        if self.causality_axis != CausalityAxis.NONE:
            target_frames = max(target_frames - (LATENT_DOWNSAMPLE_FACTOR - 1), 1)

        target_shape = AudioLatentShape(
            batch=latent_shape.batch,
            channels=self.out_ch,
            frames=target_frames,
            mel_bins=self.mel_bins if self.mel_bins is not None else latent_shape.mel_bins,
        )

        return sample, target_shape

    def _adjust_output_shape(
        self,
        decoded_output: torch.Tensor,
        target_shape: AudioLatentShape,
    ) -> torch.Tensor:
        """
        Adjust output shape to match target dimensions for variable-length audio.

        This function handles the common case where decoded audio spectrograms need to be
        resized to match a specific target shape.

        Args:
            decoded_output: Tensor of shape (batch, channels, time, frequency)
            target_shape: AudioLatentShape describing (batch, channels, time, mel bins)

        Returns:
            Tensor adjusted to match target_shape exactly
        """
        # Current output shape: (batch, channels, time, frequency)
        _, _, current_time, current_freq = decoded_output.shape
        target_channels = target_shape.channels
        target_time = target_shape.frames
        target_freq = target_shape.mel_bins

        # Step 1: Crop first to avoid exceeding target dimensions
        decoded_output = decoded_output[
            :, :target_channels, : min(current_time, target_time), : min(current_freq, target_freq)
        ]

        # Step 2: Calculate padding needed for time and frequency dimensions
        time_padding_needed = target_time - decoded_output.shape[2]
        freq_padding_needed = target_freq - decoded_output.shape[3]

        # Step 3: Apply padding if needed
        if time_padding_needed > 0 or freq_padding_needed > 0:
            # PyTorch padding format: (pad_left, pad_right, pad_top, pad_bottom)
            # For audio: pad_left/right = frequency, pad_top/bottom = time
            padding = (
                0,
                max(freq_padding_needed, 0),  # frequency padding (left, right)
                0,
                max(time_padding_needed, 0),  # time padding (top, bottom)
            )
            decoded_output = F.pad(decoded_output, padding)

        # Step 4: Final safety crop to ensure exact target shape
        decoded_output = decoded_output[:, :target_channels, :target_time, :target_freq]

        return decoded_output

    def _run_upsampling_path(self, h: torch.Tensor) -> torch.Tensor:
        for level in reversed(range(self.num_resolutions)):
            stage = self.up[level]
            for block_idx, block in enumerate(stage.block):
                h = block(h, temb=None)
                if stage.attn:
                    h = stage.attn[block_idx](h)

            if level != 0 and hasattr(stage, "upsample"):
                h = stage.upsample(h)

        return h

    def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = self.non_linearity(h)
        h = self.conv_out(h)
        return torch.tanh(h) if self.tanh_out else h
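The crop-then-pad ordering in _adjust_output_shape is easy to get backwards; here is a standalone recreation of the same arithmetic with illustrative sizes:

import torch
import torch.nn.functional as F

# Decoded output is first cropped so no dimension exceeds the target,
# then padded up to the exact target shape.
decoded = torch.randn(1, 2, 100, 70)  # (batch, channels, time, freq)
target_c, target_t, target_f = 1, 96, 80

out = decoded[:, :target_c, : min(100, target_t), : min(70, target_f)]  # crop first
pad_t = target_t - out.shape[2]
pad_f = target_f - out.shape[3]
if pad_t > 0 or pad_f > 0:
    # F.pad order for 4D input: (freq_left, freq_right, time_top, time_bottom)
    out = F.pad(out, (0, max(pad_f, 0), 0, max(pad_t, 0)))
assert out.shape == (1, target_c, target_t, target_f)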