Fabrice-TIERCELIN committed on
Commit
4f5fe16
·
verified ·
1 Parent(s): 2d24b22

Upload 6 files

Browse files
packages/ltx-pipelines/src/ltx_pipelines/utils/args.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
5
+ from ltx_pipelines.utils.constants import (
6
+ DEFAULT_1_STAGE_HEIGHT,
7
+ DEFAULT_1_STAGE_WIDTH,
8
+ DEFAULT_2_STAGE_HEIGHT,
9
+ DEFAULT_2_STAGE_WIDTH,
10
+ DEFAULT_CFG_GUIDANCE_SCALE,
11
+ DEFAULT_FRAME_RATE,
12
+ DEFAULT_LORA_STRENGTH,
13
+ DEFAULT_NEGATIVE_PROMPT,
14
+ DEFAULT_NUM_FRAMES,
15
+ DEFAULT_NUM_INFERENCE_STEPS,
16
+ DEFAULT_SEED,
17
+ )
18
+
19
+
20
class VideoConditioningAction(argparse.Action):
    """Accumulate ``(resolved_path, strength)`` pairs for video conditioning options."""

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,  # noqa: ARG002
    ) -> None:
        raw_path, raw_strength = values
        entry = (resolve_path(raw_path), float(raw_strength))
        # Repeated uses of the option append to the same destination list.
        collected = getattr(namespace, self.dest) or []
        collected.append(entry)
        setattr(namespace, self.dest, collected)
34
+
35
+
36
class ImageAction(argparse.Action):
    """Accumulate ``(resolved_path, frame_idx, strength)`` triples for image conditioning."""

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,  # noqa: ARG002
    ) -> None:
        raw_path, raw_frame, raw_strength = values
        entry = (resolve_path(raw_path), int(raw_frame), float(raw_strength))
        # Repeated uses of the option append to the same destination list.
        collected = getattr(namespace, self.dest) or []
        collected.append(entry)
        setattr(namespace, self.dest, collected)
51
+
52
+
53
class LoraAction(argparse.Action):
    """Accumulate LoRA specs (PATH plus optional STRENGTH) as ``LoraPathStrengthAndSDOps``."""

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,
    ) -> None:
        # nargs="+" lets argparse hand us any number of values; enforce the 1-2 limit here.
        if len(values) > 2:
            msg = f"{option_string} accepts at most 2 arguments (PATH and optional STRENGTH), got {len(values)} values"
            raise argparse.ArgumentError(self, msg)

        raw_path = values[0]
        strength = float(values[1]) if len(values) > 1 else float(str(DEFAULT_LORA_STRENGTH))

        collected = getattr(namespace, self.dest) or []
        collected.append(LoraPathStrengthAndSDOps(resolve_path(raw_path), strength, LTXV_LORA_COMFY_RENAMING_MAP))
        setattr(namespace, self.dest, collected)
74
+
75
+
76
def resolve_path(path: str) -> str:
    """Expand ``~``, resolve *path* to an absolute path, and return it in POSIX form.

    ``PurePath.as_posix()`` already returns ``str``, so the previous extra
    ``str(...)`` wrapper was redundant and has been removed.
    """
    return Path(path).expanduser().resolve().as_posix()
78
+
79
+
80
def basic_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser shared by all LTX pipelines.

    Covers required model locations (checkpoint, Gemma text encoder), the
    prompt and output path, common sampling parameters (seed, resolution,
    frame count/rate, inference steps), and repeatable image / LoRA
    conditioning options.

    Returns:
        An ``argparse.ArgumentParser`` pre-populated with the shared options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint-path",
        type=resolve_path,
        required=True,
        help="Path to LTX-2 model checkpoint (.safetensors file).",
    )
    parser.add_argument(
        "--gemma-root",
        type=resolve_path,
        required=True,
        help="Path to the root directory containing the Gemma text encoder model files.",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Text prompt describing the desired video content to be generated by the model.",
    )
    parser.add_argument(
        "--output-path",
        type=resolve_path,
        required=True,
        help="Path to the output video file (MP4 format).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=DEFAULT_SEED,
        help=(
            f"Random seed value used to initialize the noise tensor for "
            f"reproducible generation (default: {DEFAULT_SEED})."
        ),
    )
    parser.add_argument(
        "--height",
        type=int,
        default=DEFAULT_1_STAGE_HEIGHT,
        help=f"Height of the generated video in pixels, should be divisible by 32 (default: {DEFAULT_1_STAGE_HEIGHT}).",
    )
    parser.add_argument(
        "--width",
        type=int,
        default=DEFAULT_1_STAGE_WIDTH,
        help=f"Width of the generated video in pixels, should be divisible by 32 (default: {DEFAULT_1_STAGE_WIDTH}).",
    )
    parser.add_argument(
        "--num-frames",
        type=int,
        default=DEFAULT_NUM_FRAMES,
        # Fixed "where k" -> "where K" so the variable case matches the formula above.
        help=f"Number of frames to generate in the output video sequence, num-frames = (8 x K) + 1, "
        f"where K is a non-negative integer (default: {DEFAULT_NUM_FRAMES}).",
    )
    parser.add_argument(
        "--frame-rate",
        type=float,
        default=DEFAULT_FRAME_RATE,
        help=f"Frame rate of the generated video (fps) (default: {DEFAULT_FRAME_RATE}).",
    )
    parser.add_argument(
        "--num-inference-steps",
        type=int,
        default=DEFAULT_NUM_INFERENCE_STEPS,
        help=(
            f"Number of denoising steps in the diffusion sampling process. "
            f"Higher values improve quality but increase generation time (default: {DEFAULT_NUM_INFERENCE_STEPS})."
        ),
    )
    parser.add_argument(
        "--image",
        dest="images",
        action=ImageAction,
        nargs=3,
        metavar=("PATH", "FRAME_IDX", "STRENGTH"),
        default=[],
        help=(
            "Image conditioning input: path to image file, target frame index, "
            "and conditioning strength (all three required). Default: empty list [] (no image conditioning). "
            "Can be specified multiple times. Example: --image path/to/image1.jpg 0 0.8 "
            "--image path/to/image2.jpg 160 0.9"
        ),
    )
    parser.add_argument(
        "--lora",
        dest="lora",
        action=LoraAction,
        nargs="+",  # Accept 1-2 arguments per use (path and optional strength); validation is handled in LoraAction
        metavar=("PATH", "STRENGTH"),
        default=[],
        help=(
            "LoRA (Low-Rank Adaptation) model: path to model file and optional strength "
            f"(default strength: {DEFAULT_LORA_STRENGTH}). Can be specified multiple times. "
            "Example: --lora path/to/lora1.safetensors 0.8 --lora path/to/lora2.safetensors"
        ),
    )
    parser.add_argument(
        "--enable-fp8",
        action="store_true",
        help="Enable FP8 mode to reduce memory footprint by keeping model in lower precision. "
        "Note that calculations are still performed in bfloat16 precision.",
    )
    # Previously had no help text at all; keep the description minimal since the
    # enhancement behavior is implemented elsewhere.
    parser.add_argument("--enhance-prompt", action="store_true", help="Enable prompt enhancement.")
    return parser
184
+
185
+
186
def default_1_stage_arg_parser() -> argparse.ArgumentParser:
    """Extend the basic parser with the CFG and negative-prompt options used by the 1-stage pipeline."""
    stage_parser = basic_arg_parser()
    stage_parser.add_argument(
        "--cfg-guidance-scale",
        type=float,
        default=DEFAULT_CFG_GUIDANCE_SCALE,
        help=(
            f"Classifier-free guidance (CFG) scale controlling how strongly "
            f"the model adheres to the prompt. Higher values increase prompt "
            f"adherence but may reduce diversity (default: {DEFAULT_CFG_GUIDANCE_SCALE})."
        ),
    )
    stage_parser.add_argument(
        "--negative-prompt",
        type=str,
        default=DEFAULT_NEGATIVE_PROMPT,
        help=(
            "Negative prompt describing what should not appear in the generated video, "
            "used to guide the diffusion process away from unwanted content. "
            "Default: a comprehensive negative prompt covering common artifacts and quality issues."
        ),
    )
    return stage_parser
210
+
211
+
212
def default_2_stage_arg_parser() -> argparse.ArgumentParser:
    """Build the 2-stage parser: 1-stage options plus distilled LoRA and spatial upsampler."""
    parser = default_1_stage_arg_parser()
    parser.set_defaults(height=DEFAULT_2_STAGE_HEIGHT, width=DEFAULT_2_STAGE_WIDTH)
    # Rewrite the --height/--width help so it reflects the 2-stage defaults.
    overridden_help = {
        "--height": (
            f"Height of the generated video in pixels, should be divisible by 64 "
            f"(default: {DEFAULT_2_STAGE_HEIGHT})."
        ),
        "--width": f"Width of the generated video in pixels, should be divisible by 64 (default: {DEFAULT_2_STAGE_WIDTH}).",
    }
    for action in parser._actions:  # noqa: SLF001 - argparse exposes no public help-override API
        for option, text in overridden_help.items():
            if option in action.option_strings:
                action.help = text
    parser.add_argument(
        "--distilled-lora",
        dest="distilled_lora",
        action=LoraAction,
        nargs="+",  # Accept 1-2 arguments per use (path and optional strength); validation is handled in LoraAction
        metavar=("PATH", "STRENGTH"),
        required=True,
        help=(
            "Distilled LoRA (Low-Rank Adaptation) model used in the second stage (upscaling and refinement): "
            f"path to model file and optional strength (default strength: {DEFAULT_LORA_STRENGTH}). "
            "The second stage upsamples the video by 2x resolution and refines it using a distilled "
            "denoising schedule (fewer steps, no CFG). The distilled LoRA is specifically trained "
            "for this refinement process to improve quality at higher resolutions. "
            "Example: --distilled-lora path/to/distilled_lora.safetensors 0.8"
        ),
    )
    parser.add_argument(
        "--spatial-upsampler-path",
        type=resolve_path,
        required=True,
        help=(
            "Path to the spatial upsampler model used to increase the resolution "
            "of the generated video in the latent space."
        ),
    )
    return parser
252
+
253
+
254
def default_2_stage_distilled_arg_parser() -> argparse.ArgumentParser:
    """Build the parser for the fully distilled 2-stage pipeline (no CFG/negative-prompt options)."""
    parser = basic_arg_parser()
    parser.set_defaults(height=DEFAULT_2_STAGE_HEIGHT, width=DEFAULT_2_STAGE_WIDTH)
    # Rewrite the --height/--width help so it reflects the 2-stage defaults.
    overridden_help = {
        "--height": (
            f"Height of the generated video in pixels, should be divisible by 64 "
            f"(default: {DEFAULT_2_STAGE_HEIGHT})."
        ),
        "--width": f"Width of the generated video in pixels, should be divisible by 64 (default: {DEFAULT_2_STAGE_WIDTH}).",
    }
    for action in parser._actions:  # noqa: SLF001 - argparse exposes no public help-override API
        for option, text in overridden_help.items():
            if option in action.option_strings:
                action.help = text
    parser.add_argument(
        "--spatial-upsampler-path",
        type=resolve_path,
        required=True,
        help=(
            "Path to the spatial upsampler model used to increase the resolution "
            "of the generated video in the latent space."
        ),
    )
    return parser
packages/ltx-pipelines/src/ltx_pipelines/utils/constants.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Imports first (PEP 8): this import was previously buried under the
# "Diffusion Schedule" section heading.
from ltx_core.types import SpatioTemporalScaleFactors

# =============================================================================
# Diffusion Schedule
# =============================================================================

# Noise schedule for the distilled pipeline. These sigma values control noise
# levels at each denoising step and were tuned to match the distillation process.
DISTILLED_SIGMA_VALUES = [1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0]

# Reduced schedule for super-resolution stage 2 (subset of distilled values)
STAGE_2_DISTILLED_SIGMA_VALUES = [0.909375, 0.725, 0.421875, 0.0]


# =============================================================================
# Video Generation Defaults
# =============================================================================

DEFAULT_SEED = 10
DEFAULT_1_STAGE_HEIGHT = 512
DEFAULT_1_STAGE_WIDTH = 768
# Stage 2 upsamples by 2x in each spatial dimension.
DEFAULT_2_STAGE_HEIGHT = DEFAULT_1_STAGE_HEIGHT * 2
DEFAULT_2_STAGE_WIDTH = DEFAULT_1_STAGE_WIDTH * 2
DEFAULT_NUM_FRAMES = 121
DEFAULT_FRAME_RATE = 24.0
DEFAULT_NUM_INFERENCE_STEPS = 40
DEFAULT_CFG_GUIDANCE_SCALE = 4.0


# =============================================================================
# Audio
# =============================================================================

AUDIO_SAMPLE_RATE = 24000


# =============================================================================
# LoRA
# =============================================================================

DEFAULT_LORA_STRENGTH = 1.0


# =============================================================================
# Video VAE Architecture
# =============================================================================

VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
VIDEO_LATENT_CHANNELS = 128


# =============================================================================
# Image Preprocessing
# =============================================================================

# CRF (Constant Rate Factor) for H.264 encoding used in image conditioning.
# Lower = higher quality, 0 = lossless. This mimics compression artifacts.
DEFAULT_IMAGE_CRF = 33


# =============================================================================
# Prompts
# =============================================================================

DEFAULT_NEGATIVE_PROMPT = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)
packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import logging
3
+ from dataclasses import replace
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from ltx_core.components.noisers import Noiser
9
+ from ltx_core.components.protocols import DiffusionStepProtocol, GuiderProtocol
10
+ from ltx_core.conditioning import (
11
+ ConditioningItem,
12
+ VideoConditionByKeyframeIndex,
13
+ VideoConditionByLatentIndex,
14
+ )
15
+ from ltx_core.model.transformer import Modality, X0Model
16
+ from ltx_core.model.video_vae import VideoEncoder
17
+ from ltx_core.text_encoders.gemma import GemmaTextEncoderModelBase
18
+ from ltx_core.tools import AudioLatentTools, LatentTools, VideoLatentTools
19
+ from ltx_core.types import AudioLatentShape, LatentState, VideoLatentShape, VideoPixelShape
20
+ from ltx_core.utils import to_denoised, to_velocity
21
+ from ltx_pipelines.utils.media_io import decode_image, load_image_conditioning, resize_aspect_ratio_preserving
22
+ from ltx_pipelines.utils.types import (
23
+ DenoisingFunc,
24
+ DenoisingLoopFunc,
25
+ PipelineComponents,
26
+ )
27
+
28
+
29
def get_device() -> torch.device:
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(device_type)
33
+
34
+
35
def cleanup_memory() -> None:
    """Collect Python garbage and release cached CUDA memory.

    The CUDA calls are guarded by ``torch.cuda.is_available()`` so this is
    safe on CPU-only machines, where ``torch.cuda.synchronize()`` would
    otherwise raise because CUDA cannot be initialized.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
39
+
40
+
41
def image_conditionings_by_replacing_latent(
    images: list[tuple[str, int, float]],
    height: int,
    width: int,
    video_encoder: VideoEncoder,
    dtype: torch.dtype,
    device: torch.device,
) -> list[ConditioningItem]:
    """Build latent-replacement conditionings, one per (path, frame_idx, strength) spec.

    Each image is loaded at the target resolution, encoded to the latent
    space, and wrapped in a ``VideoConditionByLatentIndex`` item.
    """

    def _to_conditioning(spec: tuple[str, int, float]) -> ConditioningItem:
        image_path, frame_idx, strength = spec
        pixels = load_image_conditioning(
            image_path=image_path,
            height=height,
            width=width,
            dtype=dtype,
            device=device,
        )
        return VideoConditionByLatentIndex(
            latent=video_encoder(pixels),
            strength=strength,
            latent_idx=frame_idx,
        )

    return [_to_conditioning(spec) for spec in images]
68
+
69
+
70
def image_conditionings_by_adding_guiding_latent(
    images: list[tuple[str, int, float]],
    height: int,
    width: int,
    video_encoder: VideoEncoder,
    dtype: torch.dtype,
    device: torch.device,
) -> list[ConditioningItem]:
    """Build keyframe-guidance conditionings, one per (path, frame_idx, strength) spec.

    Each image is loaded at the target resolution, encoded to the latent
    space, and wrapped in a ``VideoConditionByKeyframeIndex`` item.
    """

    def _to_conditioning(spec: tuple[str, int, float]) -> ConditioningItem:
        image_path, frame_idx, strength = spec
        pixels = load_image_conditioning(
            image_path=image_path,
            height=height,
            width=width,
            dtype=dtype,
            device=device,
        )
        return VideoConditionByKeyframeIndex(keyframes=video_encoder(pixels), frame_idx=frame_idx, strength=strength)

    return [_to_conditioning(spec) for spec in images]
92
+
93
+
94
def euler_denoising_loop(
    sigmas: torch.Tensor,
    video_state: LatentState,
    audio_state: LatentState,
    stepper: DiffusionStepProtocol,
    denoise_fn: DenoisingFunc,
) -> tuple[LatentState, LatentState]:
    """Run the joint audio/video Euler sampling loop over a sigma schedule.

    For every noise level except the last, ``denoise_fn`` is invoked as
    ``denoise_fn(video_state, audio_state, sigmas, step_index)`` and must
    return ``(denoised_video, denoised_audio)`` tensors matching the latent
    shapes. The estimates are re-blended with each state's clean latent
    through its denoise mask, then ``stepper`` advances the noisy latents
    one step along the schedule.

    Parameters
    ----------
    sigmas:
        1D tensor of noise levels; all but the final entry are iterated.
    video_state, audio_state:
        The current :class:`LatentState` for each modality (noisy latent,
        clean reference latent, and denoise mask).
    stepper:
        :class:`DiffusionStepProtocol` implementation that advances a latent
        given its denoised estimate, the schedule, and the step index.
    denoise_fn:
        :class:`DenoisingFunc` producing denoised estimates per step.

    Returns
    -------
    tuple[LatentState, LatentState]
        The final ``(video_state, audio_state)`` pair.
    """
    step_count = len(sigmas) - 1
    for step in tqdm(range(step_count)):
        est_video, est_audio = denoise_fn(video_state, audio_state, sigmas, step)

        # Keep conditioned (mask == 0) regions pinned to the clean latents.
        est_video = post_process_latent(est_video, video_state.denoise_mask, video_state.clean_latent)
        est_audio = post_process_latent(est_audio, audio_state.denoise_mask, audio_state.clean_latent)

        video_state = replace(video_state, latent=stepper.step(video_state.latent, est_video, sigmas, step))
        audio_state = replace(audio_state, latent=stepper.step(audio_state.latent, est_audio, sigmas, step))

    return video_state, audio_state
142
+
143
+
144
def gradient_estimating_euler_denoising_loop(
    sigmas: torch.Tensor,
    video_state: LatentState,
    audio_state: LatentState,
    stepper: DiffusionStepProtocol,
    denoise_fn: DenoisingFunc,
    ge_gamma: float = 2.0,
) -> tuple[LatentState, LatentState]:
    """
    Perform the joint audio-video denoising loop using gradient-estimation sampling.
    This function is similar to :func:`euler_denoising_loop`, but applies
    gradient estimation to improve the denoised estimates by tracking velocity
    changes across steps. See the referenced function for detailed parameter
    documentation.
    ### Parameters
    ge_gamma:
        Gradient estimation coefficient controlling the velocity correction term.
        Default is 2.0. Paper: https://openreview.net/pdf?id=o2ND9v0CeK
    sigmas, video_state, audio_state, stepper, denoise_fn:
        See :func:`euler_denoising_loop` for parameter descriptions.
    ### Returns
    tuple[LatentState, LatentState]
        See :func:`euler_denoising_loop` for return value description.
    """

    # Velocity estimates from the previous step, used for the gradient
    # correction; None on the first iteration (no correction is applied then).
    previous_audio_velocity = None
    previous_video_velocity = None

    def update_velocity_and_sample(
        noisy_sample: torch.Tensor, denoised_sample: torch.Tensor, sigma: float, previous_velocity: torch.Tensor | None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Current velocity implied by the model's denoised estimate at this sigma.
        current_velocity = to_velocity(noisy_sample, sigma, denoised_sample)
        if previous_velocity is not None:
            # Gradient-estimation correction: extrapolate along the change in
            # velocity between consecutive steps, scaled by ge_gamma, and
            # recompute the denoised sample from the corrected velocity.
            delta_v = current_velocity - previous_velocity
            total_velocity = ge_gamma * delta_v + previous_velocity
            denoised_sample = to_denoised(noisy_sample, total_velocity, sigma)
        # Return the UNcorrected velocity so the next step's delta is computed
        # against the raw model estimate, alongside the (possibly corrected) sample.
        return current_velocity, denoised_sample

    for step_idx, _ in enumerate(tqdm(sigmas[:-1])):
        denoised_video, denoised_audio = denoise_fn(video_state, audio_state, sigmas, step_idx)

        # Keep conditioned (mask == 0) regions pinned to the clean latents.
        denoised_video = post_process_latent(denoised_video, video_state.denoise_mask, video_state.clean_latent)
        denoised_audio = post_process_latent(denoised_audio, audio_state.denoise_mask, audio_state.clean_latent)

        # If the next sigma is 0, the denoised estimate IS the final sample:
        # return it directly instead of taking another (zero-length) step.
        if sigmas[step_idx + 1] == 0:
            return replace(video_state, latent=denoised_video), replace(audio_state, latent=denoised_audio)

        previous_video_velocity, denoised_video = update_velocity_and_sample(
            video_state.latent, denoised_video, sigmas[step_idx], previous_video_velocity
        )
        previous_audio_velocity, denoised_audio = update_velocity_and_sample(
            audio_state.latent, denoised_audio, sigmas[step_idx], previous_audio_velocity
        )

        video_state = replace(video_state, latent=stepper.step(video_state.latent, denoised_video, sigmas, step_idx))
        audio_state = replace(audio_state, latent=stepper.step(audio_state.latent, denoised_audio, sigmas, step_idx))

    return (video_state, audio_state)
202
+
203
+
204
def noise_video_state(
    output_shape: VideoPixelShape,
    noiser: Noiser,
    conditionings: list[ConditioningItem],
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
) -> tuple[LatentState, VideoLatentTools]:
    """Build video latent tools for ``output_shape`` and return a noised latent state.

    Derives the latent shape from the pixel shape via the pipeline's scale
    factors, applies the conditionings, and noises the state with ``noiser``.
    When ``initial_latent`` is provided it seeds the initial state; otherwise
    an empty state is created.
    """
    latent_shape = VideoLatentShape.from_pixel_shape(
        shape=output_shape,
        latent_channels=components.video_latent_channels,
        scale_factors=components.video_scale_factors,
    )
    tools = VideoLatentTools(components.video_patchifier, latent_shape, output_shape.fps)
    noised_state = create_noised_state(
        tools=tools,
        conditionings=conditionings,
        noiser=noiser,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_latent,
    )
    return noised_state, tools
237
+
238
+
239
def noise_audio_state(
    output_shape: VideoPixelShape,
    noiser: Noiser,
    conditionings: list[ConditioningItem],
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
    denoise_mask: torch.Tensor | None = None,
) -> tuple[LatentState, AudioLatentTools]:
    """Build audio latent tools for ``output_shape`` and return a noised latent state.

    The audio latent shape is derived from the video pixel shape. Conditionings
    are applied before noising; ``initial_latent`` seeds the state when given,
    and ``denoise_mask`` is forwarded to the state creation.
    """
    latent_shape = AudioLatentShape.from_video_pixel_shape(output_shape)
    tools = AudioLatentTools(components.audio_patchifier, latent_shape)
    noised_state = create_noised_state(
        tools=tools,
        conditionings=conditionings,
        noiser=noiser,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_latent,
        denoise_mask=denoise_mask,
    )
    return noised_state, tools
270
+
271
+
272
def create_noised_state(
    tools: LatentTools,
    conditionings: list[ConditioningItem],
    noiser: Noiser,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
    denoise_mask: torch.Tensor | None = None,
) -> LatentState:
    """Create an initial latent state, apply conditionings, and noise it.

    When ``denoise_mask`` is provided, it is collapsed to a single scalar
    (the mean, for tensor masks) and broadcast over the state's internal mask
    shape; after noising, the latent is re-blended with the clean latent so
    low-mask regions stay close to the clean content.
    """
    state = tools.create_initial_state(device, dtype, initial_latent)
    state = state_with_conditionings(state, conditionings, tools)

    if denoise_mask is not None:
        # Collapse any tensor mask into a single scalar (solid-mask behavior).
        # NOTE(review): spatial mask structure is deliberately discarded here.
        if isinstance(denoise_mask, torch.Tensor):
            mask_value = float(denoise_mask.mean().item())
        else:
            mask_value = float(denoise_mask)

        state = replace(
            state,
            # Snapshot the pre-noise latent so it can be re-blended below.
            clean_latent=state.latent.clone(),
            denoise_mask=torch.full_like(state.denoise_mask, mask_value),  # matches the internal mask shape
        )

    state = noiser(state, noise_scale)

    if denoise_mask is not None:
        # Re-blend: latent = noisy * mask + clean * (1 - mask), in the latent's dtype/device.
        m = state.denoise_mask.to(dtype=state.latent.dtype, device=state.latent.device)
        clean = state.clean_latent.to(dtype=state.latent.dtype, device=state.latent.device)
        state = replace(state, latent=state.latent * m + clean * (1 - m))

    return state
306
+
307
+
308
+
309
def state_with_conditionings(
    latent_state: LatentState, conditioning_items: list[ConditioningItem], latent_tools: LatentTools
) -> LatentState:
    """Fold each conditioning item into the latent state, in order.

    Every item's ``apply_to`` produces the next state; the fully conditioned
    state is returned.
    """
    conditioned = latent_state
    for item in conditioning_items:
        conditioned = item.apply_to(latent_state=conditioned, latent_tools=latent_tools)
    return conditioned
320
+
321
+
322
def post_process_latent(denoised: torch.Tensor, denoise_mask: torch.Tensor, clean: torch.Tensor) -> torch.Tensor:
    """Mix denoised and clean latents: mask=1 keeps the denoised value, mask=0 keeps the clean one."""
    mask = denoise_mask.to(dtype=denoised.dtype)
    clean_cast = clean.to(dtype=denoised.dtype)
    return denoised * mask + clean_cast * (1 - mask)
327
+
328
+
329
def modality_from_latent_state(
    state: LatentState, context: torch.Tensor, sigma: float | torch.Tensor, enabled: bool = True
) -> Modality:
    """Package a latent state and its text context into a transformer ``Modality``.

    Timesteps are derived from the state's denoise mask scaled by ``sigma``;
    no context mask is attached.
    """
    timesteps = timesteps_from_mask(state.denoise_mask, sigma)
    return Modality(
        enabled=enabled,
        latent=state.latent,
        timesteps=timesteps,
        positions=state.positions,
        context=context,
        context_mask=None,
    )
344
+
345
+
346
def timesteps_from_mask(denoise_mask: torch.Tensor, sigma: float | torch.Tensor) -> torch.Tensor:
    """Scale the denoise mask by sigma; positions where the mask is 0 get timestep 0."""
    return sigma * denoise_mask
352
+
353
+
354
def simple_denoising_func(
    video_context: torch.Tensor, audio_context: torch.Tensor, transformer: X0Model
) -> DenoisingFunc:
    """Return a denoising step that runs the transformer once per step (no guidance)."""

    def _step(
        video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        sigma = sigmas[step_index]
        video_in = modality_from_latent_state(video_state, video_context, sigma)
        audio_in = modality_from_latent_state(audio_state, audio_context, sigma)
        return transformer(video=video_in, audio=audio_in, perturbations=None)

    return _step
368
+
369
+
370
def guider_denoising_func(
    guider: GuiderProtocol,
    v_context_p: torch.Tensor,
    v_context_n: torch.Tensor,
    a_context_p: torch.Tensor,
    a_context_n: torch.Tensor,
    transformer: X0Model,
) -> DenoisingFunc:
    """Return a denoising step that applies guidance through ``guider``.

    The positive contexts drive the main transformer pass. When the guider is
    enabled, a second pass with the negative contexts is run and combined into
    the prediction via ``guider.delta``.
    """

    def _guided_step(
        video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        sigma = sigmas[step_index]
        pos_v = modality_from_latent_state(video_state, v_context_p, sigma)
        pos_a = modality_from_latent_state(audio_state, a_context_p, sigma)
        video_out, audio_out = transformer(video=pos_v, audio=pos_a, perturbations=None)

        if guider.enabled():
            neg_v = modality_from_latent_state(video_state, v_context_n, sigma)
            neg_a = modality_from_latent_state(audio_state, a_context_n, sigma)
            neg_video_out, neg_audio_out = transformer(video=neg_v, audio=neg_a, perturbations=None)
            video_out = video_out + guider.delta(video_out, neg_video_out)
            audio_out = audio_out + guider.delta(audio_out, neg_audio_out)

        return video_out, audio_out

    return _guided_step
398
+
399
+
400
def denoise_audio_video(  # noqa: PLR0913
    output_shape: VideoPixelShape,
    conditionings: list[ConditioningItem],
    noiser: Noiser,
    sigmas: torch.Tensor,
    stepper: DiffusionStepProtocol,
    denoising_loop_fn: DenoisingLoopFunc,
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    audio_conditionings: list[ConditioningItem] | None = None,
    noise_scale: float = 1.0,
    initial_video_latent: torch.Tensor | None = None,
    initial_audio_latent: torch.Tensor | None = None,
    # mask_context: MaskInjection | None = None,
) -> tuple[LatentState | None, LatentState | None]:
    """Run the full audio+video denoising loop and return clean latent states.

    Builds noised initial latent states for both modalities (optionally seeded
    with ``initial_video_latent`` / ``initial_audio_latent`` and conditioned on
    the given conditioning items), runs ``denoising_loop_fn`` over the
    ``sigmas`` schedule using ``stepper``, then strips conditioning tokens and
    unpatchifies the results.

    Returns:
        ``(video_state, audio_state)``, or ``(None, None)`` when the loop
        returned ``None`` for either modality.
    """
    # Build the noised initial state plus per-modality tools (conditioning
    # removal / unpatchify helpers used after the loop).
    video_state, video_tools = noise_video_state(
        output_shape=output_shape,
        noiser=noiser,
        conditionings=conditionings,
        components=components,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_video_latent,
    )
    audio_state, audio_tools = noise_audio_state(
        output_shape=output_shape,
        noiser=noiser,
        conditionings=audio_conditionings or [],
        components=components,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_audio_latent,
    )

    # Optional loop kwargs are currently disabled; kept for reference.
    loop_kwargs = {}
    # if "preview_tools" in inspect.signature(denoising_loop_fn).parameters:
    #     loop_kwargs["preview_tools"] = video_tools
    # if "mask_context" in inspect.signature(denoising_loop_fn).parameters:
    #     loop_kwargs["mask_context"] = mask_context
    video_state, audio_state = denoising_loop_fn(
        sigmas,
        video_state,
        audio_state,
        stepper,
        **loop_kwargs,
    )

    # The loop may abort and return None states; propagate that to the caller.
    if video_state is None or audio_state is None:
        return None, None

    # Remove conditioning tokens and restore the unpatchified latent layout.
    video_state = video_tools.clear_conditioning(video_state)
    video_state = video_tools.unpatchify(video_state)
    audio_state = audio_tools.clear_conditioning(audio_state)
    audio_state = audio_tools.unpatchify(audio_state)

    return video_state, audio_state
459
+
460
+
461
+
462
+ _UNICODE_REPLACEMENTS = str.maketrans("\u2018\u2019\u201c\u201d\u2014\u2013\u00a0\u2032\u2212", "''\"\"-- '-")
463
+
464
+
465
+ def clean_response(text: str) -> str:
466
+ """Clean a response from curly quotes and leading non-letter characters which Gemma tends to insert."""
467
+ text = text.translate(_UNICODE_REPLACEMENTS)
468
+
469
+ # Remove leading non-letter characters
470
+ for i, char in enumerate(text):
471
+ if char.isalpha():
472
+ return text[i:]
473
+ return text
474
+
475
+
476
def generate_enhanced_prompt(
    text_encoder: GemmaTextEncoderModelBase,
    prompt: str,
    image_path: str | None = None,
    image_long_side: int = 896,
    seed: int = 42,
) -> str:
    """Generate an enhanced prompt from a text encoder and a prompt.

    When an image path is given, the image is decoded, resized so its long side
    matches ``image_long_side``, and used for image-to-video enhancement;
    otherwise text-to-video enhancement is used.
    """
    if image_path:
        raw_image = torch.tensor(decode_image(image_path=image_path))
        conditioning_image = resize_aspect_ratio_preserving(raw_image, image_long_side).to(torch.uint8)
        enhanced = text_encoder.enhance_i2v(prompt, conditioning_image, seed=seed)
    else:
        enhanced = text_encoder.enhance_t2v(prompt, seed=seed)
    logging.info(f"Enhanced prompt: {enhanced}")
    # Strip typographic artifacts the model may have introduced.
    return clean_response(enhanced)
494
+
495
+
496
def assert_resolution(height: int, width: int, is_two_stage: bool) -> None:
    """Assert that the resolution is divisible by the required divisor.

    For two-stage pipelines, the resolution must be divisible by 64.
    For one-stage pipelines, the resolution must be divisible by 32.

    Raises:
        ValueError: If either dimension is not a multiple of the required divisor.
    """
    divisor = 64 if is_two_stage else 32
    if height % divisor == 0 and width % divisor == 0:
        return
    raise ValueError(
        f"Resolution ({height}x{width}) is not divisible by {divisor}. "
        f"For {'two-stage' if is_two_stage else 'one-stage'} pipelines, "
        f"height and width must be multiples of {divisor}."
    )
packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from collections.abc import Generator, Iterator
3
+ from fractions import Fraction
4
+ from io import BytesIO
5
+
6
+ import av
7
+ import numpy as np
8
+ import torch
9
+ from einops import rearrange
10
+ from PIL import Image
11
+ from torch._prims_common import DeviceLikeType
12
+ from tqdm import tqdm
13
+
14
+ from ltx_pipelines.utils.constants import DEFAULT_IMAGE_CRF
15
+
16
+
17
def resize_aspect_ratio_preserving(image: torch.Tensor, long_side: int) -> torch.Tensor:
    """
    Resize image preserving aspect ratio (filling target long side).
    Preserves the input dimensions order.
    Args:
        image: Input image tensor with shape (F (optional), H, W, C)
        long_side: Target long side size.
    Returns:
        Tensor with shape (F (optional), H, W, C) F = 1 if input is 3D, otherwise input shape[0]
    """
    # H and W are the two dims immediately before the channel dim for both the
    # 3D (H, W, C) and 4D (F, H, W, C) layouts. The previous `shape[-3:2]`
    # slice only worked for 3D input and raised a ValueError (1-tuple unpack)
    # for the documented 4D case.
    height, width = image.shape[-3:-1]
    max_side = max(height, width)
    scale = long_side / float(max_side)
    target_height = int(height * scale)
    target_width = int(width * scale)
    # resize_and_center_crop returns (1, C, F, H, W).
    resized = resize_and_center_crop(image, target_height, target_width)
    # Drop batch dim and move channels last: (1, C, F, H, W) -> (F, H, W, C).
    result = resized[0].permute(1, 2, 3, 0)
    # Preserve input dimensionality: squeeze F for single-frame inputs.
    return result[0] if result.shape[0] == 1 else result
37
+
38
+
39
def resize_and_center_crop(tensor: torch.Tensor, height: int, width: int) -> torch.Tensor:
    """
    Resize tensor preserving aspect ratio (filling target), then center crop to exact dimensions.
    Args:
        tensor: Input tensor with shape (H, W, C) or (F, H, W, C)
        height: Target height
        width: Target width
    Returns:
        Tensor with shape (1, C, 1, height, width) for 3D input or (1, C, F, height, width) for 4D input
    Raises:
        ValueError: If the input is not 3- or 4-dimensional.
    """
    # Normalize both layouts to (F, C, H, W) using native tensor ops
    # (previously done via einops.rearrange — an unnecessary dependency here).
    if tensor.ndim == 3:
        frames = tensor.permute(2, 0, 1).unsqueeze(0)  # (H, W, C) -> (1, C, H, W)
    elif tensor.ndim == 4:
        frames = tensor.permute(0, 3, 1, 2)  # (F, H, W, C) -> (F, C, H, W)
    else:
        raise ValueError(f"Expected input with 3 or 4 dimensions; got shape {tensor.shape}.")

    src_h, src_w = frames.shape[-2:]

    scale = max(height / src_h, width / src_w)
    # Use ceil to avoid floating-point rounding causing new_h/new_w to be
    # slightly smaller than target, which would result in negative crop offsets.
    new_h = math.ceil(src_h * scale)
    new_w = math.ceil(src_w * scale)

    frames = torch.nn.functional.interpolate(frames, size=(new_h, new_w), mode="bilinear", align_corners=False)

    # Center crop to the exact target size.
    crop_top = (new_h - height) // 2
    crop_left = (new_w - width) // 2
    frames = frames[:, :, crop_top : crop_top + height, crop_left : crop_left + width]

    # (F, C, H, W) -> (1, C, F, H, W)
    return frames.permute(1, 0, 2, 3).unsqueeze(0)
72
+
73
+
74
def normalize_latent(latent: torch.Tensor, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
    """Map pixel values in [0, 255] to [-1, 1] and move to the target device/dtype."""
    scaled = latent / 127.5 - 1.0
    return scaled.to(device=device, dtype=dtype)
76
+
77
+
78
def load_image_conditioning(
    image_path: str, height: int, width: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
    """
    Loads an image from a path and preprocesses it for conditioning.
    Note: The image is resized to the nearest multiple of 2 for compatibility with video codecs.
    """
    # Decode, run the codec round-trip preprocessing, then resize/crop and normalize.
    pixels = decode_image(image_path=image_path)
    pixels = preprocess(image=pixels)
    frame = torch.tensor(pixels, dtype=torch.float32, device=device)
    frame = resize_and_center_crop(frame, height, width)
    return normalize_latent(frame, device, dtype)
91
+
92
+
93
def load_video_conditioning(
    video_path: str, height: int, width: int, frame_cap: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
    """
    Loads a video from a path and preprocesses it for conditioning.
    Note: The video is resized to the nearest multiple of 2 for compatibility with video codecs.
    """
    prepared_frames = []
    for raw_frame in decode_video_from_file(path=video_path, frame_cap=frame_cap, device=device):
        frame = resize_and_center_crop(raw_frame.to(torch.float32), height, width)
        prepared_frames.append(normalize_latent(frame, device, dtype))
    # Concatenate along the frame axis; None when the video yielded no frames
    # (matches the previous incremental-concat behavior).
    return torch.cat(prepared_frames, dim=2) if prepared_frames else None
107
+
108
+
109
def decode_image(image_path: str) -> np.ndarray:
    """Load an image from disk and return its pixels, keeping only the first three (RGB) channels."""
    pixels = np.array(Image.open(image_path))
    return pixels[..., :3]
113
+
114
+
115
def _write_audio(
    container: av.container.Container, audio_stream: av.audio.AudioStream, samples: torch.Tensor, audio_sample_rate: int
) -> None:
    """Convert a stereo sample tensor to int16 and feed it into the container's audio stream.

    Accepts (N, 2) or (2, N) layouts; float inputs are clipped to [-1, 1] and
    scaled to int16 before being handed to the resampler/encoder.

    Raises:
        ValueError: If the samples cannot be interpreted as two-channel audio.
    """
    # Give 1-D input a channel axis so the layout checks below apply uniformly.
    if samples.ndim == 1:
        samples = samples[:, None]

    # Channel-first (2, N) input: transpose to samples-first (N, 2).
    if samples.shape[1] != 2 and samples.shape[0] == 2:
        samples = samples.T

    if samples.shape[1] != 2:
        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")

    # Convert to int16 packed for ingestion; resampler converts to encoder fmt.
    if samples.dtype != torch.int16:
        clipped = torch.clip(samples, -1.0, 1.0)
        samples = (clipped * 32767.0).to(torch.int16)

    interleaved = samples.contiguous().reshape(1, -1).cpu().numpy()
    frame_in = av.AudioFrame.from_ndarray(
        interleaved,
        format="s16",
        layout="stereo",
    )
    frame_in.sample_rate = audio_sample_rate

    _resample_audio(container, audio_stream, frame_in)
140
+
141
+
142
def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream:
    """Add a stereo AAC audio stream to the container and configure its codec context."""
    stream = container.add_stream("aac", rate=audio_sample_rate)
    codec = stream.codec_context
    codec.sample_rate = audio_sample_rate
    codec.layout = "stereo"
    # Express timestamps in samples (1 / sample_rate seconds per tick).
    codec.time_base = Fraction(1, audio_sample_rate)
    return stream
151
+
152
+
153
def _resample_audio(
    container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame
) -> None:
    """Resample *frame_in* to the encoder's native format, encode/mux the result, then flush the encoder."""
    cc = audio_stream.codec_context

    # Use the encoder's format/layout/rate as the *target*
    target_format = cc.format or "fltp"  # AAC → usually fltp
    target_layout = cc.layout or "stereo"
    target_rate = cc.sample_rate or frame_in.sample_rate

    audio_resampler = av.audio.resampler.AudioResampler(
        format=target_format,
        layout=target_layout,
        rate=target_rate,
    )

    # Synthesize monotonically increasing PTS (in samples) for resampled
    # frames that come back without one.
    audio_next_pts = 0
    for rframe in audio_resampler.resample(frame_in):
        if rframe.pts is None:
            rframe.pts = audio_next_pts
            audio_next_pts += rframe.samples
        # NOTE(review): sample_rate is reset to the *input* frame's rate even
        # though the resampler targets cc.sample_rate — confirm this is intended.
        rframe.sample_rate = frame_in.sample_rate
        container.mux(audio_stream.encode(rframe))

    # flush audio encoder
    for packet in audio_stream.encode():
        container.mux(packet)
180
+
181
+
182
def encode_video(
    video: torch.Tensor | Iterator[torch.Tensor],
    fps: int,
    audio: torch.Tensor | None,
    audio_sample_rate: int | None,
    output_path: str,
    video_chunks_number: int,
) -> None:
    """Encode video chunks (and optional audio) into an H.264/AAC file at *output_path*.

    Args:
        video: A single chunk or an iterator of chunks; each chunk is unpacked
            as (frames, height, width, channels) and encoded as rgb24 frames.
        fps: Output frame rate.
        audio: Optional audio samples; requires ``audio_sample_rate``.
        audio_sample_rate: Sample rate of ``audio`` in Hz.
        output_path: Destination media file path.
        video_chunks_number: Total chunk count, used only for the progress bar.

    Raises:
        ValueError: If ``audio`` is provided without ``audio_sample_rate``.
    """
    # Normalize to an iterator so a single tensor follows the streaming path.
    if isinstance(video, torch.Tensor):
        video = iter([video])

    # Peek at the first chunk to learn the output dimensions.
    first_chunk = next(video)

    _, height, width, _ = first_chunk.shape

    container = av.open(output_path, mode="w")
    stream = container.add_stream("libx264", rate=int(fps))
    stream.width = width
    stream.height = height
    stream.pix_fmt = "yuv420p"

    if audio is not None:
        if audio_sample_rate is None:
            raise ValueError("audio_sample_rate is required when audio is provided")

        audio_stream = _prepare_audio_stream(container, audio_sample_rate)

    # Annotations corrected: this yields plain chunk tensors, not (tensor, int) tuples.
    def all_tiles(
        first_chunk: torch.Tensor, tiles_generator: Iterator[torch.Tensor]
    ) -> Generator[torch.Tensor, None, None]:
        # Re-prepend the chunk consumed by the peek above.
        yield first_chunk
        yield from tiles_generator

    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
        video_chunk_cpu = video_chunk.to("cpu").numpy()
        for frame_array in video_chunk_cpu:
            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
            for packet in stream.encode(frame):
                container.mux(packet)

    # Flush encoder
    for packet in stream.encode():
        container.mux(packet)

    if audio is not None:
        _write_audio(container, audio_stream, audio, audio_sample_rate)

    container.close()
230
+
231
+
232
def decode_audio_from_file(path: str, device: torch.device) -> torch.Tensor | None:
    """Decode the first audio stream of a media file into a float32 tensor.

    Args:
        path: Media file to read.
        device: Device the decoded tensor is placed on.

    Returns:
        Tensor of per-frame sample arrays concatenated along a leading frame
        dimension, or ``None`` when the file has no audio stream.
    """
    container = av.open(path)
    try:
        try:
            audio_stream = next(s for s in container.streams if s.type == "audio")
        except StopIteration:
            # No audio track present.
            return None
        frames = [
            torch.tensor(frame.to_ndarray(), dtype=torch.float32, device=device).unsqueeze(0)
            for frame in container.decode(audio_stream)
        ]
        return torch.cat(frames)
    finally:
        # Close exactly once; the original closed the container both inside the
        # try body and again in finally on the success path.
        container.close()
247
+
248
+
249
def decode_video_from_file(path: str, frame_cap: int, device: DeviceLikeType) -> Generator[torch.Tensor]:
    """Lazily decode video frames as uint8 RGB tensors with a leading batch dimension.

    Yields at most ``frame_cap`` frames (a non-positive cap yields every frame);
    the container is closed when the generator is exhausted or discarded.
    """
    remaining = frame_cap
    container = av.open(path)
    try:
        stream = next(s for s in container.streams if s.type == "video")
        for frame in container.decode(stream):
            rgb = frame.to_rgb().to_ndarray()
            yield torch.tensor(rgb, dtype=torch.uint8, device=device).unsqueeze(0)
            remaining -= 1
            if remaining == 0:
                break
    finally:
        container.close()
261
+
262
+
263
def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
    """Encode a single RGB frame as a one-frame H.264 MP4 written to *output_file*."""
    container = av.open(output_file, "w", format="mp4")
    try:
        stream = container.add_stream("libx264", rate=1, options={"crf": str(crf), "preset": "veryfast"})
        # Round to nearest multiple of 2 for compatibility with video codecs
        even_height = image_array.shape[0] // 2 * 2
        even_width = image_array.shape[1] // 2 * 2
        cropped = image_array[:even_height, :even_width]
        stream.height = even_height
        stream.width = even_width
        av_frame = av.VideoFrame.from_ndarray(cropped, format="rgb24").reformat(format="yuv420p")
        container.mux(stream.encode(av_frame))
        # Second encode() call with no frame flushes the encoder.
        container.mux(stream.encode())
    finally:
        container.close()
278
+
279
+
280
def decode_single_frame(video_file: str) -> np.array:
    """Decode and return the first video frame of *video_file* as an RGB ndarray."""
    container = av.open(video_file)
    try:
        video_stream = next(s for s in container.streams if s.type == "video")
        first_frame = next(container.decode(video_stream))
    finally:
        container.close()
    return first_frame.to_ndarray(format="rgb24")
288
+
289
+
290
def preprocess(image: np.array, crf: float = DEFAULT_IMAGE_CRF) -> np.array:
    """Round-trip *image* through an in-memory H.264 encode/decode at the given CRF.

    ``crf == 0`` skips the round-trip and returns the input unchanged.
    """
    if crf == 0:
        return image

    with BytesIO() as encoded_buffer:
        encode_single_frame(encoded_buffer, image, crf)
        compressed_bytes = encoded_buffer.getvalue()
    with BytesIO(compressed_bytes) as decoded_buffer:
        return decode_single_frame(decoded_buffer)
packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import replace
2
+
3
+ import torch
4
+
5
+ from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
6
+ from ltx_core.loader.registry import DummyRegistry, Registry
7
+ from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
8
+ from ltx_core.model.audio_vae import (
9
+ AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
10
+ VOCODER_COMFY_KEYS_FILTER,
11
+ AudioDecoder,
12
+ AudioDecoderConfigurator,
13
+ Vocoder,
14
+ VocoderConfigurator,
15
+ )
16
+ from ltx_core.model.transformer import (
17
+ LTXV_MODEL_COMFY_RENAMING_MAP,
18
+ LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
19
+ UPCAST_DURING_INFERENCE,
20
+ LTXModelConfigurator,
21
+ X0Model,
22
+ )
23
+ from ltx_core.model.upsampler import LatentUpsampler, LatentUpsamplerConfigurator
24
+ from ltx_core.model.video_vae import (
25
+ VAE_DECODER_COMFY_KEYS_FILTER,
26
+ VAE_ENCODER_COMFY_KEYS_FILTER,
27
+ VideoDecoder,
28
+ VideoDecoderConfigurator,
29
+ VideoEncoder,
30
+ VideoEncoderConfigurator,
31
+ )
32
+ from ltx_core.text_encoders.gemma import (
33
+ AV_GEMMA_TEXT_ENCODER_KEY_OPS,
34
+ AVGemmaTextEncoderModel,
35
+ AVGemmaTextEncoderModelConfigurator,
36
+ module_ops_from_gemma_root,
37
+ )
38
+
39
+ from ltx_core.model.audio_vae import (
40
+ AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
41
+ VOCODER_COMFY_KEYS_FILTER,
42
+ AudioDecoder,
43
+ AudioDecoderConfigurator,
44
+ Vocoder,
45
+ VocoderConfigurator,
46
+ AudioEncoder,
47
+ )
48
+ from ltx_core.model.audio_vae.model_configurator import (
49
+ AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
50
+ AudioEncoderConfigurator,
51
+ )
52
+
53
+
54
class ModelLedger:
    """
    Central coordinator for loading and building models used in an LTX pipeline.
    The ledger wires together multiple model builders (transformer, video VAE encoder/decoder,
    audio VAE encoder/decoder, vocoder, text encoder, and optional latent upsampler) and exposes
    factory methods for constructing model instances.
    ### Model Building
    Each model method (e.g. :meth:`transformer`, :meth:`video_decoder`, :meth:`text_encoder`)
    constructs a new model instance on each call. The builder uses the
    :class:`~ltx_core.loader.registry.Registry` to load weights from the checkpoint,
    instantiates the model with the configured ``dtype``, and moves it to ``self.device``.
    .. note::
        Models are **not cached**. Each call to a model method creates a new instance.
        Callers are responsible for storing references to models they wish to reuse
        and for freeing GPU memory (e.g. by deleting references and calling
        ``torch.cuda.empty_cache()``).
    ### Constructor parameters
    dtype:
        Torch dtype used when constructing all models (e.g. ``torch.bfloat16``).
    device:
        Target device to which models are moved after construction (e.g. ``torch.device("cuda")``).
    checkpoint_path:
        Path to a checkpoint directory or file containing the core model weights
        (transformer, video VAE, audio VAE, text encoder, vocoder). If ``None``, the
        corresponding builders are not created and calling those methods will raise
        a :class:`ValueError`.
    gemma_root_path:
        Base path to Gemma-compatible CLIP/text encoder weights. Required to
        initialize the text encoder builder; if omitted, :meth:`text_encoder` cannot be used.
    spatial_upsampler_path:
        Optional path to a latent upsampler checkpoint. If provided, the
        :meth:`spatial_upsampler` method becomes available; otherwise calling it raises
        a :class:`ValueError`.
    loras:
        Optional collection of LoRA configurations (paths, strengths, and key operations)
        that are applied on top of the base transformer weights when building the model.
    registry:
        Optional :class:`Registry` instance for weight caching across builders.
        Defaults to :class:`DummyRegistry` which performs no cross-builder caching.
    fp8transformer:
        If ``True``, builds the transformer with FP8 quantization and upcasting during inference.
    local_files_only:
        Forwarded to :func:`module_ops_from_gemma_root` when building the text encoder
        (presumably restricts weight resolution to local files — confirm with that helper).
    ### Creating Variants
    Use :meth:`with_loras` to create a new ``ModelLedger`` instance that includes
    additional LoRA configurations while sharing the same registry for weight caching.
    """

    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
        checkpoint_path: str | None = None,
        gemma_root_path: str | None = None,
        spatial_upsampler_path: str | None = None,
        loras: LoraPathStrengthAndSDOps | None = None,
        registry: Registry | None = None,
        fp8transformer: bool = False,
        local_files_only: bool = True,
    ):
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.gemma_root_path = gemma_root_path
        self.spatial_upsampler_path = spatial_upsampler_path
        self.loras = loras or ()
        self.registry = registry or DummyRegistry()
        self.fp8transformer = fp8transformer
        self.local_files_only = local_files_only
        self.build_model_builders()

    def build_model_builders(self) -> None:
        """Create the per-model builders for whichever checkpoint paths were provided."""
        if self.checkpoint_path is not None:
            self.transformer_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=LTXModelConfigurator,
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_MAP,
                loras=tuple(self.loras),
                registry=self.registry,
            )

            self.vae_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VideoDecoderConfigurator,
                model_sd_ops=VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vae_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VideoEncoderConfigurator,
                model_sd_ops=VAE_ENCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.audio_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AudioDecoderConfigurator,
                model_sd_ops=AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vocoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VocoderConfigurator,
                model_sd_ops=VOCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.audio_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AudioEncoderConfigurator,
                model_sd_ops=AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

        if self.gemma_root_path is not None:
            # NOTE(review): the text encoder builder still reads weights from
            # checkpoint_path; a ledger built with only gemma_root_path passes
            # model_path=None here — confirm Builder supports that.
            self.text_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AVGemmaTextEncoderModelConfigurator,
                model_sd_ops=AV_GEMMA_TEXT_ENCODER_KEY_OPS,
                registry=self.registry,
                module_ops=module_ops_from_gemma_root(self.gemma_root_path, self.local_files_only),
            )

        if self.spatial_upsampler_path is not None:
            self.upsampler_builder = Builder(
                model_path=self.spatial_upsampler_path,
                model_class_configurator=LatentUpsamplerConfigurator,
                registry=self.registry,
            )

    def _target_device(self) -> torch.device:
        """Device on which weights are materialized before the final ``.to(self.device)`` move.

        With a caching registry, models are first built on CPU so the cached
        weights stay off the accelerator.
        """
        if isinstance(self.registry, DummyRegistry) or self.registry is None:
            return self.device
        else:
            return torch.device("cpu")

    def with_loras(self, loras: LoraPathStrengthAndSDOps) -> "ModelLedger":
        """Return a new ledger with *loras* appended, sharing this ledger's registry.

        All other construction settings are carried over unchanged.
        """
        return ModelLedger(
            dtype=self.dtype,
            device=self.device,
            checkpoint_path=self.checkpoint_path,
            gemma_root_path=self.gemma_root_path,
            spatial_upsampler_path=self.spatial_upsampler_path,
            loras=(*self.loras, *loras),
            registry=self.registry,
            fp8transformer=self.fp8transformer,
            # Fix: previously omitted, silently resetting the derived ledger to
            # the default (True) even when this ledger used local_files_only=False.
            local_files_only=self.local_files_only,
        )

    def transformer(self) -> X0Model:
        """Build the denoising transformer (optionally FP8-quantized), wrapped as an X0Model."""
        if not hasattr(self, "transformer_builder"):
            raise ValueError(
                "Transformer not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )
        if self.fp8transformer:
            # FP8 path: downcast linear weights at load time and upcast during inference.
            fp8_builder = replace(
                self.transformer_builder,
                module_ops=(UPCAST_DURING_INFERENCE,),
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
            )
            return X0Model(fp8_builder.build(device=self._target_device())).to(self.device).eval()
        else:
            return (
                X0Model(self.transformer_builder.build(device=self._target_device(), dtype=self.dtype))
                .to(self.device)
                .eval()
            )

    def audio_encoder(self) -> AudioEncoder:
        """Build the audio VAE encoder."""
        if not hasattr(self, "audio_encoder_builder"):
            raise ValueError(
                "Audio encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )
        return self.audio_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def video_decoder(self) -> VideoDecoder:
        """Build the video VAE decoder."""
        if not hasattr(self, "vae_decoder_builder"):
            raise ValueError(
                "Video decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def video_encoder(self) -> VideoEncoder:
        """Build the video VAE encoder."""
        if not hasattr(self, "vae_encoder_builder"):
            raise ValueError(
                "Video encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def text_encoder(self) -> AVGemmaTextEncoderModel:
        """Build the Gemma-based text encoder."""
        if not hasattr(self, "text_encoder_builder"):
            raise ValueError(
                "Text encoder not initialized. Please provide a checkpoint path and gemma root path to the "
                "ModelLedger constructor."
            )

        return self.text_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def audio_decoder(self) -> AudioDecoder:
        """Build the audio VAE decoder."""
        if not hasattr(self, "audio_decoder_builder"):
            raise ValueError(
                "Audio decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.audio_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def vocoder(self) -> Vocoder:
        """Build the vocoder."""
        if not hasattr(self, "vocoder_builder"):
            raise ValueError(
                "Vocoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vocoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def spatial_upsampler(self) -> LatentUpsampler:
        """Build the latent spatial upsampler."""
        if not hasattr(self, "upsampler_builder"):
            raise ValueError("Upsampler not initialized. Please provide upsampler path to the ModelLedger constructor.")

        return self.upsampler_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()
packages/ltx-pipelines/src/ltx_pipelines/utils/types.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Protocol
2
+
3
+ import torch
4
+
5
+ from ltx_core.components.patchifiers import AudioPatchifier, VideoLatentPatchifier
6
+ from ltx_core.components.protocols import DiffusionStepProtocol
7
+ from ltx_core.types import LatentState
8
+ from ltx_pipelines.utils.constants import VIDEO_LATENT_CHANNELS, VIDEO_SCALE_FACTORS
9
+
10
+
11
class PipelineComponents:
    """
    Container class for pipeline components used throughout the LTX pipelines.
    Attributes:
        dtype (torch.dtype): Default torch dtype for tensors in the pipeline.
        device (torch.device): Target device to place tensors and modules on.
        video_scale_factors (SpatioTemporalScaleFactors): Scale factors (T, H, W) for VAE latent space.
        video_latent_channels (int): Number of channels in the video latent representation.
        video_patchifier (VideoLatentPatchifier): Patchifier instance for video latents.
        audio_patchifier (AudioPatchifier): Patchifier instance for audio latents.
    """

    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
    ):
        # Placement settings shared by the pipeline.
        self.dtype = dtype
        self.device = device

        # Latent-space geometry, taken from the module-level constants.
        self.video_scale_factors = VIDEO_SCALE_FACTORS
        self.video_latent_channels = VIDEO_LATENT_CHANNELS

        # Per-modality patchifiers (patch size fixed at 1).
        self.video_patchifier = VideoLatentPatchifier(patch_size=1)
        self.audio_patchifier = AudioPatchifier(patch_size=1)
36
+
37
+
38
class DenoisingFunc(Protocol):
    """
    Protocol for a denoising function used in the LTX pipeline.
    Implementations perform one denoising evaluation: they receive the full
    sigma schedule plus the index of the current step and return the model's
    denoised predictions for both modalities.
    Args:
        video_state (LatentState): The current latent state for video.
        audio_state (LatentState): The current latent state for audio.
        sigmas (torch.Tensor): A 1D tensor of sigma values for each diffusion step.
        step_index (int): Index of the current denoising step.
    Returns:
        tuple[torch.Tensor, torch.Tensor]: The denoised video and audio tensors.
    """

    def __call__(
        self, video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]: ...
53
+
54
+
55
class DenoisingLoopFunc(Protocol):
    """
    Protocol for a denoising loop function used in the LTX pipeline.
    Args:
        sigmas (torch.Tensor): A 1D tensor of sigma values for each diffusion step.
        video_state (LatentState): The current latent state for video.
        audio_state (LatentState): The current latent state for audio.
        stepper (DiffusionStepProtocol): The diffusion step protocol to use.
    Returns:
        tuple[LatentState | None, LatentState | None]: The denoised video and audio
        latent states; callers treat ``None`` as an aborted loop.
    """

    def __call__(
        self,
        sigmas: torch.Tensor,
        video_state: LatentState,
        audio_state: LatentState,
        stepper: DiffusionStepProtocol,
        # Return annotation corrected: the loop yields LatentState objects
        # (possibly None — denoise_audio_video None-checks them), not raw tensors.
    ) -> tuple[LatentState | None, LatentState | None]: ...