Fabrice-TIERCELIN commited on
Commit
b323cbe
·
verified ·
1 Parent(s): de02dc3

Delete packages/ltx-pipelines/ltx_pipelines

Browse files
packages/ltx-pipelines/ltx_pipelines/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- """
2
- LTX-2 Pipelines: High-level video generation pipelines and utilities.
3
- This package provides ready-to-use pipelines for video generation:
4
- - TI2VidOneStagePipeline: Text/image-to-video in a single stage
5
- - TI2VidTwoStagesPipeline: Two-stage generation with upsampling
6
- - DistilledPipeline: Fast distilled two-stage generation
7
- - ICLoraPipeline: Image/video conditioning with distilled LoRA
8
- - KeyframeInterpolationPipeline: Keyframe-based video interpolation
9
- - ModelLedger: Central coordinator for loading and building models
10
- For more detailed components and utilities, import from specific submodules
11
- like `ltx_pipelines.utils.media_io` or `ltx_pipelines.utils.constants`.
12
- """
13
-
14
- from ltx_pipelines.distilled import DistilledPipeline
15
- from ltx_pipelines.ic_lora import ICLoraPipeline
16
- from ltx_pipelines.keyframe_interpolation import KeyframeInterpolationPipeline
17
- from ltx_pipelines.ti2vid_one_stage import TI2VidOneStagePipeline
18
- from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline
19
-
20
- __all__ = [
21
- "DistilledPipeline",
22
- "ICLoraPipeline",
23
- "KeyframeInterpolationPipeline",
24
- "TI2VidOneStagePipeline",
25
- "TI2VidTwoStagesPipeline",
26
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/distilled.py DELETED
@@ -1,475 +0,0 @@
1
- # Copyright (c) 2025 Lightricks. All rights reserved.
2
- # Created by Amit Pintz.
3
-
4
-
5
- import torch
6
-
7
- from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
- from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
9
- from ltx_core.components.noisers import GaussianNoiser
10
- from ltx_core.components.protocols import DiffusionStepProtocol
11
- from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
12
- from ltx_core.loader import LoraPathStrengthAndSDOps
13
- from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
14
- from ltx_core.model.upsampler import upsample_video
15
- from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunks_number
16
- from ltx_core.model.video_vae import decode_video as vae_decode_video
17
- from ltx_core.text_encoders.gemma import encode_text
18
- from ltx_core.types import LatentState, VideoPixelShape
19
- from ltx_core.tools import LatentTools
20
- from ltx_pipelines import utils
21
- from ltx_pipelines.utils import ModelLedger
22
- from ltx_pipelines.utils.args import default_2_stage_distilled_arg_parser
23
- from ltx_pipelines.utils.constants import (
24
- AUDIO_SAMPLE_RATE,
25
- DEFAULT_LORA_STRENGTH,
26
- DISTILLED_SIGMA_VALUES,
27
- STAGE_2_DISTILLED_SIGMA_VALUES,
28
- )
29
- from ltx_pipelines.utils.helpers import (
30
- assert_resolution,
31
- cleanup_memory,
32
- denoise_audio_video,
33
- euler_denoising_loop,
34
- generate_enhanced_prompt,
35
- get_device,
36
- image_conditionings_by_replacing_latent,
37
- simple_denoising_func,
38
- )
39
- from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
40
- from ltx_pipelines.utils.types import PipelineComponents
41
-
42
- import torchaudio
43
- from ltx_core.model.audio_vae import AudioProcessor
44
- from ltx_core.types import AudioLatentShape, VideoPixelShape
45
-
46
- class AudioConditionByLatent(ConditioningItem):
47
- """
48
- Conditions audio generation by injecting a full latent sequence.
49
- Replaces tokens in the latent state with the provided audio latents,
50
- and sets denoise strength according to the strength parameter.
51
- """
52
-
53
- def __init__(self, latent: torch.Tensor, strength: float):
54
- self.latent = latent
55
- self.strength = strength
56
-
57
- def apply_to(self, latent_state: LatentState, latent_tools: LatentTools) -> LatentState:
58
- if not isinstance(latent_tools.target_shape, AudioLatentShape):
59
- raise ConditioningError("Audio conditioning requires an audio latent target shape.")
60
-
61
- cond_batch, cond_channels, cond_frames, cond_bins = self.latent.shape
62
- tgt_batch, tgt_channels, tgt_frames, tgt_bins = latent_tools.target_shape.to_torch_shape()
63
-
64
- if (cond_batch, cond_channels, cond_frames, cond_bins) != (tgt_batch, tgt_channels, tgt_frames, tgt_bins):
65
- raise ConditioningError(
66
- f"Can't apply audio conditioning item to latent with shape {latent_tools.target_shape}, expected "
67
- f"shape is ({tgt_batch}, {tgt_channels}, {tgt_frames}, {tgt_bins})."
68
- )
69
-
70
- tokens = latent_tools.patchifier.patchify(self.latent)
71
- latent_state = latent_state.clone()
72
- latent_state.latent[:, : tokens.shape[1]] = tokens
73
- latent_state.clean_latent[:, : tokens.shape[1]] = tokens
74
- latent_state.denoise_mask[:, : tokens.shape[1]] = 1.0 - self.strength
75
-
76
- return latent_state
77
-
78
- device = get_device()
79
-
80
-
81
- class DistilledPipeline:
82
- def __init__(
83
- self,
84
- checkpoint_path: str,
85
- gemma_root: str,
86
- spatial_upsampler_path: str,
87
- loras: list[LoraPathStrengthAndSDOps],
88
- device: torch.device = device,
89
- fp8transformer: bool = False,
90
- local_files_only: bool = True,
91
- ):
92
- self.device = device
93
- self.dtype = torch.bfloat16
94
-
95
- self.model_ledger = ModelLedger(
96
- dtype=self.dtype,
97
- device=device,
98
- checkpoint_path=checkpoint_path,
99
- spatial_upsampler_path=spatial_upsampler_path,
100
- gemma_root_path=gemma_root,
101
- loras=loras,
102
- fp8transformer=fp8transformer,
103
- local_files_only=local_files_only
104
- )
105
-
106
- self.pipeline_components = PipelineComponents(
107
- dtype=self.dtype,
108
- device=device,
109
- )
110
-
111
- # Cached models to avoid reloading
112
- self._video_encoder = None
113
- self._transformer = None
114
-
115
- def _build_audio_conditionings_from_waveform(
116
- self,
117
- input_waveform: torch.Tensor,
118
- input_sample_rate: int,
119
- num_frames: int,
120
- fps: float,
121
- strength: float,
122
- ) -> list[AudioConditionByLatent] | None:
123
- strength = float(strength)
124
- if strength <= 0.0:
125
- return None
126
-
127
- # Expect waveform as:
128
- # - (T,) or (C,T) or (B,C,T). Convert to (B,C,T)
129
- waveform = input_waveform
130
- if waveform.ndim == 1:
131
- waveform = waveform.unsqueeze(0).unsqueeze(0)
132
- elif waveform.ndim == 2:
133
- waveform = waveform.unsqueeze(0)
134
- elif waveform.ndim != 3:
135
- raise ValueError(f"input_waveform must be 1D/2D/3D, got shape {tuple(waveform.shape)}")
136
-
137
- # Get audio encoder + its config
138
- audio_encoder = self.model_ledger.audio_encoder() # assumes ledger exposes it
139
- # If you want to cache it like video_encoder/transformer, you can.
140
- target_sr = int(getattr(audio_encoder, "sample_rate"))
141
- target_channels = int(getattr(audio_encoder, "in_channels", waveform.shape[1]))
142
- mel_bins = int(getattr(audio_encoder, "mel_bins"))
143
- mel_hop = int(getattr(audio_encoder, "mel_hop_length"))
144
- n_fft = int(getattr(audio_encoder, "n_fft"))
145
-
146
- # Match channels
147
- if waveform.shape[1] != target_channels:
148
- if waveform.shape[1] == 1 and target_channels > 1:
149
- waveform = waveform.repeat(1, target_channels, 1)
150
- elif target_channels == 1:
151
- waveform = waveform.mean(dim=1, keepdim=True)
152
- else:
153
- waveform = waveform[:, :target_channels, :]
154
- if waveform.shape[1] < target_channels:
155
- pad_ch = target_channels - waveform.shape[1]
156
- pad = torch.zeros((waveform.shape[0], pad_ch, waveform.shape[2]), dtype=waveform.dtype)
157
- waveform = torch.cat([waveform, pad], dim=1)
158
-
159
- # Resample if needed (CPU float32 is safest for torchaudio)
160
- waveform = waveform.to(device="cpu", dtype=torch.float32)
161
- if int(input_sample_rate) != target_sr:
162
- waveform = torchaudio.functional.resample(waveform, int(input_sample_rate), target_sr)
163
-
164
- # Waveform -> Mel
165
- audio_processor = AudioProcessor(
166
- sample_rate=target_sr,
167
- mel_bins=mel_bins,
168
- mel_hop_length=mel_hop,
169
- n_fft=n_fft,
170
- ).to(waveform.device)
171
-
172
- mel = audio_processor.waveform_to_mel(waveform, target_sr)
173
-
174
- # Mel -> latent (run encoder on its own device/dtype)
175
- audio_params = next(audio_encoder.parameters(), None)
176
- enc_device = audio_params.device if audio_params is not None else self.device
177
- enc_dtype = audio_params.dtype if audio_params is not None else self.dtype
178
-
179
- mel = mel.to(device=enc_device, dtype=enc_dtype)
180
- with torch.inference_mode():
181
- audio_latent = audio_encoder(mel)
182
-
183
- # Pad/trim latent to match the target video duration
184
- audio_downsample = getattr(getattr(audio_encoder, "patchifier", None), "audio_latent_downsample_factor", 4)
185
- target_shape = AudioLatentShape.from_video_pixel_shape(
186
- VideoPixelShape(batch=audio_latent.shape[0], frames=int(num_frames), width=1, height=1, fps=float(fps)),
187
- channels=audio_latent.shape[1],
188
- mel_bins=audio_latent.shape[3],
189
- sample_rate=target_sr,
190
- hop_length=mel_hop,
191
- audio_latent_downsample_factor=audio_downsample,
192
- )
193
- target_frames = int(target_shape.frames)
194
-
195
- if audio_latent.shape[2] < target_frames:
196
- pad_frames = target_frames - audio_latent.shape[2]
197
- pad = torch.zeros(
198
- (audio_latent.shape[0], audio_latent.shape[1], pad_frames, audio_latent.shape[3]),
199
- device=audio_latent.device,
200
- dtype=audio_latent.dtype,
201
- )
202
- audio_latent = torch.cat([audio_latent, pad], dim=2)
203
- elif audio_latent.shape[2] > target_frames:
204
- audio_latent = audio_latent[:, :, :target_frames, :]
205
-
206
- # Move latent to pipeline device/dtype for conditioning object
207
- audio_latent = audio_latent.to(device=self.device, dtype=self.dtype)
208
-
209
- return [AudioConditionByLatent(audio_latent, strength)]
210
-
211
- def _prepare_output_waveform(
212
- self,
213
- input_waveform: torch.Tensor,
214
- input_sample_rate: int,
215
- target_sample_rate: int,
216
- num_frames: int,
217
- fps: float,
218
- ) -> torch.Tensor:
219
- """
220
- Returns waveform on CPU, float32, resampled to target_sample_rate and
221
- trimmed/padded to match video duration.
222
- Output shape: (T,) for mono or (C, T) for multi-channel.
223
- """
224
- wav = input_waveform
225
-
226
- # Accept (T,), (C,T), (B,C,T)
227
- if wav.ndim == 3:
228
- wav = wav[0]
229
- elif wav.ndim == 2:
230
- pass
231
- elif wav.ndim == 1:
232
- wav = wav.unsqueeze(0)
233
- else:
234
- raise ValueError(f"input_waveform must be 1D/2D/3D, got {tuple(wav.shape)}")
235
-
236
- # Now wav is (C, T)
237
- wav = wav.detach().to("cpu", dtype=torch.float32)
238
-
239
- # Resample if needed
240
- if int(input_sample_rate) != int(target_sample_rate):
241
- wav = torchaudio.functional.resample(wav, int(input_sample_rate), int(target_sample_rate))
242
-
243
- # Match video duration
244
- duration_sec = float(num_frames) / float(fps)
245
- target_len = int(round(duration_sec * float(target_sample_rate)))
246
-
247
- cur_len = int(wav.shape[-1])
248
- if cur_len > target_len:
249
- wav = wav[..., :target_len]
250
- elif cur_len < target_len:
251
- pad = target_len - cur_len
252
- wav = torch.nn.functional.pad(wav, (0, pad))
253
-
254
- # If mono, return (T,) for convenience
255
- if wav.shape[0] == 1:
256
- return wav[0]
257
- return wav
258
-
259
-
260
- @torch.inference_mode()
261
- def __call__(
262
- self,
263
- prompt: str,
264
- output_path: str,
265
- seed: int,
266
- height: int,
267
- width: int,
268
- num_frames: int,
269
- frame_rate: float,
270
- images: list[tuple[str, int, float]],
271
- video_conditioning: list[tuple[str, float]] | None = None,
272
- video_conditioning_frame_idx: int = 0,
273
- apply_video_conditioning_to_stage2: bool = False,
274
- tiling_config: TilingConfig | None = None,
275
- video_context: torch.Tensor | None = None,
276
- audio_context: torch.Tensor | None = None,
277
- input_waveform: torch.Tensor | None = None,
278
- input_waveform_sample_rate: int | None = None,
279
- audio_strength: float = 1.0, # or audio_scale, your naming
280
- ) -> None:
281
- generator = torch.Generator(device=self.device).manual_seed(seed)
282
- noiser = GaussianNoiser(generator=generator)
283
- stepper = EulerDiffusionStep()
284
- dtype = torch.bfloat16
285
-
286
- audio_conditionings = None
287
- if input_waveform is not None:
288
- if input_waveform_sample_rate is None:
289
- raise ValueError("input_waveform_sample_rate must be provided when input_waveform is set.")
290
- audio_conditionings = self._build_audio_conditionings_from_waveform(
291
- input_waveform=input_waveform,
292
- input_sample_rate=int(input_waveform_sample_rate),
293
- num_frames=num_frames,
294
- fps=frame_rate,
295
- strength=audio_strength,
296
- )
297
-
298
- # Use pre-computed embeddings if provided, otherwise encode text
299
- if video_context is None or audio_context is None:
300
- text_encoder = self.model_ledger.text_encoder()
301
- context_p = encode_text(text_encoder, prompts=[prompt])[0]
302
- video_context, audio_context = context_p
303
-
304
- torch.cuda.synchronize()
305
- del text_encoder
306
- utils.cleanup_memory()
307
- else:
308
- # Move pre-computed embeddings to device if needed
309
- video_context = video_context.to(self.device)
310
- audio_context = audio_context.to(self.device)
311
-
312
- # Stage 1: Initial low resolution video generation.
313
- # Load models only if not already cached
314
- if self._video_encoder is None:
315
- self._video_encoder = self.model_ledger.video_encoder()
316
- video_encoder = self._video_encoder
317
-
318
- if self._transformer is None:
319
- self._transformer = self.model_ledger.transformer()
320
- transformer = self._transformer
321
- stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
322
-
323
- def denoising_loop(
324
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
325
- ) -> tuple[LatentState, LatentState]:
326
- return euler_denoising_loop(
327
- sigmas=sigmas,
328
- video_state=video_state,
329
- audio_state=audio_state,
330
- stepper=stepper,
331
- denoise_fn=simple_denoising_func(
332
- video_context=video_context,
333
- audio_context=audio_context,
334
- transformer=transformer, # noqa: F821
335
- ),
336
- )
337
-
338
- stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
339
- stage_1_conditionings = self._create_conditionings(
340
- images=images,
341
- video_conditioning=video_conditioning,
342
- height=stage_1_output_shape.height,
343
- width=stage_1_output_shape.width,
344
- num_frames=num_frames,
345
- video_encoder=video_encoder,
346
- video_conditioning_frame_idx=video_conditioning_frame_idx,
347
- dtype=dtype,
348
- )
349
-
350
- video_state, audio_state = denoise_audio_video(
351
- output_shape=stage_1_output_shape,
352
- conditionings=stage_1_conditionings,
353
- audio_conditionings=audio_conditionings,
354
- noiser=noiser,
355
- sigmas=stage_1_sigmas,
356
- stepper=stepper,
357
- denoising_loop_fn=denoising_loop,
358
- components=self.pipeline_components,
359
- dtype=dtype,
360
- device=self.device,
361
- )
362
-
363
- # Stage 2: Upsample and refine the video at higher resolution with distilled LORA.
364
- upscaled_video_latent = upsample_video(
365
- latent=video_state.latent[:1], video_encoder=video_encoder, upsampler=self.model_ledger.spatial_upsampler()
366
- )
367
-
368
- torch.cuda.synchronize()
369
- cleanup_memory()
370
-
371
- stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
372
- stage_2_output_shape = VideoPixelShape(
373
- batch=1, frames=num_frames, width=width * 2, height=height * 2, fps=frame_rate
374
- )
375
- if apply_video_conditioning_to_stage2:
376
- stage_2_conditionings = self._create_conditionings(
377
- images=images,
378
- video_conditioning=video_conditioning,
379
- height=stage_2_output_shape.height,
380
- width=stage_2_output_shape.width,
381
- num_frames=num_frames,
382
- video_encoder=video_encoder,
383
- video_conditioning_frame_idx=video_conditioning_frame_idx,
384
- )
385
- else:
386
- stage_2_conditionings = image_conditionings_by_replacing_latent(
387
- images=images,
388
- height=stage_2_output_shape.height,
389
- width=stage_2_output_shape.width,
390
- video_encoder=video_encoder,
391
- dtype=dtype,
392
- device=self.device,
393
- )
394
- video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
395
- video_state, audio_state = denoise_audio_video(
396
- output_shape=stage_2_output_shape,
397
- conditionings=stage_2_conditionings,
398
- audio_conditionings=audio_conditionings,
399
- noiser=noiser,
400
- sigmas=stage_2_sigmas,
401
- stepper=stepper,
402
- denoising_loop_fn=denoising_loop,
403
- components=self.pipeline_components,
404
- dtype=dtype,
405
- device=self.device,
406
- noise_scale=stage_2_sigmas[0],
407
- initial_video_latent=upscaled_video_latent,
408
- initial_audio_latent=audio_state.latent,
409
- )
410
-
411
- torch.cuda.synchronize()
412
- # del transformer
413
- # del video_encoder
414
- # utils.cleanup_memory()
415
-
416
- decoded_video = vae_decode_video(video_state.latent, self.model_ledger.video_decoder(), tiling_config)
417
- decoded_audio = vae_decode_audio(audio_state.latent, self.model_ledger.audio_decoder(), self.model_ledger.vocoder())
418
-
419
- encode_video(
420
- video=decoded_video,
421
- fps=frame_rate,
422
- audio=decoded_audio,
423
- audio_sample_rate=AUDIO_SAMPLE_RATE,
424
- output_path=output_path,
425
- video_chunks_number=video_chunks_number,
426
- )
427
-
428
-
429
- def _create_conditionings(
430
- self,
431
- images: list[tuple[str, int, float]],
432
- video_conditioning: list[tuple[str, float]] | None,
433
- height: int,
434
- width: int,
435
- num_frames: int,
436
- video_encoder,
437
- video_conditioning_frame_idx: int,
438
- dtype: torch.dtype,
439
- ):
440
- # 1) Keep ORIGINAL behavior: image conditioning by replacing latent
441
- conditionings = image_conditionings_by_replacing_latent(
442
- images=images,
443
- height=height,
444
- width=width,
445
- video_encoder=video_encoder,
446
- dtype=dtype,
447
- device=self.device,
448
- )
449
-
450
- # 2) Optional: add video conditioning (IC-LoRA style)
451
- if not video_conditioning:
452
- return conditionings
453
-
454
- for video_path, strength in video_conditioning:
455
- video = load_video_conditioning(
456
- video_path=video_path,
457
- height=height,
458
- width=width,
459
- frame_cap=num_frames, # ✅ correct kwarg name
460
- dtype=dtype,
461
- device=self.device,
462
- )
463
-
464
- encoded_video = video_encoder(video)
465
-
466
- # ✅ match IC-LoRA: append the conditioning object directly
467
- conditionings.append(
468
- VideoConditionByKeyframeIndex(
469
- keyframes=encoded_video,
470
- frame_idx=video_conditioning_frame_idx,
471
- strength=strength,
472
- )
473
- )
474
-
475
- return conditionings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/ic_lora.py DELETED
@@ -1,309 +0,0 @@
1
- import logging
2
- from collections.abc import Iterator
3
-
4
- import torch
5
-
6
- from ltx_core.components.diffusion_steps import EulerDiffusionStep
7
- from ltx_core.components.noisers import GaussianNoiser
8
- from ltx_core.components.protocols import DiffusionStepProtocol
9
- from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex
10
- from ltx_core.loader import LoraPathStrengthAndSDOps
11
- from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
12
- from ltx_core.model.upsampler import upsample_video
13
- from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunks_number
14
- from ltx_core.model.video_vae import decode_video as vae_decode_video
15
- from ltx_core.text_encoders.gemma import encode_text
16
- from ltx_core.types import LatentState, VideoPixelShape
17
- from ltx_pipelines.utils import ModelLedger
18
- from ltx_pipelines.utils.args import VideoConditioningAction, default_2_stage_distilled_arg_parser
19
- from ltx_pipelines.utils.constants import (
20
- AUDIO_SAMPLE_RATE,
21
- DISTILLED_SIGMA_VALUES,
22
- STAGE_2_DISTILLED_SIGMA_VALUES,
23
- )
24
- from ltx_pipelines.utils.helpers import (
25
- assert_resolution,
26
- cleanup_memory,
27
- denoise_audio_video,
28
- euler_denoising_loop,
29
- generate_enhanced_prompt,
30
- get_device,
31
- image_conditionings_by_replacing_latent,
32
- simple_denoising_func,
33
- )
34
- from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
35
- from ltx_pipelines.utils.types import PipelineComponents
36
-
37
- device = get_device()
38
-
39
-
40
- class ICLoraPipeline:
41
- """
42
- Two-stage video generation pipeline with In-Context (IC) LoRA support.
43
- Allows conditioning the generated video on control signals such as depth maps,
44
- human pose, or image edges via the video_conditioning parameter.
45
- The specific IC-LoRA model should be provided via the loras parameter.
46
- Stage 1 generates video at the target resolution, then Stage 2 upsamples
47
- by 2x and refines with additional denoising steps for higher quality output.
48
- """
49
-
50
- def __init__(
51
- self,
52
- checkpoint_path: str,
53
- spatial_upsampler_path: str,
54
- gemma_root: str,
55
- loras: list[LoraPathStrengthAndSDOps],
56
- device: torch.device = device,
57
- fp8transformer: bool = False,
58
- ):
59
- self.dtype = torch.bfloat16
60
- self.stage_1_model_ledger = ModelLedger(
61
- dtype=self.dtype,
62
- device=device,
63
- checkpoint_path=checkpoint_path,
64
- spatial_upsampler_path=spatial_upsampler_path,
65
- gemma_root_path=gemma_root,
66
- loras=loras,
67
- fp8transformer=fp8transformer,
68
- )
69
- self.stage_2_model_ledger = ModelLedger(
70
- dtype=self.dtype,
71
- device=device,
72
- checkpoint_path=checkpoint_path,
73
- spatial_upsampler_path=spatial_upsampler_path,
74
- gemma_root_path=gemma_root,
75
- loras=[],
76
- fp8transformer=fp8transformer,
77
- )
78
- self.pipeline_components = PipelineComponents(
79
- dtype=self.dtype,
80
- device=device,
81
- )
82
- self.device = device
83
-
84
- @torch.inference_mode()
85
- def __call__(
86
- self,
87
- prompt: str,
88
- seed: int,
89
- height: int,
90
- width: int,
91
- num_frames: int,
92
- frame_rate: float,
93
- images: list[tuple[str, int, float]],
94
- video_conditioning: list[tuple[str, float]],
95
- enhance_prompt: bool = False,
96
- tiling_config: TilingConfig | None = None,
97
- ) -> tuple[Iterator[torch.Tensor], torch.Tensor]:
98
- assert_resolution(height=height, width=width, is_two_stage=True)
99
-
100
- generator = torch.Generator(device=self.device).manual_seed(seed)
101
- noiser = GaussianNoiser(generator=generator)
102
- stepper = EulerDiffusionStep()
103
- dtype = torch.bfloat16
104
-
105
- text_encoder = self.stage_1_model_ledger.text_encoder()
106
-
107
- if enhance_prompt:
108
- prompt = generate_enhanced_prompt(
109
- text_encoder, prompt, images[0][0] if len(images) > 0 else None, seed=seed
110
- )
111
- video_context, audio_context = encode_text(text_encoder, prompts=[prompt])[0]
112
-
113
- torch.cuda.synchronize()
114
- del text_encoder
115
- cleanup_memory()
116
-
117
- # Stage 1: Initial low resolution video generation.
118
- video_encoder = self.stage_1_model_ledger.video_encoder()
119
- transformer = self.stage_1_model_ledger.transformer()
120
- stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
121
-
122
- def first_stage_denoising_loop(
123
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
124
- ) -> tuple[LatentState, LatentState]:
125
- return euler_denoising_loop(
126
- sigmas=sigmas,
127
- video_state=video_state,
128
- audio_state=audio_state,
129
- stepper=stepper,
130
- denoise_fn=simple_denoising_func(
131
- video_context=video_context,
132
- audio_context=audio_context,
133
- transformer=transformer, # noqa: F821
134
- ),
135
- )
136
-
137
- stage_1_output_shape = VideoPixelShape(
138
- batch=1,
139
- frames=num_frames,
140
- width=width // 2,
141
- height=height // 2,
142
- fps=frame_rate,
143
- )
144
- stage_1_conditionings = self._create_conditionings(
145
- images=images,
146
- video_conditioning=video_conditioning,
147
- height=stage_1_output_shape.height,
148
- width=stage_1_output_shape.width,
149
- video_encoder=video_encoder,
150
- num_frames=num_frames,
151
- )
152
- video_state, audio_state = denoise_audio_video(
153
- output_shape=stage_1_output_shape,
154
- conditionings=stage_1_conditionings,
155
- noiser=noiser,
156
- sigmas=stage_1_sigmas,
157
- stepper=stepper,
158
- denoising_loop_fn=first_stage_denoising_loop,
159
- components=self.pipeline_components,
160
- dtype=dtype,
161
- device=self.device,
162
- )
163
-
164
- torch.cuda.synchronize()
165
- del transformer
166
- cleanup_memory()
167
-
168
- # Stage 2: Upsample and refine the video at higher resolution with distilled LORA.
169
- upscaled_video_latent = upsample_video(
170
- latent=video_state.latent[:1],
171
- video_encoder=video_encoder,
172
- upsampler=self.stage_2_model_ledger.spatial_upsampler(),
173
- )
174
-
175
- torch.cuda.synchronize()
176
- cleanup_memory()
177
-
178
- transformer = self.stage_2_model_ledger.transformer()
179
- distilled_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
180
-
181
- def second_stage_denoising_loop(
182
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
183
- ) -> tuple[LatentState, LatentState]:
184
- return euler_denoising_loop(
185
- sigmas=sigmas,
186
- video_state=video_state,
187
- audio_state=audio_state,
188
- stepper=stepper,
189
- denoise_fn=simple_denoising_func(
190
- video_context=video_context,
191
- audio_context=audio_context,
192
- transformer=transformer, # noqa: F821
193
- ),
194
- )
195
-
196
- stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
197
- stage_2_conditionings = image_conditionings_by_replacing_latent(
198
- images=images,
199
- height=stage_2_output_shape.height,
200
- width=stage_2_output_shape.width,
201
- video_encoder=video_encoder,
202
- dtype=self.dtype,
203
- device=self.device,
204
- )
205
-
206
- video_state, audio_state = denoise_audio_video(
207
- output_shape=stage_2_output_shape,
208
- conditionings=stage_2_conditionings,
209
- noiser=noiser,
210
- sigmas=distilled_sigmas,
211
- stepper=stepper,
212
- denoising_loop_fn=second_stage_denoising_loop,
213
- components=self.pipeline_components,
214
- dtype=dtype,
215
- device=self.device,
216
- noise_scale=distilled_sigmas[0],
217
- initial_video_latent=upscaled_video_latent,
218
- initial_audio_latent=audio_state.latent,
219
- )
220
-
221
- torch.cuda.synchronize()
222
- del transformer
223
- del video_encoder
224
- cleanup_memory()
225
-
226
- decoded_video = vae_decode_video(video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config)
227
- decoded_audio = vae_decode_audio(
228
- audio_state.latent, self.stage_2_model_ledger.audio_decoder(), self.stage_2_model_ledger.vocoder()
229
- )
230
- return decoded_video, decoded_audio
231
-
232
- def _create_conditionings(
233
- self,
234
- images: list[tuple[str, int, float]],
235
- video_conditioning: list[tuple[str, float]],
236
- height: int,
237
- width: int,
238
- num_frames: int,
239
- video_encoder: VideoEncoder,
240
- ) -> list[ConditioningItem]:
241
- conditionings = image_conditionings_by_replacing_latent(
242
- images=images,
243
- height=height,
244
- width=width,
245
- video_encoder=video_encoder,
246
- dtype=self.dtype,
247
- device=self.device,
248
- )
249
-
250
- for video_path, strength in video_conditioning:
251
- video = load_video_conditioning(
252
- video_path=video_path,
253
- height=height,
254
- width=width,
255
- frame_cap=num_frames,
256
- dtype=self.dtype,
257
- device=self.device,
258
- )
259
- encoded_video = video_encoder(video)
260
- conditionings.append(VideoConditionByKeyframeIndex(keyframes=encoded_video, frame_idx=0, strength=strength))
261
-
262
- return conditionings
263
-
264
-
265
- @torch.inference_mode()
266
- def main() -> None:
267
- logging.getLogger().setLevel(logging.INFO)
268
- parser = default_2_stage_distilled_arg_parser()
269
- parser.add_argument(
270
- "--video-conditioning",
271
- action=VideoConditioningAction,
272
- nargs=2,
273
- metavar=("PATH", "STRENGTH"),
274
- required=True,
275
- )
276
- args = parser.parse_args()
277
- pipeline = ICLoraPipeline(
278
- checkpoint_path=args.checkpoint_path,
279
- spatial_upsampler_path=args.spatial_upsampler_path,
280
- gemma_root=args.gemma_root,
281
- loras=args.lora,
282
- fp8transformer=args.enable_fp8,
283
- )
284
- tiling_config = TilingConfig.default()
285
- video_chunks_number = get_video_chunks_number(args.num_frames, tiling_config)
286
- video, audio = pipeline(
287
- prompt=args.prompt,
288
- seed=args.seed,
289
- height=args.height,
290
- width=args.width,
291
- num_frames=args.num_frames,
292
- frame_rate=args.frame_rate,
293
- images=args.images,
294
- video_conditioning=args.video_conditioning,
295
- tiling_config=tiling_config,
296
- )
297
-
298
- encode_video(
299
- video=video,
300
- fps=args.frame_rate,
301
- audio=audio,
302
- audio_sample_rate=AUDIO_SAMPLE_RATE,
303
- output_path=args.output_path,
304
- video_chunks_number=video_chunks_number,
305
- )
306
-
307
-
308
- if __name__ == "__main__":
309
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/keyframe_interpolation.py DELETED
@@ -1,273 +0,0 @@
1
- import logging
2
- from collections.abc import Iterator
3
-
4
- import torch
5
-
6
- from ltx_core.components.diffusion_steps import EulerDiffusionStep
7
- from ltx_core.components.guiders import CFGGuider
8
- from ltx_core.components.noisers import GaussianNoiser
9
- from ltx_core.components.protocols import DiffusionStepProtocol
10
- from ltx_core.components.schedulers import LTX2Scheduler
11
- from ltx_core.loader import LoraPathStrengthAndSDOps
12
- from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
13
- from ltx_core.model.upsampler import upsample_video
14
- from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
15
- from ltx_core.model.video_vae import decode_video as vae_decode_video
16
- from ltx_core.text_encoders.gemma import encode_text
17
- from ltx_core.types import LatentState, VideoPixelShape
18
- from ltx_pipelines.utils import ModelLedger
19
- from ltx_pipelines.utils.args import default_2_stage_arg_parser
20
- from ltx_pipelines.utils.constants import (
21
- AUDIO_SAMPLE_RATE,
22
- STAGE_2_DISTILLED_SIGMA_VALUES,
23
- )
24
- from ltx_pipelines.utils.helpers import (
25
- assert_resolution,
26
- cleanup_memory,
27
- denoise_audio_video,
28
- euler_denoising_loop,
29
- generate_enhanced_prompt,
30
- get_device,
31
- guider_denoising_func,
32
- image_conditionings_by_adding_guiding_latent,
33
- simple_denoising_func,
34
- )
35
- from ltx_pipelines.utils.media_io import encode_video
36
- from ltx_pipelines.utils.types import PipelineComponents
37
-
38
- device = get_device()
39
-
40
-
41
- class KeyframeInterpolationPipeline:
42
- """
43
- Keyframe-based Two-stage video interpolation pipeline.
44
- Interpolates between keyframes to generate a video with smoother transitions.
45
- Stage 1 generates video at the target resolution, then Stage 2 upsamples
46
- by 2x and refines with additional denoising steps for higher quality output.
47
- """
48
-
49
- def __init__(
50
- self,
51
- checkpoint_path: str,
52
- distilled_lora: list[LoraPathStrengthAndSDOps],
53
- spatial_upsampler_path: str,
54
- gemma_root: str,
55
- loras: list[LoraPathStrengthAndSDOps],
56
- device: torch.device = device,
57
- fp8transformer: bool = False,
58
- ):
59
- self.device = device
60
- self.dtype = torch.bfloat16
61
- self.stage_1_model_ledger = ModelLedger(
62
- dtype=self.dtype,
63
- device=device,
64
- checkpoint_path=checkpoint_path,
65
- spatial_upsampler_path=spatial_upsampler_path,
66
- gemma_root_path=gemma_root,
67
- loras=loras,
68
- fp8transformer=fp8transformer,
69
- )
70
- self.stage_2_model_ledger = self.stage_1_model_ledger.with_loras(
71
- loras=distilled_lora,
72
- )
73
- self.pipeline_components = PipelineComponents(
74
- dtype=self.dtype,
75
- device=device,
76
- )
77
-
78
- @torch.inference_mode()
79
- def __call__( # noqa: PLR0913
80
- self,
81
- prompt: str,
82
- negative_prompt: str,
83
- seed: int,
84
- height: int,
85
- width: int,
86
- num_frames: int,
87
- frame_rate: float,
88
- num_inference_steps: int,
89
- cfg_guidance_scale: float,
90
- images: list[tuple[str, int, float]],
91
- tiling_config: TilingConfig | None = None,
92
- enhance_prompt: bool = False,
93
- ) -> tuple[Iterator[torch.Tensor], torch.Tensor]:
94
- assert_resolution(height=height, width=width, is_two_stage=True)
95
-
96
- generator = torch.Generator(device=self.device).manual_seed(seed)
97
- noiser = GaussianNoiser(generator=generator)
98
- stepper = EulerDiffusionStep()
99
- cfg_guider = CFGGuider(cfg_guidance_scale)
100
- dtype = torch.bfloat16
101
-
102
- text_encoder = self.stage_1_model_ledger.text_encoder()
103
- if enhance_prompt:
104
- prompt = generate_enhanced_prompt(
105
- text_encoder, prompt, images[0][0] if len(images) > 0 else None, seed=seed
106
- )
107
- context_p, context_n = encode_text(text_encoder, prompts=[prompt, negative_prompt])
108
- v_context_p, a_context_p = context_p
109
- v_context_n, a_context_n = context_n
110
-
111
- torch.cuda.synchronize()
112
- del text_encoder
113
- cleanup_memory()
114
-
115
- # Stage 1: Initial low resolution video generation.
116
- video_encoder = self.stage_1_model_ledger.video_encoder()
117
- transformer = self.stage_1_model_ledger.transformer()
118
- sigmas = LTX2Scheduler().execute(steps=num_inference_steps).to(dtype=torch.float32, device=self.device)
119
-
120
- def first_stage_denoising_loop(
121
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
122
- ) -> tuple[LatentState, LatentState]:
123
- return euler_denoising_loop(
124
- sigmas=sigmas,
125
- video_state=video_state,
126
- audio_state=audio_state,
127
- stepper=stepper,
128
- denoise_fn=guider_denoising_func(
129
- cfg_guider,
130
- v_context_p,
131
- v_context_n,
132
- a_context_p,
133
- a_context_n,
134
- transformer=transformer, # noqa: F821
135
- ),
136
- )
137
-
138
- stage_1_output_shape = VideoPixelShape(
139
- batch=1,
140
- frames=num_frames,
141
- width=width // 2,
142
- height=height // 2,
143
- fps=frame_rate,
144
- )
145
- stage_1_conditionings = image_conditionings_by_adding_guiding_latent(
146
- images=images,
147
- height=stage_1_output_shape.height,
148
- width=stage_1_output_shape.width,
149
- video_encoder=video_encoder,
150
- dtype=dtype,
151
- device=self.device,
152
- )
153
- video_state, audio_state = denoise_audio_video(
154
- output_shape=stage_1_output_shape,
155
- conditionings=stage_1_conditionings,
156
- noiser=noiser,
157
- sigmas=sigmas,
158
- stepper=stepper,
159
- denoising_loop_fn=first_stage_denoising_loop,
160
- components=self.pipeline_components,
161
- dtype=dtype,
162
- device=self.device,
163
- )
164
-
165
- torch.cuda.synchronize()
166
- del transformer
167
- cleanup_memory()
168
-
169
- # Stage 2: Upsample and refine the video at higher resolution with distilled LORA.
170
- upscaled_video_latent = upsample_video(
171
- latent=video_state.latent[:1],
172
- video_encoder=video_encoder,
173
- upsampler=self.stage_2_model_ledger.spatial_upsampler(),
174
- )
175
-
176
- torch.cuda.synchronize()
177
- cleanup_memory()
178
-
179
- transformer = self.stage_2_model_ledger.transformer()
180
- distilled_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
181
-
182
- def second_stage_denoising_loop(
183
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
184
- ) -> tuple[LatentState, LatentState]:
185
- return euler_denoising_loop(
186
- sigmas=sigmas,
187
- video_state=video_state,
188
- audio_state=audio_state,
189
- stepper=stepper,
190
- denoise_fn=simple_denoising_func(
191
- video_context=v_context_p,
192
- audio_context=a_context_p,
193
- transformer=transformer, # noqa: F821
194
- ),
195
- )
196
-
197
- stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
198
- stage_2_conditionings = image_conditionings_by_adding_guiding_latent(
199
- images=images,
200
- height=stage_2_output_shape.height,
201
- width=stage_2_output_shape.width,
202
- video_encoder=video_encoder,
203
- dtype=dtype,
204
- device=self.device,
205
- )
206
- video_state, audio_state = denoise_audio_video(
207
- output_shape=stage_2_output_shape,
208
- conditionings=stage_2_conditionings,
209
- noiser=noiser,
210
- sigmas=distilled_sigmas,
211
- stepper=stepper,
212
- denoising_loop_fn=second_stage_denoising_loop,
213
- components=self.pipeline_components,
214
- dtype=dtype,
215
- device=self.device,
216
- noise_scale=distilled_sigmas[0],
217
- initial_video_latent=upscaled_video_latent,
218
- initial_audio_latent=audio_state.latent,
219
- )
220
-
221
- torch.cuda.synchronize()
222
- del transformer
223
- del video_encoder
224
- cleanup_memory()
225
-
226
- decoded_video = vae_decode_video(video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config)
227
- decoded_audio = vae_decode_audio(
228
- audio_state.latent, self.stage_2_model_ledger.audio_decoder(), self.stage_2_model_ledger.vocoder()
229
- )
230
- return decoded_video, decoded_audio
231
-
232
-
233
- @torch.inference_mode()
234
- def main() -> None:
235
- logging.getLogger().setLevel(logging.INFO)
236
- parser = default_2_stage_arg_parser()
237
- args = parser.parse_args()
238
- pipeline = KeyframeInterpolationPipeline(
239
- checkpoint_path=args.checkpoint_path,
240
- distilled_lora=args.distilled_lora,
241
- spatial_upsampler_path=args.spatial_upsampler_path,
242
- gemma_root=args.gemma_root,
243
- loras=args.lora,
244
- fp8transformer=args.enable_fp8,
245
- )
246
- tiling_config = TilingConfig.default()
247
- video_chunks_number = get_video_chunks_number(args.num_frames, tiling_config)
248
- video, audio = pipeline(
249
- prompt=args.prompt,
250
- negative_prompt=args.negative_prompt,
251
- seed=args.seed,
252
- height=args.height,
253
- width=args.width,
254
- num_frames=args.num_frames,
255
- frame_rate=args.frame_rate,
256
- num_inference_steps=args.num_inference_steps,
257
- cfg_guidance_scale=args.cfg_guidance_scale,
258
- images=args.images,
259
- tiling_config=tiling_config,
260
- )
261
-
262
- encode_video(
263
- video=video,
264
- fps=args.frame_rate,
265
- audio=audio,
266
- audio_sample_rate=AUDIO_SAMPLE_RATE,
267
- output_path=args.output_path,
268
- video_chunks_number=video_chunks_number,
269
- )
270
-
271
-
272
- if __name__ == "__main__":
273
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/ti2vid_one_stage.py DELETED
@@ -1,193 +0,0 @@
1
- import logging
2
- from collections.abc import Iterator
3
-
4
- import torch
5
-
6
- from ltx_core.components.diffusion_steps import EulerDiffusionStep
7
- from ltx_core.components.guiders import CFGGuider
8
- from ltx_core.components.noisers import GaussianNoiser
9
- from ltx_core.components.protocols import DiffusionStepProtocol
10
- from ltx_core.components.schedulers import LTX2Scheduler
11
- from ltx_core.loader import LoraPathStrengthAndSDOps
12
- from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
13
- from ltx_core.model.video_vae import decode_video as vae_decode_video
14
- from ltx_core.text_encoders.gemma import encode_text
15
- from ltx_core.types import LatentState, VideoPixelShape
16
- from ltx_pipelines.utils import ModelLedger
17
- from ltx_pipelines.utils.args import default_1_stage_arg_parser
18
- from ltx_pipelines.utils.constants import AUDIO_SAMPLE_RATE
19
- from ltx_pipelines.utils.helpers import (
20
- assert_resolution,
21
- cleanup_memory,
22
- denoise_audio_video,
23
- euler_denoising_loop,
24
- generate_enhanced_prompt,
25
- get_device,
26
- guider_denoising_func,
27
- image_conditionings_by_replacing_latent,
28
- )
29
- from ltx_pipelines.utils.media_io import encode_video
30
- from ltx_pipelines.utils.types import PipelineComponents
31
-
32
- device = get_device()
33
-
34
-
35
- class TI2VidOneStagePipeline:
36
- """
37
- Single-stage text/image-to-video generation pipeline.
38
- Generates video at the target resolution in a single diffusion pass with
39
- classifier-free guidance (CFG). Supports optional image conditioning via
40
- the images parameter.
41
- """
42
-
43
- def __init__(
44
- self,
45
- checkpoint_path: str,
46
- gemma_root: str,
47
- loras: list[LoraPathStrengthAndSDOps],
48
- device: torch.device = device,
49
- fp8transformer: bool = False,
50
- ):
51
- self.dtype = torch.bfloat16
52
- self.device = device
53
- self.model_ledger = ModelLedger(
54
- dtype=self.dtype,
55
- device=device,
56
- checkpoint_path=checkpoint_path,
57
- gemma_root_path=gemma_root,
58
- loras=loras,
59
- fp8transformer=fp8transformer,
60
- )
61
- self.pipeline_components = PipelineComponents(
62
- dtype=self.dtype,
63
- device=device,
64
- )
65
-
66
- def __call__( # noqa: PLR0913
67
- self,
68
- prompt: str,
69
- negative_prompt: str,
70
- seed: int,
71
- height: int,
72
- width: int,
73
- num_frames: int,
74
- frame_rate: float,
75
- num_inference_steps: int,
76
- cfg_guidance_scale: float,
77
- images: list[tuple[str, int, float]],
78
- enhance_prompt: bool = False,
79
- ) -> tuple[Iterator[torch.Tensor], torch.Tensor]:
80
- assert_resolution(height=height, width=width, is_two_stage=False)
81
-
82
- generator = torch.Generator(device=self.device).manual_seed(seed)
83
- noiser = GaussianNoiser(generator=generator)
84
- stepper = EulerDiffusionStep()
85
- cfg_guider = CFGGuider(cfg_guidance_scale)
86
- dtype = torch.bfloat16
87
-
88
- text_encoder = self.model_ledger.text_encoder()
89
- if enhance_prompt:
90
- prompt = generate_enhanced_prompt(
91
- text_encoder, prompt, images[0][0] if len(images) > 0 else None, seed=seed
92
- )
93
- context_p, context_n = encode_text(text_encoder, prompts=[prompt, negative_prompt])
94
- v_context_p, a_context_p = context_p
95
- v_context_n, a_context_n = context_n
96
-
97
- torch.cuda.synchronize()
98
- del text_encoder
99
- cleanup_memory()
100
-
101
- # Stage 1: Initial low resolution video generation.
102
- video_encoder = self.model_ledger.video_encoder()
103
- transformer = self.model_ledger.transformer()
104
- sigmas = LTX2Scheduler().execute(steps=num_inference_steps).to(dtype=torch.float32, device=self.device)
105
-
106
- def first_stage_denoising_loop(
107
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
108
- ) -> tuple[LatentState, LatentState]:
109
- return euler_denoising_loop(
110
- sigmas=sigmas,
111
- video_state=video_state,
112
- audio_state=audio_state,
113
- stepper=stepper,
114
- denoise_fn=guider_denoising_func(
115
- cfg_guider,
116
- v_context_p,
117
- v_context_n,
118
- a_context_p,
119
- a_context_n,
120
- transformer=transformer, # noqa: F821
121
- ),
122
- )
123
-
124
- stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
125
- stage_1_conditionings = image_conditionings_by_replacing_latent(
126
- images=images,
127
- height=stage_1_output_shape.height,
128
- width=stage_1_output_shape.width,
129
- video_encoder=video_encoder,
130
- dtype=dtype,
131
- device=self.device,
132
- )
133
-
134
- video_state, audio_state = denoise_audio_video(
135
- output_shape=stage_1_output_shape,
136
- conditionings=stage_1_conditionings,
137
- noiser=noiser,
138
- sigmas=sigmas,
139
- stepper=stepper,
140
- denoising_loop_fn=first_stage_denoising_loop,
141
- components=self.pipeline_components,
142
- dtype=dtype,
143
- device=self.device,
144
- )
145
-
146
- torch.cuda.synchronize()
147
- del transformer
148
- cleanup_memory()
149
-
150
- decoded_video = vae_decode_video(video_state.latent, self.model_ledger.video_decoder())
151
- decoded_audio = vae_decode_audio(
152
- audio_state.latent, self.model_ledger.audio_decoder(), self.model_ledger.vocoder()
153
- )
154
-
155
- return decoded_video, decoded_audio
156
-
157
-
158
- @torch.inference_mode()
159
- def main() -> None:
160
- logging.getLogger().setLevel(logging.INFO)
161
- parser = default_1_stage_arg_parser()
162
- args = parser.parse_args()
163
- pipeline = TI2VidOneStagePipeline(
164
- checkpoint_path=args.checkpoint_path,
165
- gemma_root=args.gemma_root,
166
- loras=args.lora,
167
- fp8transformer=args.enable_fp8,
168
- )
169
- video, audio = pipeline(
170
- prompt=args.prompt,
171
- negative_prompt=args.negative_prompt,
172
- seed=args.seed,
173
- height=args.height,
174
- width=args.width,
175
- num_frames=args.num_frames,
176
- frame_rate=args.frame_rate,
177
- num_inference_steps=args.num_inference_steps,
178
- cfg_guidance_scale=args.cfg_guidance_scale,
179
- images=args.images,
180
- )
181
-
182
- encode_video(
183
- video=video,
184
- fps=args.frame_rate,
185
- audio=audio,
186
- audio_sample_rate=AUDIO_SAMPLE_RATE,
187
- output_path=args.output_path,
188
- video_chunks_number=1,
189
- )
190
-
191
-
192
- if __name__ == "__main__":
193
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/ti2vid_two_stages.py DELETED
@@ -1,276 +0,0 @@
1
- import logging
2
- from collections.abc import Iterator
3
-
4
- import torch
5
-
6
- from ltx_core.components.diffusion_steps import EulerDiffusionStep
7
- from ltx_core.components.guiders import CFGGuider
8
- from ltx_core.components.noisers import GaussianNoiser
9
- from ltx_core.components.protocols import DiffusionStepProtocol
10
- from ltx_core.components.schedulers import LTX2Scheduler
11
- from ltx_core.loader import LoraPathStrengthAndSDOps
12
- from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
13
- from ltx_core.model.upsampler import upsample_video
14
- from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
15
- from ltx_core.model.video_vae import decode_video as vae_decode_video
16
- from ltx_core.text_encoders.gemma import encode_text
17
- from ltx_core.types import LatentState, VideoPixelShape
18
- from ltx_pipelines.utils import ModelLedger
19
- from ltx_pipelines.utils.args import default_2_stage_arg_parser
20
- from ltx_pipelines.utils.constants import (
21
- AUDIO_SAMPLE_RATE,
22
- STAGE_2_DISTILLED_SIGMA_VALUES,
23
- )
24
- from ltx_pipelines.utils.helpers import (
25
- assert_resolution,
26
- cleanup_memory,
27
- denoise_audio_video,
28
- euler_denoising_loop,
29
- generate_enhanced_prompt,
30
- get_device,
31
- guider_denoising_func,
32
- image_conditionings_by_replacing_latent,
33
- simple_denoising_func,
34
- )
35
- from ltx_pipelines.utils.media_io import encode_video
36
- from ltx_pipelines.utils.types import PipelineComponents
37
-
38
- device = get_device()
39
-
40
-
41
- class TI2VidTwoStagesPipeline:
42
- """
43
- Two-stage text/image-to-video generation pipeline.
44
- Stage 1 generates video at the target resolution with CFG guidance, then
45
- Stage 2 upsamples by 2x and refines using a distilled LoRA for higher
46
- quality output. Supports optional image conditioning via the images parameter.
47
- """
48
-
49
- def __init__(
50
- self,
51
- checkpoint_path: str,
52
- distilled_lora: list[LoraPathStrengthAndSDOps],
53
- spatial_upsampler_path: str,
54
- gemma_root: str,
55
- loras: list[LoraPathStrengthAndSDOps],
56
- device: str = device,
57
- fp8transformer: bool = False,
58
- ):
59
- self.device = device
60
- self.dtype = torch.bfloat16
61
- self.stage_1_model_ledger = ModelLedger(
62
- dtype=self.dtype,
63
- device=device,
64
- checkpoint_path=checkpoint_path,
65
- gemma_root_path=gemma_root,
66
- spatial_upsampler_path=spatial_upsampler_path,
67
- loras=loras,
68
- fp8transformer=fp8transformer,
69
- )
70
-
71
- self.stage_2_model_ledger = self.stage_1_model_ledger.with_loras(
72
- loras=distilled_lora,
73
- )
74
-
75
- self.pipeline_components = PipelineComponents(
76
- dtype=self.dtype,
77
- device=device,
78
- )
79
-
80
- @torch.inference_mode()
81
- def __call__( # noqa: PLR0913
82
- self,
83
- prompt: str,
84
- negative_prompt: str,
85
- seed: int,
86
- height: int,
87
- width: int,
88
- num_frames: int,
89
- frame_rate: float,
90
- num_inference_steps: int,
91
- cfg_guidance_scale: float,
92
- images: list[tuple[str, int, float]],
93
- tiling_config: TilingConfig | None = None,
94
- enhance_prompt: bool = False,
95
- ) -> tuple[Iterator[torch.Tensor], torch.Tensor]:
96
- assert_resolution(height=height, width=width, is_two_stage=True)
97
-
98
- generator = torch.Generator(device=self.device).manual_seed(seed)
99
- noiser = GaussianNoiser(generator=generator)
100
- stepper = EulerDiffusionStep()
101
- cfg_guider = CFGGuider(cfg_guidance_scale)
102
- dtype = torch.bfloat16
103
-
104
- text_encoder = self.stage_1_model_ledger.text_encoder()
105
- if enhance_prompt:
106
- prompt = generate_enhanced_prompt(
107
- text_encoder, prompt, images[0][0] if len(images) > 0 else None, seed=seed
108
- )
109
- context_p, context_n = encode_text(text_encoder, prompts=[prompt, negative_prompt])
110
- v_context_p, a_context_p = context_p
111
- v_context_n, a_context_n = context_n
112
-
113
- torch.cuda.synchronize()
114
- del text_encoder
115
- cleanup_memory()
116
-
117
- # Stage 1: Initial low resolution video generation.
118
- video_encoder = self.stage_1_model_ledger.video_encoder()
119
- transformer = self.stage_1_model_ledger.transformer()
120
- sigmas = LTX2Scheduler().execute(steps=num_inference_steps).to(dtype=torch.float32, device=self.device)
121
-
122
- def first_stage_denoising_loop(
123
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
124
- ) -> tuple[LatentState, LatentState]:
125
- return euler_denoising_loop(
126
- sigmas=sigmas,
127
- video_state=video_state,
128
- audio_state=audio_state,
129
- stepper=stepper,
130
- denoise_fn=guider_denoising_func(
131
- cfg_guider,
132
- v_context_p,
133
- v_context_n,
134
- a_context_p,
135
- a_context_n,
136
- transformer=transformer, # noqa: F821
137
- ),
138
- )
139
-
140
- stage_1_output_shape = VideoPixelShape(
141
- batch=1,
142
- frames=num_frames,
143
- width=width // 2,
144
- height=height // 2,
145
- fps=frame_rate,
146
- )
147
- stage_1_conditionings = image_conditionings_by_replacing_latent(
148
- images=images,
149
- height=stage_1_output_shape.height,
150
- width=stage_1_output_shape.width,
151
- video_encoder=video_encoder,
152
- dtype=dtype,
153
- device=self.device,
154
- )
155
- video_state, audio_state = denoise_audio_video(
156
- output_shape=stage_1_output_shape,
157
- conditionings=stage_1_conditionings,
158
- noiser=noiser,
159
- sigmas=sigmas,
160
- stepper=stepper,
161
- denoising_loop_fn=first_stage_denoising_loop,
162
- components=self.pipeline_components,
163
- dtype=dtype,
164
- device=self.device,
165
- )
166
-
167
- torch.cuda.synchronize()
168
- del transformer
169
- cleanup_memory()
170
-
171
- # Stage 2: Upsample and refine the video at higher resolution with distilled LORA.
172
- upscaled_video_latent = upsample_video(
173
- latent=video_state.latent[:1],
174
- video_encoder=video_encoder,
175
- upsampler=self.stage_2_model_ledger.spatial_upsampler(),
176
- )
177
-
178
- torch.cuda.synchronize()
179
- cleanup_memory()
180
-
181
- transformer = self.stage_2_model_ledger.transformer()
182
- distilled_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
183
-
184
- def second_stage_denoising_loop(
185
- sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
186
- ) -> tuple[LatentState, LatentState]:
187
- return euler_denoising_loop(
188
- sigmas=sigmas,
189
- video_state=video_state,
190
- audio_state=audio_state,
191
- stepper=stepper,
192
- denoise_fn=simple_denoising_func(
193
- video_context=v_context_p,
194
- audio_context=a_context_p,
195
- transformer=transformer, # noqa: F821
196
- ),
197
- )
198
-
199
- stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
200
- stage_2_conditionings = image_conditionings_by_replacing_latent(
201
- images=images,
202
- height=stage_2_output_shape.height,
203
- width=stage_2_output_shape.width,
204
- video_encoder=video_encoder,
205
- dtype=dtype,
206
- device=self.device,
207
- )
208
- video_state, audio_state = denoise_audio_video(
209
- output_shape=stage_2_output_shape,
210
- conditionings=stage_2_conditionings,
211
- noiser=noiser,
212
- sigmas=distilled_sigmas,
213
- stepper=stepper,
214
- denoising_loop_fn=second_stage_denoising_loop,
215
- components=self.pipeline_components,
216
- dtype=dtype,
217
- device=self.device,
218
- noise_scale=distilled_sigmas[0],
219
- initial_video_latent=upscaled_video_latent,
220
- initial_audio_latent=audio_state.latent,
221
- )
222
-
223
- torch.cuda.synchronize()
224
- del transformer
225
- del video_encoder
226
- cleanup_memory()
227
-
228
- decoded_video = vae_decode_video(video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config)
229
- decoded_audio = vae_decode_audio(
230
- audio_state.latent, self.stage_2_model_ledger.audio_decoder(), self.stage_2_model_ledger.vocoder()
231
- )
232
-
233
- return decoded_video, decoded_audio
234
-
235
-
236
- @torch.inference_mode()
237
- def main() -> None:
238
- logging.getLogger().setLevel(logging.INFO)
239
- parser = default_2_stage_arg_parser()
240
- args = parser.parse_args()
241
- pipeline = TI2VidTwoStagesPipeline(
242
- checkpoint_path=args.checkpoint_path,
243
- distilled_lora=args.distilled_lora,
244
- spatial_upsampler_path=args.spatial_upsampler_path,
245
- gemma_root=args.gemma_root,
246
- loras=args.lora,
247
- fp8transformer=args.enable_fp8,
248
- )
249
- tiling_config = TilingConfig.default()
250
- video_chunks_number = get_video_chunks_number(args.num_frames, tiling_config)
251
- video, audio = pipeline(
252
- prompt=args.prompt,
253
- negative_prompt=args.negative_prompt,
254
- seed=args.seed,
255
- height=args.height,
256
- width=args.width,
257
- num_frames=args.num_frames,
258
- frame_rate=args.frame_rate,
259
- num_inference_steps=args.num_inference_steps,
260
- cfg_guidance_scale=args.cfg_guidance_scale,
261
- images=args.images,
262
- tiling_config=tiling_config,
263
- )
264
-
265
- encode_video(
266
- video=video,
267
- fps=args.frame_rate,
268
- audio=audio,
269
- audio_sample_rate=AUDIO_SAMPLE_RATE,
270
- output_path=args.output_path,
271
- video_chunks_number=video_chunks_number,
272
- )
273
-
274
-
275
- if __name__ == "__main__":
276
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/args.py DELETED
@@ -1,277 +0,0 @@
1
- import argparse
2
- from pathlib import Path
3
-
4
- from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
5
- from ltx_pipelines.utils.constants import (
6
- DEFAULT_1_STAGE_HEIGHT,
7
- DEFAULT_1_STAGE_WIDTH,
8
- DEFAULT_2_STAGE_HEIGHT,
9
- DEFAULT_2_STAGE_WIDTH,
10
- DEFAULT_CFG_GUIDANCE_SCALE,
11
- DEFAULT_FRAME_RATE,
12
- DEFAULT_LORA_STRENGTH,
13
- DEFAULT_NEGATIVE_PROMPT,
14
- DEFAULT_NUM_FRAMES,
15
- DEFAULT_NUM_INFERENCE_STEPS,
16
- DEFAULT_SEED,
17
- )
18
-
19
-
20
class VideoConditioningAction(argparse.Action):
    """argparse action that accumulates (resolved_path, strength) pairs on the namespace."""

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,  # noqa: ARG002
    ) -> None:
        raw_path, raw_strength = values
        entries = getattr(namespace, self.dest) or []
        entries.append((resolve_path(raw_path), float(raw_strength)))
        setattr(namespace, self.dest, entries)
34
-
35
-
36
class ImageAction(argparse.Action):
    """argparse action that accumulates (resolved_path, frame_idx, strength) triples on the namespace."""

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,  # noqa: ARG002
    ) -> None:
        raw_path, raw_frame, raw_strength = values
        entries = getattr(namespace, self.dest) or []
        entries.append((resolve_path(raw_path), int(raw_frame), float(raw_strength)))
        setattr(namespace, self.dest, entries)
51
-
52
-
53
class LoraAction(argparse.Action):
    """argparse action collecting LoRA specs as LoraPathStrengthAndSDOps entries.

    Accepts PATH and an optional STRENGTH per use; strength falls back to
    DEFAULT_LORA_STRENGTH when omitted.
    """

    def __call__(
        self,
        parser: argparse.ArgumentParser,  # noqa: ARG002
        namespace: argparse.Namespace,
        values: list[str],
        option_string: str | None = None,
    ) -> None:
        if len(values) > 2:
            msg = f"{option_string} accepts at most 2 arguments (PATH and optional STRENGTH), got {len(values)} values"
            raise argparse.ArgumentError(self, msg)

        raw_path = values[0]
        raw_strength = values[1] if len(values) > 1 else str(DEFAULT_LORA_STRENGTH)

        entries = getattr(namespace, self.dest) or []
        entries.append(
            LoraPathStrengthAndSDOps(resolve_path(raw_path), float(raw_strength), LTXV_LORA_COMFY_RENAMING_MAP)
        )
        setattr(namespace, self.dest, entries)
74
-
75
-
76
def resolve_path(path: str) -> str:
    """Expand ``~``, resolve the path absolutely, and return it in POSIX form."""
    expanded = Path(path).expanduser()
    return expanded.resolve().as_posix()
78
-
79
-
80
def basic_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser shared by every pipeline CLI.

    Covers model/encoder paths, prompt, output location, sampling parameters,
    and repeatable image/LoRA conditioning options.
    """
    p = argparse.ArgumentParser()
    p.add_argument(
        "--checkpoint-path",
        type=resolve_path,
        required=True,
        help="Path to LTX-2 model checkpoint (.safetensors file).",
    )
    p.add_argument(
        "--gemma-root",
        type=resolve_path,
        required=True,
        help="Path to the root directory containing the Gemma text encoder model files.",
    )
    p.add_argument(
        "--prompt",
        type=str,
        required=True,
        help="Text prompt describing the desired video content to be generated by the model.",
    )
    p.add_argument(
        "--output-path",
        type=resolve_path,
        required=True,
        help="Path to the output video file (MP4 format).",
    )
    p.add_argument(
        "--seed",
        type=int,
        default=DEFAULT_SEED,
        help=(
            f"Random seed value used to initialize the noise tensor for "
            f"reproducible generation (default: {DEFAULT_SEED})."
        ),
    )
    p.add_argument(
        "--height",
        type=int,
        default=DEFAULT_1_STAGE_HEIGHT,
        help=f"Height of the generated video in pixels, should be divisible by 32 (default: {DEFAULT_1_STAGE_HEIGHT}).",
    )
    p.add_argument(
        "--width",
        type=int,
        default=DEFAULT_1_STAGE_WIDTH,
        help=f"Width of the generated video in pixels, should be divisible by 32 (default: {DEFAULT_1_STAGE_WIDTH}).",
    )
    p.add_argument(
        "--num-frames",
        type=int,
        default=DEFAULT_NUM_FRAMES,
        help=f"Number of frames to generate in the output video sequence, num-frames = (8 x K) + 1, "
        f"where k is a non-negative integer (default: {DEFAULT_NUM_FRAMES}).",
    )
    p.add_argument(
        "--frame-rate",
        type=float,
        default=DEFAULT_FRAME_RATE,
        help=f"Frame rate of the generated video (fps) (default: {DEFAULT_FRAME_RATE}).",
    )
    p.add_argument(
        "--num-inference-steps",
        type=int,
        default=DEFAULT_NUM_INFERENCE_STEPS,
        help=(
            f"Number of denoising steps in the diffusion sampling process. "
            f"Higher values improve quality but increase generation time (default: {DEFAULT_NUM_INFERENCE_STEPS})."
        ),
    )
    p.add_argument(
        "--image",
        dest="images",
        action=ImageAction,
        nargs=3,
        metavar=("PATH", "FRAME_IDX", "STRENGTH"),
        default=[],
        help=(
            "Image conditioning input: path to image file, target frame index, "
            "and conditioning strength (all three required). Default: empty list [] (no image conditioning). "
            "Can be specified multiple times. Example: --image path/to/image1.jpg 0 0.8 "
            "--image path/to/image2.jpg 160 0.9"
        ),
    )
    p.add_argument(
        "--lora",
        dest="lora",
        # nargs="+" accepts 1-2 values (path + optional strength); LoraAction enforces the upper bound.
        action=LoraAction,
        nargs="+",
        metavar=("PATH", "STRENGTH"),
        default=[],
        help=(
            "LoRA (Low-Rank Adaptation) model: path to model file and optional strength "
            f"(default strength: {DEFAULT_LORA_STRENGTH}). Can be specified multiple times. "
            "Example: --lora path/to/lora1.safetensors 0.8 --lora path/to/lora2.safetensors"
        ),
    )
    p.add_argument(
        "--enable-fp8",
        action="store_true",
        help="Enable FP8 mode to reduce memory footprint by keeping model in lower precision. "
        "Note that calculations are still performed in bfloat16 precision.",
    )
    p.add_argument("--enhance-prompt", action="store_true")
    return p
184
-
185
-
186
def default_1_stage_arg_parser() -> argparse.ArgumentParser:
    """Extend the basic parser with CFG guidance scale and negative prompt for single-stage generation."""
    p = basic_arg_parser()
    p.add_argument(
        "--cfg-guidance-scale",
        type=float,
        default=DEFAULT_CFG_GUIDANCE_SCALE,
        help=(
            f"Classifier-free guidance (CFG) scale controlling how strongly "
            f"the model adheres to the prompt. Higher values increase prompt "
            f"adherence but may reduce diversity (default: {DEFAULT_CFG_GUIDANCE_SCALE})."
        ),
    )
    p.add_argument(
        "--negative-prompt",
        type=str,
        default=DEFAULT_NEGATIVE_PROMPT,
        help=(
            "Negative prompt describing what should not appear in the generated video, "
            "used to guide the diffusion process away from unwanted content. "
            "Default: a comprehensive negative prompt covering common artifacts and quality issues."
        ),
    )
    return p
210
-
211
-
212
def default_2_stage_arg_parser() -> argparse.ArgumentParser:
    """Two-stage parser: doubled default resolution plus distilled-LoRA and spatial-upsampler options."""
    p = default_1_stage_arg_parser()
    p.set_defaults(height=DEFAULT_2_STAGE_HEIGHT, width=DEFAULT_2_STAGE_WIDTH)

    # Rewrite the height/width help strings so they reflect the two-stage defaults.
    for action in p._actions:  # noqa: SLF001
        if "--height" in action.option_strings:
            action.help = (
                f"Height of the generated video in pixels, should be divisible by 64 "
                f"(default: {DEFAULT_2_STAGE_HEIGHT})."
            )
        if "--width" in action.option_strings:
            action.help = (
                f"Width of the generated video in pixels, should be divisible by 64 (default: {DEFAULT_2_STAGE_WIDTH})."
            )

    p.add_argument(
        "--distilled-lora",
        dest="distilled_lora",
        # nargs="+" accepts 1-2 values (path + optional strength); LoraAction enforces the upper bound.
        action=LoraAction,
        nargs="+",
        metavar=("PATH", "STRENGTH"),
        required=True,
        help=(
            "Distilled LoRA (Low-Rank Adaptation) model used in the second stage (upscaling and refinement): "
            f"path to model file and optional strength (default strength: {DEFAULT_LORA_STRENGTH}). "
            "The second stage upsamples the video by 2x resolution and refines it using a distilled "
            "denoising schedule (fewer steps, no CFG). The distilled LoRA is specifically trained "
            "for this refinement process to improve quality at higher resolutions. "
            "Example: --distilled-lora path/to/distilled_lora.safetensors 0.8"
        ),
    )
    p.add_argument(
        "--spatial-upsampler-path",
        type=resolve_path,
        required=True,
        help=(
            "Path to the spatial upsampler model used to increase the resolution "
            "of the generated video in the latent space."
        ),
    )
    return p
252
-
253
-
254
def default_2_stage_distilled_arg_parser() -> argparse.ArgumentParser:
    """Distilled two-stage parser: basic options with doubled default resolution and a spatial upsampler."""
    p = basic_arg_parser()
    p.set_defaults(height=DEFAULT_2_STAGE_HEIGHT, width=DEFAULT_2_STAGE_WIDTH)

    # Rewrite the height/width help strings so they reflect the two-stage defaults.
    for action in p._actions:  # noqa: SLF001
        if "--height" in action.option_strings:
            action.help = (
                f"Height of the generated video in pixels, should be divisible by 64 "
                f"(default: {DEFAULT_2_STAGE_HEIGHT})."
            )
        if "--width" in action.option_strings:
            action.help = (
                f"Width of the generated video in pixels, should be divisible by 64 (default: {DEFAULT_2_STAGE_WIDTH})."
            )

    p.add_argument(
        "--spatial-upsampler-path",
        type=resolve_path,
        required=True,
        help=(
            "Path to the spatial upsampler model used to increase the resolution "
            "of the generated video in the latent space."
        ),
    )
    return p
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/constants.py DELETED
@@ -1,77 +0,0 @@
1
"""Shared constants for the LTX pipelines: diffusion schedules, generation defaults, VAE geometry, prompts."""

from ltx_core.types import SpatioTemporalScaleFactors

# ---------------------------------------------------------------------------
# Diffusion schedules
# ---------------------------------------------------------------------------
# Sigma (noise-level) schedule for the distilled pipeline, tuned to match the
# distillation process.
DISTILLED_SIGMA_VALUES = [1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0]

# Tail of the distilled schedule used by the stage-2 super-resolution pass.
STAGE_2_DISTILLED_SIGMA_VALUES = [0.909375, 0.725, 0.421875, 0.0]

# ---------------------------------------------------------------------------
# Video generation defaults
# ---------------------------------------------------------------------------
DEFAULT_SEED = 10
DEFAULT_1_STAGE_HEIGHT = 512
DEFAULT_1_STAGE_WIDTH = 768
# Stage 2 upsamples the stage-1 output by 2x in each spatial dimension.
DEFAULT_2_STAGE_HEIGHT = DEFAULT_1_STAGE_HEIGHT * 2
DEFAULT_2_STAGE_WIDTH = DEFAULT_1_STAGE_WIDTH * 2
DEFAULT_NUM_FRAMES = 121
DEFAULT_FRAME_RATE = 24.0
DEFAULT_NUM_INFERENCE_STEPS = 40
DEFAULT_CFG_GUIDANCE_SCALE = 4.0

# ---------------------------------------------------------------------------
# Audio
# ---------------------------------------------------------------------------
AUDIO_SAMPLE_RATE = 24000

# ---------------------------------------------------------------------------
# LoRA
# ---------------------------------------------------------------------------
DEFAULT_LORA_STRENGTH = 1.0

# ---------------------------------------------------------------------------
# Video VAE architecture
# ---------------------------------------------------------------------------
VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
VIDEO_LATENT_CHANNELS = 128

# ---------------------------------------------------------------------------
# Image preprocessing
# ---------------------------------------------------------------------------
# CRF (Constant Rate Factor) for H.264 encoding used in image conditioning.
# Lower = higher quality, 0 = lossless. This mimics compression artifacts.
DEFAULT_IMAGE_CRF = 33

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
DEFAULT_NEGATIVE_PROMPT = (
    "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
    "grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
    "deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
    "wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
    "field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
    "lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
    "valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
    "mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
    "off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
    "pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
    "inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/helpers.py DELETED
@@ -1,507 +0,0 @@
1
- import gc
2
- import logging
3
- from dataclasses import replace
4
-
5
- import torch
6
- from tqdm import tqdm
7
-
8
- from ltx_core.components.noisers import Noiser
9
- from ltx_core.components.protocols import DiffusionStepProtocol, GuiderProtocol
10
- from ltx_core.conditioning import (
11
- ConditioningItem,
12
- VideoConditionByKeyframeIndex,
13
- VideoConditionByLatentIndex,
14
- )
15
- from ltx_core.model.transformer import Modality, X0Model
16
- from ltx_core.model.video_vae import VideoEncoder
17
- from ltx_core.text_encoders.gemma import GemmaTextEncoderModelBase
18
- from ltx_core.tools import AudioLatentTools, LatentTools, VideoLatentTools
19
- from ltx_core.types import AudioLatentShape, LatentState, VideoLatentShape, VideoPixelShape
20
- from ltx_core.utils import to_denoised, to_velocity
21
- from ltx_pipelines.utils.media_io import decode_image, load_image_conditioning, resize_aspect_ratio_preserving
22
- from ltx_pipelines.utils.types import (
23
- DenoisingFunc,
24
- DenoisingLoopFunc,
25
- PipelineComponents,
26
- )
27
-
28
-
29
def get_device() -> torch.device:
    """Return the preferred compute device: CUDA when available, otherwise CPU."""
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
33
-
34
-
35
def cleanup_memory() -> None:
    """Release Python garbage and, when a CUDA device is present, the CUDA cache.

    Fix: the CUDA calls are now guarded by ``torch.cuda.is_available()`` —
    the original unconditional ``empty_cache``/``synchronize`` raise on
    CPU-only hosts (no CUDA driver/device to synchronize).
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        # Block until queued kernels finish so freed memory is actually reclaimed.
        torch.cuda.synchronize()
39
-
40
-
41
def image_conditionings_by_replacing_latent(
    images: list[tuple[str, int, float]],
    height: int,
    width: int,
    video_encoder: VideoEncoder,
    dtype: torch.dtype,
    device: torch.device,
) -> list[ConditioningItem]:
    """Encode each image and wrap it as a latent-replacement conditioning item.

    Each entry of ``images`` is ``(path, latent_idx, strength)``: the image is
    loaded at the target resolution, encoded with ``video_encoder``, and used to
    replace the latent at that index with the given strength.
    """
    items: list[ConditioningItem] = []
    for image_path, frame_idx, strength in images:
        pixels = load_image_conditioning(
            image_path=image_path,
            height=height,
            width=width,
            dtype=dtype,
            device=device,
        )
        items.append(
            VideoConditionByLatentIndex(
                latent=video_encoder(pixels),
                strength=strength,
                latent_idx=frame_idx,
            )
        )
    return items
68
-
69
-
70
def image_conditionings_by_adding_guiding_latent(
    images: list[tuple[str, int, float]],
    height: int,
    width: int,
    video_encoder: VideoEncoder,
    dtype: torch.dtype,
    device: torch.device,
) -> list[ConditioningItem]:
    """Encode each image and wrap it as a keyframe-guidance conditioning item.

    Each entry of ``images`` is ``(path, frame_idx, strength)``: the image is
    loaded at the target resolution, encoded with ``video_encoder``, and added
    as a guiding keyframe latent at that frame index.
    """
    items: list[ConditioningItem] = []
    for image_path, frame_idx, strength in images:
        pixels = load_image_conditioning(
            image_path=image_path,
            height=height,
            width=width,
            dtype=dtype,
            device=device,
        )
        items.append(
            VideoConditionByKeyframeIndex(keyframes=video_encoder(pixels), frame_idx=frame_idx, strength=strength)
        )
    return items
92
-
93
-
94
def euler_denoising_loop(
    sigmas: torch.Tensor,
    video_state: LatentState,
    audio_state: LatentState,
    stepper: DiffusionStepProtocol,
    denoise_fn: DenoisingFunc,
) -> tuple[LatentState, LatentState]:
    """Run the joint audio-video Euler sampling loop over ``sigmas``.

    For every step (all sigmas except the final one), ``denoise_fn`` is called as
    ``denoise_fn(video_state, audio_state, sigmas, step)`` and must return the
    denoised ``(video, audio)`` estimates. Each estimate is blended with the
    state's clean latent through its denoise mask, then ``stepper.step`` advances
    the noisy latent one step along the schedule.

    Returns the final ``(video_state, audio_state)`` pair.
    """
    last_step = len(sigmas) - 1
    for step in tqdm(range(last_step)):
        video_hat, audio_hat = denoise_fn(video_state, audio_state, sigmas, step)

        # Keep conditioned (mask == 0) regions pinned to their clean latents.
        video_hat = post_process_latent(video_hat, video_state.denoise_mask, video_state.clean_latent)
        audio_hat = post_process_latent(audio_hat, audio_state.denoise_mask, audio_state.clean_latent)

        video_state = replace(video_state, latent=stepper.step(video_state.latent, video_hat, sigmas, step))
        audio_state = replace(audio_state, latent=stepper.step(audio_state.latent, audio_hat, sigmas, step))

    return video_state, audio_state
142
-
143
-
144
def gradient_estimating_euler_denoising_loop(
    sigmas: torch.Tensor,
    video_state: LatentState,
    audio_state: LatentState,
    stepper: DiffusionStepProtocol,
    denoise_fn: DenoisingFunc,
    ge_gamma: float = 2.0,
) -> tuple[LatentState, LatentState]:
    """Joint audio-video denoising loop with gradient-estimation correction.

    Same contract as :func:`euler_denoising_loop`, but the denoised estimate is
    corrected by extrapolating the change in velocity between consecutive steps,
    scaled by ``ge_gamma`` (default 2.0).
    Paper: https://openreview.net/pdf?id=o2ND9v0CeK
    """
    prev_video_velocity: torch.Tensor | None = None
    prev_audio_velocity: torch.Tensor | None = None

    def _estimate(
        noisy: torch.Tensor, denoised: torch.Tensor, sigma: float, prev_velocity: torch.Tensor | None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Compute the current velocity; with a previous velocity available,
        # extrapolate it and re-derive the denoised sample from the correction.
        velocity = to_velocity(noisy, sigma, denoised)
        if prev_velocity is not None:
            corrected = ge_gamma * (velocity - prev_velocity) + prev_velocity
            denoised = to_denoised(noisy, corrected, sigma)
        return velocity, denoised

    for step, _ in enumerate(tqdm(sigmas[:-1])):
        video_hat, audio_hat = denoise_fn(video_state, audio_state, sigmas, step)

        video_hat = post_process_latent(video_hat, video_state.denoise_mask, video_state.clean_latent)
        audio_hat = post_process_latent(audio_hat, audio_state.denoise_mask, audio_state.clean_latent)

        # At the final (sigma -> 0) step, the denoised estimates are the result.
        if sigmas[step + 1] == 0:
            return replace(video_state, latent=video_hat), replace(audio_state, latent=audio_hat)

        prev_video_velocity, video_hat = _estimate(video_state.latent, video_hat, sigmas[step], prev_video_velocity)
        prev_audio_velocity, audio_hat = _estimate(audio_state.latent, audio_hat, sigmas[step], prev_audio_velocity)

        video_state = replace(video_state, latent=stepper.step(video_state.latent, video_hat, sigmas, step))
        audio_state = replace(audio_state, latent=stepper.step(audio_state.latent, audio_hat, sigmas, step))

    return video_state, audio_state
202
-
203
-
204
def noise_video_state(
    output_shape: VideoPixelShape,
    noiser: Noiser,
    conditionings: list[ConditioningItem],
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
) -> tuple[LatentState, VideoLatentTools]:
    """Build video latent tools for ``output_shape`` and return a noised initial state.

    Conditionings are applied before noising. ``initial_latent``, when given,
    seeds the state; otherwise an empty initial state is created.
    """
    latent_shape = VideoLatentShape.from_pixel_shape(
        shape=output_shape,
        latent_channels=components.video_latent_channels,
        scale_factors=components.video_scale_factors,
    )
    tools = VideoLatentTools(components.video_patchifier, latent_shape, output_shape.fps)
    state = create_noised_state(
        tools=tools,
        conditionings=conditionings,
        noiser=noiser,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_latent,
    )
    return state, tools
237
-
238
-
239
def noise_audio_state(
    output_shape: VideoPixelShape,
    noiser: Noiser,
    conditionings: list[ConditioningItem],
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
    denoise_mask: torch.Tensor | None = None,
) -> tuple[LatentState, AudioLatentTools]:
    """Build audio latent tools for ``output_shape`` and return a noised initial state.

    Conditionings are applied before noising. ``initial_latent``, when given,
    seeds the state; ``denoise_mask`` is forwarded to
    :func:`create_noised_state` to pin masked-out regions to clean values.
    """
    latent_shape = AudioLatentShape.from_video_pixel_shape(output_shape)
    tools = AudioLatentTools(components.audio_patchifier, latent_shape)
    state = create_noised_state(
        tools=tools,
        conditionings=conditionings,
        noiser=noiser,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_latent,
        denoise_mask=denoise_mask,
    )
    return state, tools
270
-
271
-
272
def create_noised_state(
    tools: LatentTools,
    conditionings: list[ConditioningItem],
    noiser: Noiser,
    dtype: torch.dtype,
    device: torch.device,
    noise_scale: float = 1.0,
    initial_latent: torch.Tensor | None = None,
    denoise_mask: torch.Tensor | None = None,
) -> LatentState:
    """Create, condition and noise a latent state.

    When ``denoise_mask`` is provided it is collapsed to one scalar (a tensor
    mask contributes its mean — solid-mask behavior), the pre-noise latent is
    kept as ``clean_latent``, and after noising the latent is re-blended so
    masked-out regions retain their clean values.
    """
    state = tools.create_initial_state(device, dtype, initial_latent)
    state = state_with_conditionings(state, conditionings, tools)

    if denoise_mask is not None:
        # Collapse any tensor mask to a single scalar (solid-mask behavior).
        if isinstance(denoise_mask, torch.Tensor):
            scalar = float(denoise_mask.mean().item())
        else:
            scalar = float(denoise_mask)
        state = replace(
            state,
            clean_latent=state.latent.clone(),
            denoise_mask=torch.full_like(state.denoise_mask, scalar),
        )

    state = noiser(state, noise_scale)

    if denoise_mask is not None:
        # Re-blend: noisy where the mask is 1, clean where it is 0.
        mask = state.denoise_mask.to(dtype=state.latent.dtype, device=state.latent.device)
        clean = state.clean_latent.to(dtype=state.latent.dtype, device=state.latent.device)
        state = replace(state, latent=state.latent * mask + clean * (1 - mask))

    return state
306
-
307
-
308
-
309
def state_with_conditionings(
    latent_state: LatentState, conditioning_items: list[ConditioningItem], latent_tools: LatentTools
) -> LatentState:
    """Apply each conditioning item to ``latent_state`` in order and return the result."""
    for item in conditioning_items:
        latent_state = item.apply_to(latent_state=latent_state, latent_tools=latent_tools)
    return latent_state
320
-
321
-
322
def post_process_latent(denoised: torch.Tensor, denoise_mask: torch.Tensor, clean: torch.Tensor) -> torch.Tensor:
    """Blend ``denoised`` with ``clean`` via ``denoise_mask`` (1 keeps denoised, 0 keeps clean)."""
    mask = denoise_mask.to(dtype=denoised.dtype)
    reference = clean.to(dtype=denoised.dtype)
    return denoised * mask + reference * (1 - mask)
327
-
328
-
329
def modality_from_latent_state(
    state: LatentState, context: torch.Tensor, sigma: float | torch.Tensor, enabled: bool = True
) -> Modality:
    """Wrap a latent state as a transformer ``Modality``.

    Timesteps are derived from the state's denoise mask scaled by ``sigma``;
    positions and the latent come straight from ``state``.
    """
    timesteps = timesteps_from_mask(state.denoise_mask, sigma)
    return Modality(
        enabled=enabled,
        latent=state.latent,
        timesteps=timesteps,
        positions=state.positions,
        context=context,
        context_mask=None,
    )
344
-
345
-
346
- def timesteps_from_mask(denoise_mask: torch.Tensor, sigma: float | torch.Tensor) -> torch.Tensor:
347
- """Compute timesteps from a denoise mask and sigma value.
348
- Multiplies the denoise mask by sigma to produce timesteps for each position
349
- in the latent state. Areas where the mask is 0 will have zero timesteps.
350
- """
351
- return denoise_mask * sigma
352
-
353
-
354
def simple_denoising_func(
    video_context: torch.Tensor, audio_context: torch.Tensor, transformer: X0Model
) -> DenoisingFunc:
    """Build a DenoisingFunc that runs ``transformer`` once per step with the given contexts (no guidance)."""

    def _step(
        video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        sigma = sigmas[step_index]
        video_in = modality_from_latent_state(video_state, video_context, sigma)
        audio_in = modality_from_latent_state(audio_state, audio_context, sigma)
        return transformer(video=video_in, audio=audio_in, perturbations=None)

    return _step
368
-
369
-
370
def guider_denoising_func(
    guider: GuiderProtocol,
    v_context_p: torch.Tensor,
    v_context_n: torch.Tensor,
    a_context_p: torch.Tensor,
    a_context_n: torch.Tensor,
    transformer: X0Model,
) -> DenoisingFunc:
    """Build a DenoisingFunc with classifier-free-style guidance via ``guider``.

    The positive contexts always run. When the guider is enabled, the negative
    contexts run as well and the guider's delta between positive and negative
    estimates is added to each positive estimate.
    """

    def _guided_step(
        video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]:
        sigma = sigmas[step_index]
        video_pos = modality_from_latent_state(video_state, v_context_p, sigma)
        audio_pos = modality_from_latent_state(audio_state, a_context_p, sigma)
        video_hat, audio_hat = transformer(video=video_pos, audio=audio_pos, perturbations=None)

        if guider.enabled():
            video_neg = modality_from_latent_state(video_state, v_context_n, sigma)
            audio_neg = modality_from_latent_state(audio_state, a_context_n, sigma)
            neg_video_hat, neg_audio_hat = transformer(video=video_neg, audio=audio_neg, perturbations=None)

            video_hat = video_hat + guider.delta(video_hat, neg_video_hat)
            audio_hat = audio_hat + guider.delta(audio_hat, neg_audio_hat)

        return video_hat, audio_hat

    return _guided_step
398
-
399
-
400
def denoise_audio_video(  # noqa: PLR0913
    output_shape: VideoPixelShape,
    conditionings: list[ConditioningItem],
    noiser: Noiser,
    sigmas: torch.Tensor,
    stepper: DiffusionStepProtocol,
    denoising_loop_fn: DenoisingLoopFunc,
    components: PipelineComponents,
    dtype: torch.dtype,
    device: torch.device,
    audio_conditionings: list[ConditioningItem] | None = None,
    noise_scale: float = 1.0,
    initial_video_latent: torch.Tensor | None = None,
    initial_audio_latent: torch.Tensor | None = None,
) -> tuple[LatentState | None, LatentState | None]:
    """Noise, denoise and unpatchify joint audio/video latents.

    Builds noised initial states for both modalities (seeded by the optional
    initial latents), runs ``denoising_loop_fn`` over ``sigmas``, then strips
    conditioning tokens and unpatchifies both results. Returns ``(None, None)``
    when the loop yields a missing state.
    """
    video_state, video_tools = noise_video_state(
        output_shape=output_shape,
        noiser=noiser,
        conditionings=conditionings,
        components=components,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_video_latent,
    )
    audio_state, audio_tools = noise_audio_state(
        output_shape=output_shape,
        noiser=noiser,
        conditionings=audio_conditionings or [],
        components=components,
        dtype=dtype,
        device=device,
        noise_scale=noise_scale,
        initial_latent=initial_audio_latent,
    )

    video_state, audio_state = denoising_loop_fn(
        sigmas,
        video_state,
        audio_state,
        stepper,
    )

    if video_state is None or audio_state is None:
        return None, None

    # Drop conditioning tokens, then convert back from patch to latent layout.
    video_state = video_tools.unpatchify(video_tools.clear_conditioning(video_state))
    audio_state = audio_tools.unpatchify(audio_tools.clear_conditioning(audio_state))

    return video_state, audio_state
459
-
460
-
461
-
462
# Gemma tends to emit typographic punctuation; map it back to plain ASCII.
_UNICODE_REPLACEMENTS = str.maketrans("\u2018\u2019\u201c\u201d\u2014\u2013\u00a0\u2032\u2212", "''\"\"-- '-")


def clean_response(text: str) -> str:
    """Clean a response from curly quotes and leading non-letter characters which Gemma tends to insert."""
    normalized = text.translate(_UNICODE_REPLACEMENTS)

    # Strip everything before the first alphabetic character.
    for index, character in enumerate(normalized):
        if character.isalpha():
            return normalized[index:]
    # No letters at all: return the translated text unchanged.
    return normalized
474
-
475
-
476
def generate_enhanced_prompt(
    text_encoder: GemmaTextEncoderModelBase,
    prompt: str,
    image_path: str | None = None,
    image_long_side: int = 896,
    seed: int = 42,
) -> str:
    """Generate an enhanced prompt from a text encoder and a prompt.

    When ``image_path`` is given, the image is loaded, resized so its long
    side equals ``image_long_side``, and used for image-grounded enhancement;
    otherwise text-only enhancement is used.
    """
    if image_path:
        reference = torch.tensor(decode_image(image_path=image_path))
        reference = resize_aspect_ratio_preserving(reference, image_long_side).to(torch.uint8)
        enhanced = text_encoder.enhance_i2v(prompt, reference, seed=seed)
    else:
        enhanced = text_encoder.enhance_t2v(prompt, seed=seed)
    logging.info(f"Enhanced prompt: {enhanced}")
    return clean_response(enhanced)
494
-
495
-
496
def assert_resolution(height: int, width: int, is_two_stage: bool) -> None:
    """Validate that the resolution fits the pipeline's divisibility constraint.

    Two-stage pipelines require height/width divisible by 64; one-stage
    pipelines require divisibility by 32.

    Raises:
        ValueError: if either dimension is not a multiple of the divisor.
    """
    divisor = 64 if is_two_stage else 32
    if height % divisor == 0 and width % divisor == 0:
        return
    raise ValueError(
        f"Resolution ({height}x{width}) is not divisible by {divisor}. "
        f"For {'two-stage' if is_two_stage else 'one-stage'} pipelines, "
        f"height and width must be multiples of {divisor}."
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/media_io.py DELETED
@@ -1,299 +0,0 @@
1
- import math
2
- from collections.abc import Generator, Iterator
3
- from fractions import Fraction
4
- from io import BytesIO
5
-
6
- import av
7
- import numpy as np
8
- import torch
9
- from einops import rearrange
10
- from PIL import Image
11
- from torch._prims_common import DeviceLikeType
12
- from tqdm import tqdm
13
-
14
- from ltx_pipelines.utils.constants import DEFAULT_IMAGE_CRF
15
-
16
-
17
def resize_aspect_ratio_preserving(image: torch.Tensor, long_side: int) -> torch.Tensor:
    """
    Resize image preserving aspect ratio (filling target long side).
    Preserves the input dimensions order.
    Args:
        image: Input image tensor with shape (F (optional), H, W, C)
        long_side: Target long side size.
    Returns:
        Tensor with shape (F (optional), H, W, C) F = 1 if input is 3D, otherwise input shape[0]
    """
    # H and W are always the third- and second-to-last dims for (F?, H, W, C).
    # BUG FIX: the previous slice `image.shape[-3:2]` only worked for 3D
    # inputs; for 4D (F, H, W, C) it produced a single-element slice and the
    # two-name unpack raised a ValueError.
    height, width = image.shape[-3:-1]
    max_side = max(height, width)
    scale = long_side / float(max_side)
    target_height = int(height * scale)
    target_width = int(width * scale)
    resized = resize_and_center_crop(image, target_height, target_width)
    # rearrange back to channels-last and drop the batch dimension
    result = rearrange(resized, "b c f h w -> b f h w c")[0]
    # Collapse the frame dimension for single-frame results (preserves the
    # 3D-in/3D-out contract; note a 4D input with F == 1 also collapses).
    return result[0] if result.shape[0] == 1 else result
37
-
38
-
39
def resize_and_center_crop(tensor: torch.Tensor, height: int, width: int) -> torch.Tensor:
    """
    Resize tensor preserving aspect ratio (filling target), then center crop to exact dimensions.
    Args:
        tensor: Input tensor with shape (H, W, C) or (F, H, W, C)
        height: Target height
        width: Target width
    Returns:
        Tensor with shape (1, C, 1, height, width) for 3D input or (1, C, F, height, width) for 4D input
    Raises:
        ValueError: if the input is not 3- or 4-dimensional.
    """
    # Move channels first using native torch ops (the einops rearrange calls
    # here were trivial permutes; dropping them removes a dependency):
    # (H, W, C) -> (1, C, H, W); (F, H, W, C) -> (F, C, H, W).
    if tensor.ndim == 3:
        tensor = tensor.permute(2, 0, 1).unsqueeze(0)
    elif tensor.ndim == 4:
        tensor = tensor.permute(0, 3, 1, 2)
    else:
        raise ValueError(f"Expected input with 3 or 4 dimensions; got shape {tensor.shape}.")

    _, _, src_h, src_w = tensor.shape

    scale = max(height / src_h, width / src_w)
    # Use ceil to avoid floating-point rounding causing new_h/new_w to be
    # slightly smaller than target, which would result in negative crop offsets.
    new_h = math.ceil(src_h * scale)
    new_w = math.ceil(src_w * scale)

    tensor = torch.nn.functional.interpolate(tensor, size=(new_h, new_w), mode="bilinear", align_corners=False)

    # Center crop to the exact requested size.
    crop_top = (new_h - height) // 2
    crop_left = (new_w - width) // 2
    tensor = tensor[:, :, crop_top : crop_top + height, crop_left : crop_left + width]

    # (F, C, h, w) -> (1, C, F, h, w)
    return tensor.permute(1, 0, 2, 3).unsqueeze(0)
72
-
73
-
74
def normalize_latent(latent: torch.Tensor, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
    """Map pixel values from [0, 255] to [-1, 1] and move to the given device/dtype."""
    scaled = latent / 127.5 - 1.0
    return scaled.to(device=device, dtype=dtype)
76
-
77
-
78
def load_image_conditioning(
    image_path: str, height: int, width: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor:
    """
    Loads an image from a path and preprocesses it for conditioning.
    Note: The image is resized to the nearest multiple of 2 for compatibility with video codecs.
    """
    pixels = preprocess(image=decode_image(image_path=image_path))
    tensor = torch.tensor(pixels, dtype=torch.float32, device=device)
    tensor = resize_and_center_crop(tensor, height, width)
    return normalize_latent(tensor, device, dtype)
91
-
92
-
93
def load_video_conditioning(
    video_path: str, height: int, width: int, frame_cap: int, dtype: torch.dtype, device: torch.device
) -> torch.Tensor | None:
    """
    Loads a video from a path and preprocesses it for conditioning.
    Note: The video is resized to the nearest multiple of 2 for compatibility with video codecs.
    Returns:
        A (1, C, F, height, width) tensor, or None if the video yields no frames.
    """
    processed: list[torch.Tensor] = []
    for frame in decode_video_from_file(path=video_path, frame_cap=frame_cap, device=device):
        resized = resize_and_center_crop(frame.to(torch.float32), height, width)
        processed.append(normalize_latent(resized, device, dtype))
    # Concatenate once along the frame axis instead of inside the loop
    # (the original per-frame torch.cat was quadratic in the frame count).
    return torch.cat(processed, dim=2) if processed else None
107
-
108
-
109
def decode_image(image_path: str) -> np.ndarray:
    """Read an image file and return it as an array keeping only the first three channels."""
    pixels = np.array(Image.open(image_path))
    return pixels[..., :3]
113
-
114
-
115
def _write_audio(
    container: av.container.Container, audio_stream: av.audio.AudioStream, samples: torch.Tensor, audio_sample_rate: int
) -> None:
    """Encode a stereo sample tensor into the container's audio stream.

    Accepts samples shaped (N, 2) or (2, N); float inputs are assumed to be in
    [-1, 1] and are converted to int16 before ingestion.

    Raises:
        ValueError: if the samples cannot be interpreted as two channels.
    """
    # Promote 1D (N,) to (N, 1); this still fails the stereo check below, so
    # mono input ultimately raises — callers must supply two channels.
    if samples.ndim == 1:
        samples = samples[:, None]

    # Accept channel-first (2, N) layout by transposing to samples-first (N, 2).
    if samples.shape[1] != 2 and samples.shape[0] == 2:
        samples = samples.T

    if samples.shape[1] != 2:
        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")

    # Convert to int16 packed for ingestion; resampler converts to encoder fmt.
    if samples.dtype != torch.int16:
        samples = torch.clip(samples, -1.0, 1.0)
        samples = (samples * 32767.0).to(torch.int16)

    # Interleave the (N, 2) samples into a single packed s16 plane for PyAV.
    frame_in = av.AudioFrame.from_ndarray(
        samples.contiguous().reshape(1, -1).cpu().numpy(),
        format="s16",
        layout="stereo",
    )
    frame_in.sample_rate = audio_sample_rate

    # Resampling, encoding, muxing and the encoder flush happen downstream.
    _resample_audio(container, audio_stream, frame_in)
140
-
141
-
142
def _prepare_audio_stream(container: av.container.Container, audio_sample_rate: int) -> av.audio.AudioStream:
    """Add a stereo AAC audio stream at the given sample rate and return it."""
    stream = container.add_stream("aac", rate=audio_sample_rate)
    codec = stream.codec_context
    codec.sample_rate = audio_sample_rate
    codec.layout = "stereo"
    codec.time_base = Fraction(1, audio_sample_rate)
    return stream
151
-
152
-
153
def _resample_audio(
    container: av.container.Container, audio_stream: av.audio.AudioStream, frame_in: av.AudioFrame
) -> None:
    """Resample *frame_in* to the encoder's native format, mux it, and flush.

    The encoder's own format/layout/rate are taken as the resampling target so
    the packed s16 ingestion frame can be fed to any encoder configuration.
    """
    cc = audio_stream.codec_context

    # Use the encoder's format/layout/rate as the *target*
    target_format = cc.format or "fltp"  # AAC → usually fltp
    target_layout = cc.layout or "stereo"
    target_rate = cc.sample_rate or frame_in.sample_rate

    audio_resampler = av.audio.resampler.AudioResampler(
        format=target_format,
        layout=target_layout,
        rate=target_rate,
    )

    # Frames coming back from the resampler may lack a PTS; assign
    # monotonically increasing timestamps counted in samples.
    audio_next_pts = 0
    for rframe in audio_resampler.resample(frame_in):
        if rframe.pts is None:
            rframe.pts = audio_next_pts
            audio_next_pts += rframe.samples
        # NOTE(review): this stamps the *input* frame's rate onto the
        # resampled frame; if target_rate ever differs from
        # frame_in.sample_rate this looks wrong — confirm intended behavior.
        rframe.sample_rate = frame_in.sample_rate
        container.mux(audio_stream.encode(rframe))

    # flush audio encoder
    for packet in audio_stream.encode():
        container.mux(packet)
180
-
181
-
182
def encode_video(
    video: torch.Tensor | Iterator[torch.Tensor],
    fps: int,
    audio: torch.Tensor | None,
    audio_sample_rate: int | None,
    output_path: str,
    video_chunks_number: int,
) -> None:
    """Encode video chunks (and optional audio) into an H.264/AAC file.

    Args:
        video: A single (F, H, W, C) frame-chunk tensor or an iterator of such
            chunks (presumably uint8 RGB — confirm with callers).
        fps: Output frame rate.
        audio: Optional stereo sample tensor; muxed after the video frames.
        audio_sample_rate: Required when ``audio`` is provided.
        output_path: Destination media file path.
        video_chunks_number: Total chunk count; used only for the progress bar.

    Raises:
        ValueError: if ``audio`` is given without ``audio_sample_rate``.
    """
    if isinstance(video, torch.Tensor):
        video = iter([video])

    # Pull the first chunk eagerly to learn the output frame dimensions.
    first_chunk = next(video)

    _, height, width, _ = first_chunk.shape

    container = av.open(output_path, mode="w")
    stream = container.add_stream("libx264", rate=int(fps))
    stream.width = width
    stream.height = height
    stream.pix_fmt = "yuv420p"

    if audio is not None:
        if audio_sample_rate is None:
            raise ValueError("audio_sample_rate is required when audio is provided")

        audio_stream = _prepare_audio_stream(container, audio_sample_rate)

    # Re-attach the eagerly-consumed first chunk in front of the remaining
    # generator. (Annotations fixed: the generator yields plain tensors, not
    # (tensor, int) tuples.)
    def all_tiles(
        first_chunk: torch.Tensor, tiles_generator: Iterator[torch.Tensor]
    ) -> Generator[torch.Tensor, None, None]:
        yield first_chunk
        yield from tiles_generator

    for video_chunk in tqdm(all_tiles(first_chunk, video), total=video_chunks_number):
        video_chunk_cpu = video_chunk.to("cpu").numpy()
        for frame_array in video_chunk_cpu:
            frame = av.VideoFrame.from_ndarray(frame_array, format="rgb24")
            for packet in stream.encode(frame):
                container.mux(packet)

    # Flush encoder
    for packet in stream.encode():
        container.mux(packet)

    if audio is not None:
        _write_audio(container, audio_stream, audio, audio_sample_rate)

    container.close()
230
-
231
-
232
def decode_audio_from_file(path: str, device: torch.device) -> torch.Tensor | None:
    """Decode the first audio stream of a media file into a float32 tensor.

    Returns:
        The concatenated decoded frames, or None if the file has no audio stream.
    """
    container = av.open(path)
    try:
        frames = []
        audio_stream = next(s for s in container.streams if s.type == "audio")
        for frame in container.decode(audio_stream):
            frames.append(torch.tensor(frame.to_ndarray(), dtype=torch.float32, device=device).unsqueeze(0))
        audio = torch.cat(frames)
    except StopIteration:
        # next() found no audio stream in the container.
        audio = None
    finally:
        # Single close point — the original also closed inside the try block,
        # which closed the container twice on the success path.
        container.close()

    return audio
247
-
248
-
249
def decode_video_from_file(path: str, frame_cap: int, device: DeviceLikeType) -> Generator[torch.Tensor]:
    """Lazily decode up to ``frame_cap`` RGB frames as (1, H, W, C) uint8 tensors.

    A non-positive ``frame_cap`` yields every frame in the file (the counter
    only stops iteration when it reaches exactly zero).
    """
    container = av.open(path)
    try:
        stream = next(s for s in container.streams if s.type == "video")
        remaining = frame_cap
        for frame in container.decode(stream):
            yield torch.tensor(frame.to_rgb().to_ndarray(), dtype=torch.uint8, device=device).unsqueeze(0)
            remaining -= 1
            if remaining == 0:
                break
    finally:
        container.close()
261
-
262
-
263
def encode_single_frame(output_file: str, image_array: np.ndarray, crf: float) -> None:
    """Encode a single RGB frame into an H.264 MP4 at the requested CRF."""
    container = av.open(output_file, "w", format="mp4")
    try:
        stream = container.add_stream("libx264", rate=1, options={"crf": str(crf), "preset": "veryfast"})
        # Crop to even dimensions for compatibility with video codecs.
        even_height = image_array.shape[0] // 2 * 2
        even_width = image_array.shape[1] // 2 * 2
        cropped = image_array[:even_height, :even_width]
        stream.height = even_height
        stream.width = even_width
        av_frame = av.VideoFrame.from_ndarray(cropped, format="rgb24").reformat(format="yuv420p")
        container.mux(stream.encode(av_frame))
        container.mux(stream.encode())
    finally:
        container.close()
278
-
279
-
280
def decode_single_frame(video_file: str) -> np.ndarray:
    """Decode and return the first video frame of a file as an (H, W, 3) RGB array.

    Annotation fixed: ``np.array`` is a function, not a type; ``np.ndarray``
    is the correct annotation.
    """
    container = av.open(video_file)
    try:
        stream = next(s for s in container.streams if s.type == "video")
        frame = next(container.decode(stream))
    finally:
        container.close()
    return frame.to_ndarray(format="rgb24")
288
-
289
-
290
def preprocess(image: np.ndarray, crf: float = DEFAULT_IMAGE_CRF) -> np.ndarray:
    """Round-trip an image through single-frame H.264 compression at ``crf``.

    Presumably used so conditioning images carry codec statistics similar to
    decoded video — confirm with callers. A ``crf`` of 0 skips the round trip.
    (Annotations fixed: ``np.array`` is a function, not a type.)
    """
    if crf == 0:
        return image

    with BytesIO() as output_file:
        encode_single_frame(output_file, image, crf)
        video_bytes = output_file.getvalue()
    with BytesIO(video_bytes) as video_file:
        image_array = decode_single_frame(video_file)
    return image_array
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/model_ledger.py DELETED
@@ -1,275 +0,0 @@
1
- from dataclasses import replace
2
-
3
- import torch
4
-
5
- from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
6
- from ltx_core.loader.registry import DummyRegistry, Registry
7
- from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
8
- from ltx_core.model.audio_vae import (
9
- AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
10
- VOCODER_COMFY_KEYS_FILTER,
11
- AudioDecoder,
12
- AudioDecoderConfigurator,
13
- Vocoder,
14
- VocoderConfigurator,
15
- )
16
- from ltx_core.model.transformer import (
17
- LTXV_MODEL_COMFY_RENAMING_MAP,
18
- LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
19
- UPCAST_DURING_INFERENCE,
20
- LTXModelConfigurator,
21
- X0Model,
22
- )
23
- from ltx_core.model.upsampler import LatentUpsampler, LatentUpsamplerConfigurator
24
- from ltx_core.model.video_vae import (
25
- VAE_DECODER_COMFY_KEYS_FILTER,
26
- VAE_ENCODER_COMFY_KEYS_FILTER,
27
- VideoDecoder,
28
- VideoDecoderConfigurator,
29
- VideoEncoder,
30
- VideoEncoderConfigurator,
31
- )
32
- from ltx_core.text_encoders.gemma import (
33
- AV_GEMMA_TEXT_ENCODER_KEY_OPS,
34
- AVGemmaTextEncoderModel,
35
- AVGemmaTextEncoderModelConfigurator,
36
- module_ops_from_gemma_root,
37
- )
38
-
39
- from ltx_core.model.audio_vae import (
40
- AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
41
- VOCODER_COMFY_KEYS_FILTER,
42
- AudioDecoder,
43
- AudioDecoderConfigurator,
44
- Vocoder,
45
- VocoderConfigurator,
46
- AudioEncoder,
47
- )
48
- from ltx_core.model.audio_vae.model_configurator import (
49
- AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
50
- AudioEncoderConfigurator,
51
- )
52
-
53
-
54
class ModelLedger:
    """
    Central coordinator for loading and building models used in an LTX pipeline.
    The ledger wires together multiple model builders (transformer, video VAE encoder/decoder,
    audio VAE encoder/decoder, vocoder, text encoder, and optional latent upsampler) and exposes
    factory methods for constructing model instances.
    ### Model Building
    Each model method (e.g. :meth:`transformer`, :meth:`video_decoder`, :meth:`text_encoder`)
    constructs a new model instance on each call. The builder uses the
    :class:`~ltx_core.loader.registry.Registry` to load weights from the checkpoint,
    instantiates the model with the configured ``dtype``, and moves it to ``self.device``.
    .. note::
        Models are **not cached**. Each call to a model method creates a new instance.
        Callers are responsible for storing references to models they wish to reuse
        and for freeing GPU memory (e.g. by deleting references and calling
        ``torch.cuda.empty_cache()``).
    ### Constructor parameters
    dtype:
        Torch dtype used when constructing all models (e.g. ``torch.bfloat16``).
    device:
        Target device to which models are moved after construction (e.g. ``torch.device("cuda")``).
    checkpoint_path:
        Path to a checkpoint directory or file containing the core model weights
        (transformer, video VAE, audio VAE, text encoder, vocoder). If ``None``, the
        corresponding builders are not created and calling those methods will raise
        a :class:`ValueError`.
    gemma_root_path:
        Base path to Gemma-compatible CLIP/text encoder weights. Required to
        initialize the text encoder builder; if omitted, :meth:`text_encoder` cannot be used.
    spatial_upsampler_path:
        Optional path to a latent upsampler checkpoint. If provided, the
        :meth:`spatial_upsampler` method becomes available; otherwise calling it raises
        a :class:`ValueError`.
    loras:
        Optional collection of LoRA configurations (paths, strengths, and key operations)
        that are applied on top of the base transformer weights when building the model.
    registry:
        Optional :class:`Registry` instance for weight caching across builders.
        Defaults to :class:`DummyRegistry` which performs no cross-builder caching.
    fp8transformer:
        If ``True``, builds the transformer with FP8 quantization and upcasting during inference.
    local_files_only:
        If ``True``, restricts the Gemma module resolution to local files (no downloads).
    ### Creating Variants
    Use :meth:`with_loras` to create a new ``ModelLedger`` instance that includes
    additional LoRA configurations while sharing the same registry for weight caching.
    """

    def __init__(
        self,
        dtype: torch.dtype,
        device: torch.device,
        checkpoint_path: str | None = None,
        gemma_root_path: str | None = None,
        spatial_upsampler_path: str | None = None,
        loras: LoraPathStrengthAndSDOps | None = None,
        registry: Registry | None = None,
        fp8transformer: bool = False,
        local_files_only: bool = True,
    ):
        self.dtype = dtype
        self.device = device
        self.checkpoint_path = checkpoint_path
        self.gemma_root_path = gemma_root_path
        self.spatial_upsampler_path = spatial_upsampler_path
        self.loras = loras or ()
        self.registry = registry or DummyRegistry()
        self.fp8transformer = fp8transformer
        self.local_files_only = local_files_only
        self.build_model_builders()

    def build_model_builders(self) -> None:
        """Create the per-model builders for whichever paths were provided."""
        if self.checkpoint_path is not None:
            self.transformer_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=LTXModelConfigurator,
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_MAP,
                loras=tuple(self.loras),
                registry=self.registry,
            )

            self.vae_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VideoDecoderConfigurator,
                model_sd_ops=VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vae_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VideoEncoderConfigurator,
                model_sd_ops=VAE_ENCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.audio_decoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AudioDecoderConfigurator,
                model_sd_ops=AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.vocoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=VocoderConfigurator,
                model_sd_ops=VOCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

            self.audio_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AudioEncoderConfigurator,
                model_sd_ops=AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
                registry=self.registry,
            )

        if self.gemma_root_path is not None:
            self.text_encoder_builder = Builder(
                model_path=self.checkpoint_path,
                model_class_configurator=AVGemmaTextEncoderModelConfigurator,
                model_sd_ops=AV_GEMMA_TEXT_ENCODER_KEY_OPS,
                registry=self.registry,
                module_ops=module_ops_from_gemma_root(self.gemma_root_path, self.local_files_only),
            )

        if self.spatial_upsampler_path is not None:
            self.upsampler_builder = Builder(
                model_path=self.spatial_upsampler_path,
                model_class_configurator=LatentUpsamplerConfigurator,
                registry=self.registry,
            )

    def _target_device(self) -> torch.device:
        # With a real registry, build on CPU first (presumably so cached
        # weights remain host-resident and shareable — confirm); models are
        # moved to self.device by the factory methods afterwards.
        if isinstance(self.registry, DummyRegistry) or self.registry is None:
            return self.device
        else:
            return torch.device("cpu")

    def with_loras(self, loras: LoraPathStrengthAndSDOps) -> "ModelLedger":
        """Return a new ledger with the given LoRAs appended, sharing the registry."""
        return ModelLedger(
            dtype=self.dtype,
            device=self.device,
            checkpoint_path=self.checkpoint_path,
            gemma_root_path=self.gemma_root_path,
            spatial_upsampler_path=self.spatial_upsampler_path,
            loras=(*self.loras, *loras),
            registry=self.registry,
            fp8transformer=self.fp8transformer,
            # Bug fix: local_files_only was previously dropped here, silently
            # resetting the derived ledger to the default (True).
            local_files_only=self.local_files_only,
        )

    def transformer(self) -> X0Model:
        """Build the (optionally FP8-quantized) transformer wrapped in X0Model."""
        if not hasattr(self, "transformer_builder"):
            raise ValueError(
                "Transformer not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )
        if self.fp8transformer:
            # FP8 path: downcast linear weights at load time, upcast during inference.
            fp8_builder = replace(
                self.transformer_builder,
                module_ops=(UPCAST_DURING_INFERENCE,),
                model_sd_ops=LTXV_MODEL_COMFY_RENAMING_WITH_TRANSFORMER_LINEAR_DOWNCAST_MAP,
            )
            return X0Model(fp8_builder.build(device=self._target_device())).to(self.device).eval()
        else:
            return (
                X0Model(self.transformer_builder.build(device=self._target_device(), dtype=self.dtype))
                .to(self.device)
                .eval()
            )

    def audio_encoder(self) -> AudioEncoder:
        """Build the audio VAE encoder."""
        if not hasattr(self, "audio_encoder_builder"):
            raise ValueError(
                "Audio encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )
        return self.audio_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def video_decoder(self) -> VideoDecoder:
        """Build the video VAE decoder."""
        if not hasattr(self, "vae_decoder_builder"):
            raise ValueError(
                "Video decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def video_encoder(self) -> VideoEncoder:
        """Build the video VAE encoder."""
        if not hasattr(self, "vae_encoder_builder"):
            raise ValueError(
                "Video encoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vae_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def text_encoder(self) -> AVGemmaTextEncoderModel:
        """Build the Gemma-based text encoder."""
        if not hasattr(self, "text_encoder_builder"):
            raise ValueError(
                "Text encoder not initialized. Please provide a checkpoint path and gemma root path to the "
                "ModelLedger constructor."
            )

        return self.text_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def audio_decoder(self) -> AudioDecoder:
        """Build the audio VAE decoder."""
        if not hasattr(self, "audio_decoder_builder"):
            raise ValueError(
                "Audio decoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.audio_decoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def vocoder(self) -> Vocoder:
        """Build the vocoder."""
        if not hasattr(self, "vocoder_builder"):
            raise ValueError(
                "Vocoder not initialized. Please provide a checkpoint path to the ModelLedger constructor."
            )

        return self.vocoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()

    def spatial_upsampler(self) -> LatentUpsampler:
        """Build the latent spatial upsampler."""
        if not hasattr(self, "upsampler_builder"):
            raise ValueError("Upsampler not initialized. Please provide upsampler path to the ModelLedger constructor.")

        return self.upsampler_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
packages/ltx-pipelines/ltx_pipelines/utils/types.py DELETED
@@ -1,73 +0,0 @@
1
- from typing import Protocol
2
-
3
- import torch
4
-
5
- from ltx_core.components.patchifiers import AudioPatchifier, VideoLatentPatchifier
6
- from ltx_core.components.protocols import DiffusionStepProtocol
7
- from ltx_core.types import LatentState
8
- from ltx_pipelines.utils.constants import VIDEO_LATENT_CHANNELS, VIDEO_SCALE_FACTORS
9
-
10
-
11
class PipelineComponents:
    """
    Container class for pipeline components used throughout the LTX pipelines.
    Attributes:
        dtype (torch.dtype): Default torch dtype for tensors in the pipeline.
        device (torch.device): Target device to place tensors and modules on.
        video_scale_factors: Scale factors (T, H, W) for the VAE latent space.
        video_latent_channels (int): Number of channels in the video latent representation.
        video_patchifier (VideoLatentPatchifier): Patchifier instance for video latents.
        audio_patchifier (AudioPatchifier): Patchifier instance for audio latents.
    """

    def __init__(self, dtype: torch.dtype, device: torch.device):
        self.dtype = dtype
        self.device = device

        # Latent-space geometry constants shared by all pipelines.
        self.video_scale_factors = VIDEO_SCALE_FACTORS
        self.video_latent_channels = VIDEO_LATENT_CHANNELS

        self.video_patchifier = VideoLatentPatchifier(patch_size=1)
        self.audio_patchifier = AudioPatchifier(patch_size=1)
36
-
37
-
38
class DenoisingFunc(Protocol):
    """
    Protocol for a single denoising call in the LTX pipeline.
    Args:
        video_state (LatentState): Current latent state for video.
        audio_state (LatentState): Current latent state for audio.
        sigmas (torch.Tensor): 1D tensor of sigma values, one per diffusion step.
        step_index (int): Index of the current denoising step.
    Returns:
        tuple[torch.Tensor, torch.Tensor]: The denoised video and audio tensors.
    """

    def __call__(
        self, video_state: LatentState, audio_state: LatentState, sigmas: torch.Tensor, step_index: int
    ) -> tuple[torch.Tensor, torch.Tensor]: ...
53
-
54
-
55
class DenoisingLoopFunc(Protocol):
    """
    Protocol for a denoising loop function used in the LTX pipeline.
    Args:
        sigmas (torch.Tensor): A 1D tensor of sigma values for each diffusion step.
        video_state (LatentState): The current latent state for video.
        audio_state (LatentState): The current latent state for audio.
        stepper (DiffusionStepProtocol): The diffusion step protocol to use.
    Returns:
        tuple[LatentState, LatentState]: The denoised video and audio latent states,
        or ``(None, None)`` on early termination.
    """

    # Return annotation improved: the original declared tuple[torch.Tensor,
    # torch.Tensor], contradicting its own docstring; callers treat the results
    # as LatentState objects and explicitly handle None.
    def __call__(
        self,
        sigmas: torch.Tensor,
        video_state: LatentState,
        audio_state: LatentState,
        stepper: DiffusionStepProtocol,
    ) -> tuple[LatentState | None, LatentState | None]: ...